Version bumps after new release
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/legacyheaders/vec.h"
47 #include "nrnb.h"
48
49 #include "kernelutil_sparc64_hpc_ace_double.h"
50
51 /*
52  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double
53  * Electrostatics interaction: Ewald
54  * VdW interaction:            LJEwald
55  * Geometry:                   Water4-Water4
56  * Calculate force/pot:        PotentialAndForce
57  */
58 void
59 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double
60                     (t_nblist                    * gmx_restrict       nlist,
61                      rvec                        * gmx_restrict          xx,
62                      rvec                        * gmx_restrict          ff,
63                      t_forcerec                  * gmx_restrict          fr,
64                      t_mdatoms                   * gmx_restrict     mdatoms,
65                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66                      t_nrnb                      * gmx_restrict        nrnb)
67 {
68     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69      * just 0 for non-waters.
70      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
71      * jnr indices corresponding to data put in the four positions in the SIMD register.
72      */
73     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
74     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75     int              jnrA,jnrB;
76     int              j_coord_offsetA,j_coord_offsetB;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81     int              vdwioffset0;
82     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83     int              vdwioffset1;
84     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     int              vdwioffset2;
86     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     int              vdwioffset3;
88     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89     int              vdwjidx0A,vdwjidx0B;
90     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B;
92     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B;
94     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     int              vdwjidx3A,vdwjidx3B;
96     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
108     real             *charge;
109     int              nvdwtype;
110     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111     int              *vdwtype;
112     real             *vdwparam;
113     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
114     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
115     _fjsp_v2r8           c6grid_00;
116     _fjsp_v2r8           c6grid_11;
117     _fjsp_v2r8           c6grid_12;
118     _fjsp_v2r8           c6grid_13;
119     _fjsp_v2r8           c6grid_21;
120     _fjsp_v2r8           c6grid_22;
121     _fjsp_v2r8           c6grid_23;
122     _fjsp_v2r8           c6grid_31;
123     _fjsp_v2r8           c6grid_32;
124     _fjsp_v2r8           c6grid_33;
125     real                 *vdwgridparam;
126     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
127     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
128     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
129     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
130     real             *ewtab;
131     _fjsp_v2r8       itab_tmp;
132     _fjsp_v2r8       dummy_mask,cutoff_mask;
133     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
134     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
135     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
136
137     x                = xx[0];
138     f                = ff[0];
139
140     nri              = nlist->nri;
141     iinr             = nlist->iinr;
142     jindex           = nlist->jindex;
143     jjnr             = nlist->jjnr;
144     shiftidx         = nlist->shift;
145     gid              = nlist->gid;
146     shiftvec         = fr->shift_vec[0];
147     fshift           = fr->fshift[0];
148     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
149     charge           = mdatoms->chargeA;
150     nvdwtype         = fr->ntype;
151     vdwparam         = fr->nbfp;
152     vdwtype          = mdatoms->typeA;
153     vdwgridparam     = fr->ljpme_c6grid;
154     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
155     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
156     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
157
158     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
159     ewtab            = fr->ic->tabq_coul_FDV0;
160     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
161     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
162
163     /* Setup water-specific parameters */
164     inr              = nlist->iinr[0];
165     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
166     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
167     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
168     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
169
170     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
171     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
172     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
173     vdwjidx0A        = 2*vdwtype[inr+0];
174     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
175     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
176     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
177     qq11             = _fjsp_mul_v2r8(iq1,jq1);
178     qq12             = _fjsp_mul_v2r8(iq1,jq2);
179     qq13             = _fjsp_mul_v2r8(iq1,jq3);
180     qq21             = _fjsp_mul_v2r8(iq2,jq1);
181     qq22             = _fjsp_mul_v2r8(iq2,jq2);
182     qq23             = _fjsp_mul_v2r8(iq2,jq3);
183     qq31             = _fjsp_mul_v2r8(iq3,jq1);
184     qq32             = _fjsp_mul_v2r8(iq3,jq2);
185     qq33             = _fjsp_mul_v2r8(iq3,jq3);
186
187     /* Avoid stupid compiler warnings */
188     jnrA = jnrB = 0;
189     j_coord_offsetA = 0;
190     j_coord_offsetB = 0;
191
192     outeriter        = 0;
193     inneriter        = 0;
194
195     /* Start outer loop over neighborlists */
196     for(iidx=0; iidx<nri; iidx++)
197     {
198         /* Load shift vector for this list */
199         i_shift_offset   = DIM*shiftidx[iidx];
200
201         /* Load limits for loop over neighbors */
202         j_index_start    = jindex[iidx];
203         j_index_end      = jindex[iidx+1];
204
205         /* Get outer coordinate index */
206         inr              = iinr[iidx];
207         i_coord_offset   = DIM*inr;
208
209         /* Load i particle coords and add shift vector */
210         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
211                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
212
213         fix0             = _fjsp_setzero_v2r8();
214         fiy0             = _fjsp_setzero_v2r8();
215         fiz0             = _fjsp_setzero_v2r8();
216         fix1             = _fjsp_setzero_v2r8();
217         fiy1             = _fjsp_setzero_v2r8();
218         fiz1             = _fjsp_setzero_v2r8();
219         fix2             = _fjsp_setzero_v2r8();
220         fiy2             = _fjsp_setzero_v2r8();
221         fiz2             = _fjsp_setzero_v2r8();
222         fix3             = _fjsp_setzero_v2r8();
223         fiy3             = _fjsp_setzero_v2r8();
224         fiz3             = _fjsp_setzero_v2r8();
225
226         /* Reset potential sums */
227         velecsum         = _fjsp_setzero_v2r8();
228         vvdwsum          = _fjsp_setzero_v2r8();
229
230         /* Start inner kernel loop */
231         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
232         {
233
234             /* Get j neighbor index, and coordinate index */
235             jnrA             = jjnr[jidx];
236             jnrB             = jjnr[jidx+1];
237             j_coord_offsetA  = DIM*jnrA;
238             j_coord_offsetB  = DIM*jnrB;
239
240             /* load j atom coordinates */
241             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
242                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243                                               &jy2,&jz2,&jx3,&jy3,&jz3);
244
245             /* Calculate displacement vector */
246             dx00             = _fjsp_sub_v2r8(ix0,jx0);
247             dy00             = _fjsp_sub_v2r8(iy0,jy0);
248             dz00             = _fjsp_sub_v2r8(iz0,jz0);
249             dx11             = _fjsp_sub_v2r8(ix1,jx1);
250             dy11             = _fjsp_sub_v2r8(iy1,jy1);
251             dz11             = _fjsp_sub_v2r8(iz1,jz1);
252             dx12             = _fjsp_sub_v2r8(ix1,jx2);
253             dy12             = _fjsp_sub_v2r8(iy1,jy2);
254             dz12             = _fjsp_sub_v2r8(iz1,jz2);
255             dx13             = _fjsp_sub_v2r8(ix1,jx3);
256             dy13             = _fjsp_sub_v2r8(iy1,jy3);
257             dz13             = _fjsp_sub_v2r8(iz1,jz3);
258             dx21             = _fjsp_sub_v2r8(ix2,jx1);
259             dy21             = _fjsp_sub_v2r8(iy2,jy1);
260             dz21             = _fjsp_sub_v2r8(iz2,jz1);
261             dx22             = _fjsp_sub_v2r8(ix2,jx2);
262             dy22             = _fjsp_sub_v2r8(iy2,jy2);
263             dz22             = _fjsp_sub_v2r8(iz2,jz2);
264             dx23             = _fjsp_sub_v2r8(ix2,jx3);
265             dy23             = _fjsp_sub_v2r8(iy2,jy3);
266             dz23             = _fjsp_sub_v2r8(iz2,jz3);
267             dx31             = _fjsp_sub_v2r8(ix3,jx1);
268             dy31             = _fjsp_sub_v2r8(iy3,jy1);
269             dz31             = _fjsp_sub_v2r8(iz3,jz1);
270             dx32             = _fjsp_sub_v2r8(ix3,jx2);
271             dy32             = _fjsp_sub_v2r8(iy3,jy2);
272             dz32             = _fjsp_sub_v2r8(iz3,jz2);
273             dx33             = _fjsp_sub_v2r8(ix3,jx3);
274             dy33             = _fjsp_sub_v2r8(iy3,jy3);
275             dz33             = _fjsp_sub_v2r8(iz3,jz3);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
279             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
280             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
281             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
282             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
283             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
284             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
285             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
286             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
287             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
288
289             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
290             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
291             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
292             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
293             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
294             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
295             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
296             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
297             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
298             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
299
300             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
301             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
302             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
303             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
304             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
305             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
306             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
307             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
308             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
309             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
310
311             fjx0             = _fjsp_setzero_v2r8();
312             fjy0             = _fjsp_setzero_v2r8();
313             fjz0             = _fjsp_setzero_v2r8();
314             fjx1             = _fjsp_setzero_v2r8();
315             fjy1             = _fjsp_setzero_v2r8();
316             fjz1             = _fjsp_setzero_v2r8();
317             fjx2             = _fjsp_setzero_v2r8();
318             fjy2             = _fjsp_setzero_v2r8();
319             fjz2             = _fjsp_setzero_v2r8();
320             fjx3             = _fjsp_setzero_v2r8();
321             fjy3             = _fjsp_setzero_v2r8();
322             fjz3             = _fjsp_setzero_v2r8();
323
324             /**************************
325              * CALCULATE INTERACTIONS *
326              **************************/
327
328             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
329
330             /* Analytical LJ-PME */
331             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
332             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
333             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
334             exponent         = gmx_simd_exp_d(ewcljrsq);
335             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
336             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
337             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
338             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(c6grid_00,_fjsp_sub_v2r8(poly,one),c6_00),rinvsix);
339             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
340             vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
341             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
342             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
343
344             /* Update potential sum for this i atom from the interaction with this j atom. */
345             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
346
347             fscal            = fvdw;
348
349             /* Update vectorial force */
350             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
351             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
352             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
353             
354             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
355             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
356             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
357
358             /**************************
359              * CALCULATE INTERACTIONS *
360              **************************/
361
362             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
363
364             /* EWALD ELECTROSTATICS */
365
366             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
367             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
368             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
369             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
370             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
371
372             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
373             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
374             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
375             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
376             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
377             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
378             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
379             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
380             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
381             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
382
383             /* Update potential sum for this i atom from the interaction with this j atom. */
384             velecsum         = _fjsp_add_v2r8(velecsum,velec);
385
386             fscal            = felec;
387
388             /* Update vectorial force */
389             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
390             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
391             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
392             
393             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
394             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
395             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
396
397             /**************************
398              * CALCULATE INTERACTIONS *
399              **************************/
400
401             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
402
403             /* EWALD ELECTROSTATICS */
404
405             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
406             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
407             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
408             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
409             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
410
411             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
412             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
413             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
414             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
415             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
416             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
417             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
418             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
419             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
420             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
421
422             /* Update potential sum for this i atom from the interaction with this j atom. */
423             velecsum         = _fjsp_add_v2r8(velecsum,velec);
424
425             fscal            = felec;
426
427             /* Update vectorial force */
428             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
429             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
430             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
431             
432             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
433             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
434             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
441
442             /* EWALD ELECTROSTATICS */
443
444             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
445             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
446             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
447             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
448             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
449
450             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
451             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
452             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
453             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
454             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
455             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
456             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
457             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
458             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
459             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velecsum         = _fjsp_add_v2r8(velecsum,velec);
463
464             fscal            = felec;
465
466             /* Update vectorial force */
467             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
468             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
469             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
470             
471             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
472             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
473             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
474
475             /**************************
476              * CALCULATE INTERACTIONS *
477              **************************/
478
479             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
480
481             /* EWALD ELECTROSTATICS */
482
483             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
484             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
485             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
486             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
487             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
488
489             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
490             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
491             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
492             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
493             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
494             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
495             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
496             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
497             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
498             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
499
500             /* Update potential sum for this i atom from the interaction with this j atom. */
501             velecsum         = _fjsp_add_v2r8(velecsum,velec);
502
503             fscal            = felec;
504
505             /* Update vectorial force */
506             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
507             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
508             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
509             
510             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
511             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
512             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
513
514             /**************************
515              * CALCULATE INTERACTIONS *
516              **************************/
517
518             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
519
520             /* EWALD ELECTROSTATICS */
521
522             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
523             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
524             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
525             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
526             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
527
528             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
529             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
530             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
531             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
532             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
533             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
534             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
535             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
536             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
537             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
538
539             /* Update potential sum for this i atom from the interaction with this j atom. */
540             velecsum         = _fjsp_add_v2r8(velecsum,velec);
541
542             fscal            = felec;
543
544             /* Update vectorial force */
545             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
546             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
547             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
548             
549             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
550             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
551             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
552
553             /**************************
554              * CALCULATE INTERACTIONS *
555              **************************/
556
557             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
558
559             /* EWALD ELECTROSTATICS */
560
561             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
562             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
563             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
564             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
565             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
566
567             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
568             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
569             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
570             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
571             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
572             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
573             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
574             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
575             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
576             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
577
578             /* Update potential sum for this i atom from the interaction with this j atom. */
579             velecsum         = _fjsp_add_v2r8(velecsum,velec);
580
581             fscal            = felec;
582
583             /* Update vectorial force */
584             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
585             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
586             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
587             
588             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
589             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
590             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
597
598             /* EWALD ELECTROSTATICS */
599
600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
602             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
603             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
604             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
605
606             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
607             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
608             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
609             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
610             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
611             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
612             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
613             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
614             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
615             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
616
617             /* Update potential sum for this i atom from the interaction with this j atom. */
618             velecsum         = _fjsp_add_v2r8(velecsum,velec);
619
620             fscal            = felec;
621
622             /* Update vectorial force */
623             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
624             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
625             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
626             
627             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
628             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
629             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
630
631             /**************************
632              * CALCULATE INTERACTIONS *
633              **************************/
634
635             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
636
637             /* EWALD ELECTROSTATICS */
638
639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
640             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
641             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
642             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
643             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
644
645             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
646             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
647             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
648             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
649             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
650             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
651             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
652             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
653             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
654             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
655
656             /* Update potential sum for this i atom from the interaction with this j atom. */
657             velecsum         = _fjsp_add_v2r8(velecsum,velec);
658
659             fscal            = felec;
660
661             /* Update vectorial force */
662             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
663             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
664             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
665             
666             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
667             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
668             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
669
670             /**************************
671              * CALCULATE INTERACTIONS *
672              **************************/
673
674             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
675
676             /* EWALD ELECTROSTATICS */
677
678             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
680             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
681             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
682             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
683
684             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
685             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
686             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
687             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
688             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
689             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
690             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
691             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
692             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
693             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
694
695             /* Update potential sum for this i atom from the interaction with this j atom. */
696             velecsum         = _fjsp_add_v2r8(velecsum,velec);
697
698             fscal            = felec;
699
700             /* Update vectorial force */
701             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
702             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
703             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
704             
705             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
706             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
707             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
708
709             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
710
711             /* Inner loop uses 449 flops */
712         }
713
714         if(jidx<j_index_end)
715         {
716
717             jnrA             = jjnr[jidx];
718             j_coord_offsetA  = DIM*jnrA;
719
720             /* load j atom coordinates */
721             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
722                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
723                                               &jy2,&jz2,&jx3,&jy3,&jz3);
724
725             /* Calculate displacement vector */
726             dx00             = _fjsp_sub_v2r8(ix0,jx0);
727             dy00             = _fjsp_sub_v2r8(iy0,jy0);
728             dz00             = _fjsp_sub_v2r8(iz0,jz0);
729             dx11             = _fjsp_sub_v2r8(ix1,jx1);
730             dy11             = _fjsp_sub_v2r8(iy1,jy1);
731             dz11             = _fjsp_sub_v2r8(iz1,jz1);
732             dx12             = _fjsp_sub_v2r8(ix1,jx2);
733             dy12             = _fjsp_sub_v2r8(iy1,jy2);
734             dz12             = _fjsp_sub_v2r8(iz1,jz2);
735             dx13             = _fjsp_sub_v2r8(ix1,jx3);
736             dy13             = _fjsp_sub_v2r8(iy1,jy3);
737             dz13             = _fjsp_sub_v2r8(iz1,jz3);
738             dx21             = _fjsp_sub_v2r8(ix2,jx1);
739             dy21             = _fjsp_sub_v2r8(iy2,jy1);
740             dz21             = _fjsp_sub_v2r8(iz2,jz1);
741             dx22             = _fjsp_sub_v2r8(ix2,jx2);
742             dy22             = _fjsp_sub_v2r8(iy2,jy2);
743             dz22             = _fjsp_sub_v2r8(iz2,jz2);
744             dx23             = _fjsp_sub_v2r8(ix2,jx3);
745             dy23             = _fjsp_sub_v2r8(iy2,jy3);
746             dz23             = _fjsp_sub_v2r8(iz2,jz3);
747             dx31             = _fjsp_sub_v2r8(ix3,jx1);
748             dy31             = _fjsp_sub_v2r8(iy3,jy1);
749             dz31             = _fjsp_sub_v2r8(iz3,jz1);
750             dx32             = _fjsp_sub_v2r8(ix3,jx2);
751             dy32             = _fjsp_sub_v2r8(iy3,jy2);
752             dz32             = _fjsp_sub_v2r8(iz3,jz2);
753             dx33             = _fjsp_sub_v2r8(ix3,jx3);
754             dy33             = _fjsp_sub_v2r8(iy3,jy3);
755             dz33             = _fjsp_sub_v2r8(iz3,jz3);
756
757             /* Calculate squared distance and things based on it */
758             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
759             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
760             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
761             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
762             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
763             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
764             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
765             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
766             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
767             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
768
769             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
770             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
771             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
772             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
773             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
774             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
775             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
776             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
777             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
778             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
779
780             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
781             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
782             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
783             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
784             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
785             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
786             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
787             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
788             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
789             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
790
791             fjx0             = _fjsp_setzero_v2r8();
792             fjy0             = _fjsp_setzero_v2r8();
793             fjz0             = _fjsp_setzero_v2r8();
794             fjx1             = _fjsp_setzero_v2r8();
795             fjy1             = _fjsp_setzero_v2r8();
796             fjz1             = _fjsp_setzero_v2r8();
797             fjx2             = _fjsp_setzero_v2r8();
798             fjy2             = _fjsp_setzero_v2r8();
799             fjz2             = _fjsp_setzero_v2r8();
800             fjx3             = _fjsp_setzero_v2r8();
801             fjy3             = _fjsp_setzero_v2r8();
802             fjz3             = _fjsp_setzero_v2r8();
803
804             /**************************
805              * CALCULATE INTERACTIONS *
806              **************************/
807
808             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
809
810             /* Analytical LJ-PME */
811             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
812             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
813             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
814             exponent         = gmx_simd_exp_d(ewcljrsq);
815             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
816             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
817             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
818             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(c6grid_00,_fjsp_sub_v2r8(poly,one),c6_00),rinvsix);
819             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
820             vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
821             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
822             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
823
824             /* Update potential sum for this i atom from the interaction with this j atom. */
825             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
826             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
827
828             fscal            = fvdw;
829
830             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
831
832             /* Update vectorial force */
833             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
834             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
835             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
836             
837             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
838             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
839             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
840
841             /**************************
842              * CALCULATE INTERACTIONS *
843              **************************/
844
845             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
846
847             /* EWALD ELECTROSTATICS */
848
849             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
850             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
851             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
852             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
853             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
854
855             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
856             ewtabD           = _fjsp_setzero_v2r8();
857             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
858             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
859             ewtabFn          = _fjsp_setzero_v2r8();
860             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
861             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
862             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
863             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
864             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
865
866             /* Update potential sum for this i atom from the interaction with this j atom. */
867             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
868             velecsum         = _fjsp_add_v2r8(velecsum,velec);
869
870             fscal            = felec;
871
872             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
873
874             /* Update vectorial force */
875             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
876             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
877             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
878             
879             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
880             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
881             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
882
883             /**************************
884              * CALCULATE INTERACTIONS *
885              **************************/
886
887             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
888
889             /* EWALD ELECTROSTATICS */
890
891             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
892             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
893             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
894             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
895             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
896
897             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
898             ewtabD           = _fjsp_setzero_v2r8();
899             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
900             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
901             ewtabFn          = _fjsp_setzero_v2r8();
902             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
903             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
904             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
905             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
906             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
907
908             /* Update potential sum for this i atom from the interaction with this j atom. */
909             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
910             velecsum         = _fjsp_add_v2r8(velecsum,velec);
911
912             fscal            = felec;
913
914             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
915
916             /* Update vectorial force */
917             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
918             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
919             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
920             
921             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
922             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
923             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
924
925             /**************************
926              * CALCULATE INTERACTIONS *
927              **************************/
928
929             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
930
931             /* EWALD ELECTROSTATICS */
932
933             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
934             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
935             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
936             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
937             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
938
939             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
940             ewtabD           = _fjsp_setzero_v2r8();
941             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
942             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
943             ewtabFn          = _fjsp_setzero_v2r8();
944             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
945             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
946             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
947             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
948             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
949
950             /* Update potential sum for this i atom from the interaction with this j atom. */
951             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
952             velecsum         = _fjsp_add_v2r8(velecsum,velec);
953
954             fscal            = felec;
955
956             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
957
958             /* Update vectorial force */
959             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
960             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
961             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
962             
963             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
964             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
965             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
966
967             /**************************
968              * CALCULATE INTERACTIONS *
969              **************************/
970
971             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
972
973             /* EWALD ELECTROSTATICS */
974
975             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
976             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
977             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
978             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
979             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
980
981             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
982             ewtabD           = _fjsp_setzero_v2r8();
983             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
984             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
985             ewtabFn          = _fjsp_setzero_v2r8();
986             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
987             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
988             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
989             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
990             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
991
992             /* Update potential sum for this i atom from the interaction with this j atom. */
993             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
994             velecsum         = _fjsp_add_v2r8(velecsum,velec);
995
996             fscal            = felec;
997
998             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
999
1000             /* Update vectorial force */
1001             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1002             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1003             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1004             
1005             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1006             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1007             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1008
1009             /**************************
1010              * CALCULATE INTERACTIONS *
1011              **************************/
1012
1013             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1014
1015             /* EWALD ELECTROSTATICS */
1016
1017             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1019             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1020             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1021             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1022
1023             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1024             ewtabD           = _fjsp_setzero_v2r8();
1025             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1026             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1027             ewtabFn          = _fjsp_setzero_v2r8();
1028             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1029             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1030             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1031             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1032             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1033
1034             /* Update potential sum for this i atom from the interaction with this j atom. */
1035             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1036             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1037
1038             fscal            = felec;
1039
1040             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1041
1042             /* Update vectorial force */
1043             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1044             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1045             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1046             
1047             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1048             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1049             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1050
1051             /**************************
1052              * CALCULATE INTERACTIONS *
1053              **************************/
1054
1055             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
1056
1057             /* EWALD ELECTROSTATICS */
1058
1059             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
1061             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1062             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1063             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1064
1065             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1066             ewtabD           = _fjsp_setzero_v2r8();
1067             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1068             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1069             ewtabFn          = _fjsp_setzero_v2r8();
1070             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1071             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1072             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1073             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
1074             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1075
1076             /* Update potential sum for this i atom from the interaction with this j atom. */
1077             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1078             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1079
1080             fscal            = felec;
1081
1082             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1083
1084             /* Update vectorial force */
1085             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
1086             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1087             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1088             
1089             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1090             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1091             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1092
1093             /**************************
1094              * CALCULATE INTERACTIONS *
1095              **************************/
1096
1097             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
1098
1099             /* EWALD ELECTROSTATICS */
1100
1101             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1102             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
1103             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1104             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1105             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1106
1107             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1108             ewtabD           = _fjsp_setzero_v2r8();
1109             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1110             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1111             ewtabFn          = _fjsp_setzero_v2r8();
1112             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1113             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1114             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1115             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
1116             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1117
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1120             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1125
1126             /* Update vectorial force */
1127             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
1128             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1129             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1130             
1131             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1132             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1133             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1134
1135             /**************************
1136              * CALCULATE INTERACTIONS *
1137              **************************/
1138
1139             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
1140
1141             /* EWALD ELECTROSTATICS */
1142
1143             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1144             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
1145             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1146             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1147             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1148
1149             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1150             ewtabD           = _fjsp_setzero_v2r8();
1151             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1152             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1153             ewtabFn          = _fjsp_setzero_v2r8();
1154             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1155             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1156             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1157             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
1158             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1162             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1167
1168             /* Update vectorial force */
1169             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
1170             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1171             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1172             
1173             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1174             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1175             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1176
1177             /**************************
1178              * CALCULATE INTERACTIONS *
1179              **************************/
1180
1181             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
1182
1183             /* EWALD ELECTROSTATICS */
1184
1185             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1186             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
1187             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1188             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1189             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1190
1191             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1192             ewtabD           = _fjsp_setzero_v2r8();
1193             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1194             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1195             ewtabFn          = _fjsp_setzero_v2r8();
1196             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1197             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1198             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1199             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
1200             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1201
1202             /* Update potential sum for this i atom from the interaction with this j atom. */
1203             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1204             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1205
1206             fscal            = felec;
1207
1208             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1209
1210             /* Update vectorial force */
1211             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
1212             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1213             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1214             
1215             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1216             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1217             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1218
1219             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1220
1221             /* Inner loop uses 449 flops */
1222         }
1223
1224         /* End of innermost loop */
1225
1226         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1227                                               f+i_coord_offset,fshift+i_shift_offset);
1228
1229         ggid                        = gid[iidx];
1230         /* Update potential energies */
1231         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1232         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1233
1234         /* Increment number of inner iterations */
1235         inneriter                  += j_index_end - j_index_start;
1236
1237         /* Outer loop uses 26 flops */
1238     }
1239
1240     /* Increment number of outer iterations */
1241     outeriter        += nri;
1242
1243     /* Update outer/inner flops */
1244
1245     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*449);
1246 }
1247 /*
1248  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double
1249  * Electrostatics interaction: Ewald
1250  * VdW interaction:            LJEwald
1251  * Geometry:                   Water4-Water4
1252  * Calculate force/pot:        Force
1253  */
1254 void
1255 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double
1256                     (t_nblist                    * gmx_restrict       nlist,
1257                      rvec                        * gmx_restrict          xx,
1258                      rvec                        * gmx_restrict          ff,
1259                      t_forcerec                  * gmx_restrict          fr,
1260                      t_mdatoms                   * gmx_restrict     mdatoms,
1261                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1262                      t_nrnb                      * gmx_restrict        nrnb)
1263 {
1264     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1265      * just 0 for non-waters.
1266      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1267      * jnr indices corresponding to data put in the four positions in the SIMD register.
1268      */
1269     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1270     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1271     int              jnrA,jnrB;
1272     int              j_coord_offsetA,j_coord_offsetB;
1273     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1274     real             rcutoff_scalar;
1275     real             *shiftvec,*fshift,*x,*f;
1276     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1277     int              vdwioffset0;
1278     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1279     int              vdwioffset1;
1280     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1281     int              vdwioffset2;
1282     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1283     int              vdwioffset3;
1284     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1285     int              vdwjidx0A,vdwjidx0B;
1286     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1287     int              vdwjidx1A,vdwjidx1B;
1288     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1289     int              vdwjidx2A,vdwjidx2B;
1290     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1291     int              vdwjidx3A,vdwjidx3B;
1292     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1293     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1294     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1295     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1296     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1297     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1298     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1299     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1300     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1301     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1302     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1303     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
1304     real             *charge;
1305     int              nvdwtype;
1306     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1307     int              *vdwtype;
1308     real             *vdwparam;
1309     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
1310     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1311     _fjsp_v2r8           c6grid_00;
1312     _fjsp_v2r8           c6grid_11;
1313     _fjsp_v2r8           c6grid_12;
1314     _fjsp_v2r8           c6grid_13;
1315     _fjsp_v2r8           c6grid_21;
1316     _fjsp_v2r8           c6grid_22;
1317     _fjsp_v2r8           c6grid_23;
1318     _fjsp_v2r8           c6grid_31;
1319     _fjsp_v2r8           c6grid_32;
1320     _fjsp_v2r8           c6grid_33;
1321     real                 *vdwgridparam;
1322     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1323     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
1324     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
1325     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1326     real             *ewtab;
1327     _fjsp_v2r8       itab_tmp;
1328     _fjsp_v2r8       dummy_mask,cutoff_mask;
1329     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
1330     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
1331     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1332
1333     x                = xx[0];
1334     f                = ff[0];
1335
1336     nri              = nlist->nri;
1337     iinr             = nlist->iinr;
1338     jindex           = nlist->jindex;
1339     jjnr             = nlist->jjnr;
1340     shiftidx         = nlist->shift;
1341     gid              = nlist->gid;
1342     shiftvec         = fr->shift_vec[0];
1343     fshift           = fr->fshift[0];
1344     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
1345     charge           = mdatoms->chargeA;
1346     nvdwtype         = fr->ntype;
1347     vdwparam         = fr->nbfp;
1348     vdwtype          = mdatoms->typeA;
1349     vdwgridparam     = fr->ljpme_c6grid;
1350     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
1351     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
1352     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
1353
1354     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1355     ewtab            = fr->ic->tabq_coul_F;
1356     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1357     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1358
1359     /* Setup water-specific parameters */
1360     inr              = nlist->iinr[0];
1361     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1362     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1363     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
1364     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1365
1366     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
1367     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
1368     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
1369     vdwjidx0A        = 2*vdwtype[inr+0];
1370     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1371     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1372     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
1373     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1374     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1375     qq13             = _fjsp_mul_v2r8(iq1,jq3);
1376     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1377     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1378     qq23             = _fjsp_mul_v2r8(iq2,jq3);
1379     qq31             = _fjsp_mul_v2r8(iq3,jq1);
1380     qq32             = _fjsp_mul_v2r8(iq3,jq2);
1381     qq33             = _fjsp_mul_v2r8(iq3,jq3);
1382
1383     /* Avoid stupid compiler warnings */
1384     jnrA = jnrB = 0;
1385     j_coord_offsetA = 0;
1386     j_coord_offsetB = 0;
1387
1388     outeriter        = 0;
1389     inneriter        = 0;
1390
1391     /* Start outer loop over neighborlists */
1392     for(iidx=0; iidx<nri; iidx++)
1393     {
1394         /* Load shift vector for this list */
1395         i_shift_offset   = DIM*shiftidx[iidx];
1396
1397         /* Load limits for loop over neighbors */
1398         j_index_start    = jindex[iidx];
1399         j_index_end      = jindex[iidx+1];
1400
1401         /* Get outer coordinate index */
1402         inr              = iinr[iidx];
1403         i_coord_offset   = DIM*inr;
1404
1405         /* Load i particle coords and add shift vector */
1406         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1407                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1408
1409         fix0             = _fjsp_setzero_v2r8();
1410         fiy0             = _fjsp_setzero_v2r8();
1411         fiz0             = _fjsp_setzero_v2r8();
1412         fix1             = _fjsp_setzero_v2r8();
1413         fiy1             = _fjsp_setzero_v2r8();
1414         fiz1             = _fjsp_setzero_v2r8();
1415         fix2             = _fjsp_setzero_v2r8();
1416         fiy2             = _fjsp_setzero_v2r8();
1417         fiz2             = _fjsp_setzero_v2r8();
1418         fix3             = _fjsp_setzero_v2r8();
1419         fiy3             = _fjsp_setzero_v2r8();
1420         fiz3             = _fjsp_setzero_v2r8();
1421
1422         /* Start inner kernel loop */
1423         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1424         {
1425
1426             /* Get j neighbor index, and coordinate index */
1427             jnrA             = jjnr[jidx];
1428             jnrB             = jjnr[jidx+1];
1429             j_coord_offsetA  = DIM*jnrA;
1430             j_coord_offsetB  = DIM*jnrB;
1431
1432             /* load j atom coordinates */
1433             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1434                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1435                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1436
1437             /* Calculate displacement vector */
1438             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1439             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1440             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1441             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1442             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1443             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1444             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1445             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1446             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1447             dx13             = _fjsp_sub_v2r8(ix1,jx3);
1448             dy13             = _fjsp_sub_v2r8(iy1,jy3);
1449             dz13             = _fjsp_sub_v2r8(iz1,jz3);
1450             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1451             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1452             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1453             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1454             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1455             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1456             dx23             = _fjsp_sub_v2r8(ix2,jx3);
1457             dy23             = _fjsp_sub_v2r8(iy2,jy3);
1458             dz23             = _fjsp_sub_v2r8(iz2,jz3);
1459             dx31             = _fjsp_sub_v2r8(ix3,jx1);
1460             dy31             = _fjsp_sub_v2r8(iy3,jy1);
1461             dz31             = _fjsp_sub_v2r8(iz3,jz1);
1462             dx32             = _fjsp_sub_v2r8(ix3,jx2);
1463             dy32             = _fjsp_sub_v2r8(iy3,jy2);
1464             dz32             = _fjsp_sub_v2r8(iz3,jz2);
1465             dx33             = _fjsp_sub_v2r8(ix3,jx3);
1466             dy33             = _fjsp_sub_v2r8(iy3,jy3);
1467             dz33             = _fjsp_sub_v2r8(iz3,jz3);
1468
1469             /* Calculate squared distance and things based on it */
1470             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1471             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1472             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1473             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1474             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1475             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1476             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1477             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1478             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1479             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1480
1481             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1482             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1483             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1484             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
1485             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1486             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1487             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
1488             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
1489             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
1490             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
1491
1492             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1493             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1494             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1495             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
1496             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1497             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1498             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
1499             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
1500             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
1501             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
1502
1503             fjx0             = _fjsp_setzero_v2r8();
1504             fjy0             = _fjsp_setzero_v2r8();
1505             fjz0             = _fjsp_setzero_v2r8();
1506             fjx1             = _fjsp_setzero_v2r8();
1507             fjy1             = _fjsp_setzero_v2r8();
1508             fjz1             = _fjsp_setzero_v2r8();
1509             fjx2             = _fjsp_setzero_v2r8();
1510             fjy2             = _fjsp_setzero_v2r8();
1511             fjz2             = _fjsp_setzero_v2r8();
1512             fjx3             = _fjsp_setzero_v2r8();
1513             fjy3             = _fjsp_setzero_v2r8();
1514             fjz3             = _fjsp_setzero_v2r8();
1515
1516             /**************************
1517              * CALCULATE INTERACTIONS *
1518              **************************/
1519
1520             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1521
1522             /* Analytical LJ-PME */
1523             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1524             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
1525             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
1526             exponent         = gmx_simd_exp_d(ewcljrsq);
1527             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1528             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
1529             /* f6A = 6 * C6grid * (1 - poly) */
1530             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_sub_v2r8(one,poly));
1531             /* f6B = C6grid * exponent * beta^6 */
1532             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
1533             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1534             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1535
1536             fscal            = fvdw;
1537
1538             /* Update vectorial force */
1539             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1540             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1541             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1542             
1543             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1544             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1545             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1546
1547             /**************************
1548              * CALCULATE INTERACTIONS *
1549              **************************/
1550
1551             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1552
1553             /* EWALD ELECTROSTATICS */
1554
1555             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1556             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1557             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1558             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1559             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1560
1561             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1562                                          &ewtabF,&ewtabFn);
1563             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1564             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1565
1566             fscal            = felec;
1567
1568             /* Update vectorial force */
1569             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1570             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1571             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1572             
1573             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1574             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1575             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1576
1577             /**************************
1578              * CALCULATE INTERACTIONS *
1579              **************************/
1580
1581             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1582
1583             /* EWALD ELECTROSTATICS */
1584
1585             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1586             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1587             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1588             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1589             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1590
1591             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1592                                          &ewtabF,&ewtabFn);
1593             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1594             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1595
1596             fscal            = felec;
1597
1598             /* Update vectorial force */
1599             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1600             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1601             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1602             
1603             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1604             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1605             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1606
1607             /**************************
1608              * CALCULATE INTERACTIONS *
1609              **************************/
1610
1611             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
1612
1613             /* EWALD ELECTROSTATICS */
1614
1615             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1616             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
1617             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1618             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1619             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1620
1621             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1622                                          &ewtabF,&ewtabFn);
1623             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1624             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
1625
1626             fscal            = felec;
1627
1628             /* Update vectorial force */
1629             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
1630             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
1631             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
1632             
1633             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
1634             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
1635             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
1636
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1642
1643             /* EWALD ELECTROSTATICS */
1644
1645             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1646             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1647             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1648             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1649             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1650
1651             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1652                                          &ewtabF,&ewtabFn);
1653             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1654             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1655
1656             fscal            = felec;
1657
1658             /* Update vectorial force */
1659             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1660             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1661             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1662             
1663             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1664             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1665             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1666
1667             /**************************
1668              * CALCULATE INTERACTIONS *
1669              **************************/
1670
1671             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1672
1673             /* EWALD ELECTROSTATICS */
1674
1675             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1676             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1677             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1678             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1679             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1680
1681             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1682                                          &ewtabF,&ewtabFn);
1683             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1684             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1685
1686             fscal            = felec;
1687
1688             /* Update vectorial force */
1689             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1690             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1691             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1692             
1693             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1694             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1695             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1696
1697             /**************************
1698              * CALCULATE INTERACTIONS *
1699              **************************/
1700
1701             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
1702
1703             /* EWALD ELECTROSTATICS */
1704
1705             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1706             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
1707             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1708             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1709             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1710
1711             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1712                                          &ewtabF,&ewtabFn);
1713             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1714             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1715
1716             fscal            = felec;
1717
1718             /* Update vectorial force */
1719             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
1720             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1721             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1722             
1723             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1724             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1725             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1726
1727             /**************************
1728              * CALCULATE INTERACTIONS *
1729              **************************/
1730
1731             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
1732
1733             /* EWALD ELECTROSTATICS */
1734
1735             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
1737             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1738             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1739             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1740
1741             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1742                                          &ewtabF,&ewtabFn);
1743             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1744             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1745
1746             fscal            = felec;
1747
1748             /* Update vectorial force */
1749             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
1750             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1751             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1752             
1753             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1754             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1755             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1756
1757             /**************************
1758              * CALCULATE INTERACTIONS *
1759              **************************/
1760
1761             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
1762
1763             /* EWALD ELECTROSTATICS */
1764
1765             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1766             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
1767             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1768             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1769             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1770
1771             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1772                                          &ewtabF,&ewtabFn);
1773             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1774             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1775
1776             fscal            = felec;
1777
1778             /* Update vectorial force */
1779             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
1780             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1781             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1782             
1783             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1784             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1785             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1786
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
1792
1793             /* EWALD ELECTROSTATICS */
1794
1795             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1796             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
1797             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1798             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1799             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1800
1801             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1802                                          &ewtabF,&ewtabFn);
1803             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1804             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1805
1806             fscal            = felec;
1807
1808             /* Update vectorial force */
1809             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
1810             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1811             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1812             
1813             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1814             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1815             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1816
1817             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1818
1819             /* Inner loop uses 402 flops */
1820         }
1821
1822         if(jidx<j_index_end)
1823         {
1824
1825             jnrA             = jjnr[jidx];
1826             j_coord_offsetA  = DIM*jnrA;
1827
1828             /* load j atom coordinates */
1829             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
1830                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1831                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1832
1833             /* Calculate displacement vector */
1834             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1835             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1836             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1837             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1838             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1839             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1840             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1841             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1842             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1843             dx13             = _fjsp_sub_v2r8(ix1,jx3);
1844             dy13             = _fjsp_sub_v2r8(iy1,jy3);
1845             dz13             = _fjsp_sub_v2r8(iz1,jz3);
1846             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1847             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1848             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1849             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1850             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1851             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1852             dx23             = _fjsp_sub_v2r8(ix2,jx3);
1853             dy23             = _fjsp_sub_v2r8(iy2,jy3);
1854             dz23             = _fjsp_sub_v2r8(iz2,jz3);
1855             dx31             = _fjsp_sub_v2r8(ix3,jx1);
1856             dy31             = _fjsp_sub_v2r8(iy3,jy1);
1857             dz31             = _fjsp_sub_v2r8(iz3,jz1);
1858             dx32             = _fjsp_sub_v2r8(ix3,jx2);
1859             dy32             = _fjsp_sub_v2r8(iy3,jy2);
1860             dz32             = _fjsp_sub_v2r8(iz3,jz2);
1861             dx33             = _fjsp_sub_v2r8(ix3,jx3);
1862             dy33             = _fjsp_sub_v2r8(iy3,jy3);
1863             dz33             = _fjsp_sub_v2r8(iz3,jz3);
1864
1865             /* Calculate squared distance and things based on it */
1866             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1867             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1868             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1869             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1870             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1871             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1872             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1873             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1874             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1875             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1876
1877             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1878             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1879             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1880             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
1881             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1882             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1883             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
1884             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
1885             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
1886             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
1887
1888             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1889             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1890             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1891             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
1892             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1893             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1894             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
1895             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
1896             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
1897             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
1898
1899             fjx0             = _fjsp_setzero_v2r8();
1900             fjy0             = _fjsp_setzero_v2r8();
1901             fjz0             = _fjsp_setzero_v2r8();
1902             fjx1             = _fjsp_setzero_v2r8();
1903             fjy1             = _fjsp_setzero_v2r8();
1904             fjz1             = _fjsp_setzero_v2r8();
1905             fjx2             = _fjsp_setzero_v2r8();
1906             fjy2             = _fjsp_setzero_v2r8();
1907             fjz2             = _fjsp_setzero_v2r8();
1908             fjx3             = _fjsp_setzero_v2r8();
1909             fjy3             = _fjsp_setzero_v2r8();
1910             fjz3             = _fjsp_setzero_v2r8();
1911
1912             /**************************
1913              * CALCULATE INTERACTIONS *
1914              **************************/
1915
1916             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1917
1918             /* Analytical LJ-PME */
1919             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1920             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
1921             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
1922             exponent         = gmx_simd_exp_d(ewcljrsq);
1923             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1924             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
1925             /* f6A = 6 * C6grid * (1 - poly) */
1926             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_sub_v2r8(one,poly));
1927             /* f6B = C6grid * exponent * beta^6 */
1928             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
1929             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1930             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1931
1932             fscal            = fvdw;
1933
1934             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1935
1936             /* Update vectorial force */
1937             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1938             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1939             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1940             
1941             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1942             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1943             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1944
1945             /**************************
1946              * CALCULATE INTERACTIONS *
1947              **************************/
1948
1949             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1950
1951             /* EWALD ELECTROSTATICS */
1952
1953             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1954             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1955             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1956             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1957             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1958
1959             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1960             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1961             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1962
1963             fscal            = felec;
1964
1965             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1966
1967             /* Update vectorial force */
1968             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1969             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1970             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1971             
1972             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1973             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1974             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1975
1976             /**************************
1977              * CALCULATE INTERACTIONS *
1978              **************************/
1979
1980             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1981
1982             /* EWALD ELECTROSTATICS */
1983
1984             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1985             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1986             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1987             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1988             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1989
1990             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1991             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1992             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1993
1994             fscal            = felec;
1995
1996             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1997
1998             /* Update vectorial force */
1999             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
2000             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2001             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2002             
2003             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2004             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2005             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2006
2007             /**************************
2008              * CALCULATE INTERACTIONS *
2009              **************************/
2010
2011             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
2012
2013             /* EWALD ELECTROSTATICS */
2014
2015             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2016             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
2017             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2018             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2019             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2020
2021             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2022             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2023             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
2024
2025             fscal            = felec;
2026
2027             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2028
2029             /* Update vectorial force */
2030             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
2031             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
2032             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
2033             
2034             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
2035             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
2036             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
2037
2038             /**************************
2039              * CALCULATE INTERACTIONS *
2040              **************************/
2041
2042             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2043
2044             /* EWALD ELECTROSTATICS */
2045
2046             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2047             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2048             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2049             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2050             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2051
2052             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2053             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2054             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2055
2056             fscal            = felec;
2057
2058             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2059
2060             /* Update vectorial force */
2061             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2062             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2063             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2064             
2065             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2066             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2067             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2068
2069             /**************************
2070              * CALCULATE INTERACTIONS *
2071              **************************/
2072
2073             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2074
2075             /* EWALD ELECTROSTATICS */
2076
2077             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2078             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2079             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2080             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2081             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2082
2083             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2084             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2085             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2086
2087             fscal            = felec;
2088
2089             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2090
2091             /* Update vectorial force */
2092             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2093             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2094             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2095             
2096             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2097             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2098             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2099
2100             /**************************
2101              * CALCULATE INTERACTIONS *
2102              **************************/
2103
2104             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
2105
2106             /* EWALD ELECTROSTATICS */
2107
2108             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2109             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
2110             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2111             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2112             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2113
2114             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2115             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2116             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
2117
2118             fscal            = felec;
2119
2120             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2121
2122             /* Update vectorial force */
2123             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
2124             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
2125             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
2126             
2127             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
2128             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
2129             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
2130
2131             /**************************
2132              * CALCULATE INTERACTIONS *
2133              **************************/
2134
2135             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
2136
2137             /* EWALD ELECTROSTATICS */
2138
2139             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2140             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
2141             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2142             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2143             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2144
2145             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2146             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2147             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
2148
2149             fscal            = felec;
2150
2151             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2152
2153             /* Update vectorial force */
2154             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
2155             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
2156             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
2157             
2158             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
2159             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
2160             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
2161
2162             /**************************
2163              * CALCULATE INTERACTIONS *
2164              **************************/
2165
2166             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
2167
2168             /* EWALD ELECTROSTATICS */
2169
2170             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2171             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
2172             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2173             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2174             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2175
2176             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2177             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2178             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
2179
2180             fscal            = felec;
2181
2182             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2183
2184             /* Update vectorial force */
2185             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
2186             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
2187             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
2188             
2189             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
2190             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
2191             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
2192
2193             /**************************
2194              * CALCULATE INTERACTIONS *
2195              **************************/
2196
2197             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
2198
2199             /* EWALD ELECTROSTATICS */
2200
2201             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2202             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
2203             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2204             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2205             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2206
2207             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2208             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2209             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
2210
2211             fscal            = felec;
2212
2213             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2214
2215             /* Update vectorial force */
2216             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
2217             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
2218             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
2219             
2220             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
2221             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
2222             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
2223
2224             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2225
2226             /* Inner loop uses 402 flops */
2227         }
2228
2229         /* End of innermost loop */
2230
2231         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2232                                               f+i_coord_offset,fshift+i_shift_offset);
2233
2234         /* Increment number of inner iterations */
2235         inneriter                  += j_index_end - j_index_start;
2236
2237         /* Outer loop uses 24 flops */
2238     }
2239
2240     /* Increment number of outer iterations */
2241     outeriter        += nri;
2242
2243     /* Update outer/inner flops */
2244
2245     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*402);
2246 }