Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "kernelutil_sparc64_hpc_ace_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LJEwald
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      t_forcerec                  * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwioffset3;
86     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87     int              vdwjidx0A,vdwjidx0B;
88     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     int              vdwjidx1A,vdwjidx1B;
90     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B;
92     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     int              vdwjidx3A,vdwjidx3B;
94     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
106     real             *charge;
107     int              nvdwtype;
108     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109     int              *vdwtype;
110     real             *vdwparam;
111     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
112     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
113     _fjsp_v2r8           c6grid_00;
114     _fjsp_v2r8           c6grid_11;
115     _fjsp_v2r8           c6grid_12;
116     _fjsp_v2r8           c6grid_13;
117     _fjsp_v2r8           c6grid_21;
118     _fjsp_v2r8           c6grid_22;
119     _fjsp_v2r8           c6grid_23;
120     _fjsp_v2r8           c6grid_31;
121     _fjsp_v2r8           c6grid_32;
122     _fjsp_v2r8           c6grid_33;
123     real                 *vdwgridparam;
124     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
125     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
126     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
127     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
128     real             *ewtab;
129     _fjsp_v2r8       itab_tmp;
130     _fjsp_v2r8       dummy_mask,cutoff_mask;
131     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
132     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
133     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
134
135     x                = xx[0];
136     f                = ff[0];
137
138     nri              = nlist->nri;
139     iinr             = nlist->iinr;
140     jindex           = nlist->jindex;
141     jjnr             = nlist->jjnr;
142     shiftidx         = nlist->shift;
143     gid              = nlist->gid;
144     shiftvec         = fr->shift_vec[0];
145     fshift           = fr->fshift[0];
146     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
147     charge           = mdatoms->chargeA;
148     nvdwtype         = fr->ntype;
149     vdwparam         = fr->nbfp;
150     vdwtype          = mdatoms->typeA;
151     vdwgridparam     = fr->ljpme_c6grid;
152     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
153     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
154     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
155
156     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
157     ewtab            = fr->ic->tabq_coul_FDV0;
158     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
159     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
160
161     /* Setup water-specific parameters */
162     inr              = nlist->iinr[0];
163     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
164     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
165     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
166     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
167
168     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
169     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
170     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
171     vdwjidx0A        = 2*vdwtype[inr+0];
172     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
173     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
174     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
175     qq11             = _fjsp_mul_v2r8(iq1,jq1);
176     qq12             = _fjsp_mul_v2r8(iq1,jq2);
177     qq13             = _fjsp_mul_v2r8(iq1,jq3);
178     qq21             = _fjsp_mul_v2r8(iq2,jq1);
179     qq22             = _fjsp_mul_v2r8(iq2,jq2);
180     qq23             = _fjsp_mul_v2r8(iq2,jq3);
181     qq31             = _fjsp_mul_v2r8(iq3,jq1);
182     qq32             = _fjsp_mul_v2r8(iq3,jq2);
183     qq33             = _fjsp_mul_v2r8(iq3,jq3);
184
185     /* Avoid stupid compiler warnings */
186     jnrA = jnrB = 0;
187     j_coord_offsetA = 0;
188     j_coord_offsetB = 0;
189
190     outeriter        = 0;
191     inneriter        = 0;
192
193     /* Start outer loop over neighborlists */
194     for(iidx=0; iidx<nri; iidx++)
195     {
196         /* Load shift vector for this list */
197         i_shift_offset   = DIM*shiftidx[iidx];
198
199         /* Load limits for loop over neighbors */
200         j_index_start    = jindex[iidx];
201         j_index_end      = jindex[iidx+1];
202
203         /* Get outer coordinate index */
204         inr              = iinr[iidx];
205         i_coord_offset   = DIM*inr;
206
207         /* Load i particle coords and add shift vector */
208         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
209                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
210
211         fix0             = _fjsp_setzero_v2r8();
212         fiy0             = _fjsp_setzero_v2r8();
213         fiz0             = _fjsp_setzero_v2r8();
214         fix1             = _fjsp_setzero_v2r8();
215         fiy1             = _fjsp_setzero_v2r8();
216         fiz1             = _fjsp_setzero_v2r8();
217         fix2             = _fjsp_setzero_v2r8();
218         fiy2             = _fjsp_setzero_v2r8();
219         fiz2             = _fjsp_setzero_v2r8();
220         fix3             = _fjsp_setzero_v2r8();
221         fiy3             = _fjsp_setzero_v2r8();
222         fiz3             = _fjsp_setzero_v2r8();
223
224         /* Reset potential sums */
225         velecsum         = _fjsp_setzero_v2r8();
226         vvdwsum          = _fjsp_setzero_v2r8();
227
228         /* Start inner kernel loop */
229         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
230         {
231
232             /* Get j neighbor index, and coordinate index */
233             jnrA             = jjnr[jidx];
234             jnrB             = jjnr[jidx+1];
235             j_coord_offsetA  = DIM*jnrA;
236             j_coord_offsetB  = DIM*jnrB;
237
238             /* load j atom coordinates */
239             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241                                               &jy2,&jz2,&jx3,&jy3,&jz3);
242
243             /* Calculate displacement vector */
244             dx00             = _fjsp_sub_v2r8(ix0,jx0);
245             dy00             = _fjsp_sub_v2r8(iy0,jy0);
246             dz00             = _fjsp_sub_v2r8(iz0,jz0);
247             dx11             = _fjsp_sub_v2r8(ix1,jx1);
248             dy11             = _fjsp_sub_v2r8(iy1,jy1);
249             dz11             = _fjsp_sub_v2r8(iz1,jz1);
250             dx12             = _fjsp_sub_v2r8(ix1,jx2);
251             dy12             = _fjsp_sub_v2r8(iy1,jy2);
252             dz12             = _fjsp_sub_v2r8(iz1,jz2);
253             dx13             = _fjsp_sub_v2r8(ix1,jx3);
254             dy13             = _fjsp_sub_v2r8(iy1,jy3);
255             dz13             = _fjsp_sub_v2r8(iz1,jz3);
256             dx21             = _fjsp_sub_v2r8(ix2,jx1);
257             dy21             = _fjsp_sub_v2r8(iy2,jy1);
258             dz21             = _fjsp_sub_v2r8(iz2,jz1);
259             dx22             = _fjsp_sub_v2r8(ix2,jx2);
260             dy22             = _fjsp_sub_v2r8(iy2,jy2);
261             dz22             = _fjsp_sub_v2r8(iz2,jz2);
262             dx23             = _fjsp_sub_v2r8(ix2,jx3);
263             dy23             = _fjsp_sub_v2r8(iy2,jy3);
264             dz23             = _fjsp_sub_v2r8(iz2,jz3);
265             dx31             = _fjsp_sub_v2r8(ix3,jx1);
266             dy31             = _fjsp_sub_v2r8(iy3,jy1);
267             dz31             = _fjsp_sub_v2r8(iz3,jz1);
268             dx32             = _fjsp_sub_v2r8(ix3,jx2);
269             dy32             = _fjsp_sub_v2r8(iy3,jy2);
270             dz32             = _fjsp_sub_v2r8(iz3,jz2);
271             dx33             = _fjsp_sub_v2r8(ix3,jx3);
272             dy33             = _fjsp_sub_v2r8(iy3,jy3);
273             dz33             = _fjsp_sub_v2r8(iz3,jz3);
274
275             /* Calculate squared distance and things based on it */
276             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
277             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
278             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
279             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
280             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
281             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
282             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
283             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
284             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
285             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
286
287             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
288             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
289             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
290             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
291             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
292             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
293             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
294             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
295             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
296             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
297
298             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
299             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
300             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
301             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
302             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
303             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
304             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
305             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
306             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
307             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
308
309             fjx0             = _fjsp_setzero_v2r8();
310             fjy0             = _fjsp_setzero_v2r8();
311             fjz0             = _fjsp_setzero_v2r8();
312             fjx1             = _fjsp_setzero_v2r8();
313             fjy1             = _fjsp_setzero_v2r8();
314             fjz1             = _fjsp_setzero_v2r8();
315             fjx2             = _fjsp_setzero_v2r8();
316             fjy2             = _fjsp_setzero_v2r8();
317             fjz2             = _fjsp_setzero_v2r8();
318             fjx3             = _fjsp_setzero_v2r8();
319             fjy3             = _fjsp_setzero_v2r8();
320             fjz3             = _fjsp_setzero_v2r8();
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
327
328             /* Analytical LJ-PME */
329             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
330             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
331             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
332             exponent         = gmx_simd_exp_d(-ewcljrsq);
333             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
334             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
335             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
336             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
337             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
338             vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
339             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
340             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
341
342             /* Update potential sum for this i atom from the interaction with this j atom. */
343             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
344
345             fscal            = fvdw;
346
347             /* Update vectorial force */
348             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
349             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
350             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
351             
352             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
353             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
354             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
355
356             /**************************
357              * CALCULATE INTERACTIONS *
358              **************************/
359
360             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
361
362             /* EWALD ELECTROSTATICS */
363
364             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
365             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
366             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
367             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
368             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
369
370             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
371             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
372             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
373             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
374             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
375             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
376             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
377             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
378             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
379             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
380
381             /* Update potential sum for this i atom from the interaction with this j atom. */
382             velecsum         = _fjsp_add_v2r8(velecsum,velec);
383
384             fscal            = felec;
385
386             /* Update vectorial force */
387             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
388             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
389             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
390             
391             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
392             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
393             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
394
395             /**************************
396              * CALCULATE INTERACTIONS *
397              **************************/
398
399             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
400
401             /* EWALD ELECTROSTATICS */
402
403             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
404             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
405             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
406             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
407             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
408
409             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
410             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
411             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
412             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
413             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
414             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
415             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
416             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
417             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
418             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
419
420             /* Update potential sum for this i atom from the interaction with this j atom. */
421             velecsum         = _fjsp_add_v2r8(velecsum,velec);
422
423             fscal            = felec;
424
425             /* Update vectorial force */
426             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
427             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
428             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
429             
430             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
431             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
432             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
433
434             /**************************
435              * CALCULATE INTERACTIONS *
436              **************************/
437
438             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
439
440             /* EWALD ELECTROSTATICS */
441
442             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
443             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
444             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
445             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
446             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
447
448             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
449             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
450             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
451             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
452             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
453             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
454             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
455             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
456             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
457             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
458
459             /* Update potential sum for this i atom from the interaction with this j atom. */
460             velecsum         = _fjsp_add_v2r8(velecsum,velec);
461
462             fscal            = felec;
463
464             /* Update vectorial force */
465             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
466             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
467             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
468             
469             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
470             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
471             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
478
479             /* EWALD ELECTROSTATICS */
480
481             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
482             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
483             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
484             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
485             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
486
487             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
488             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
489             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
490             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
491             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
492             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
493             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
494             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
495             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
496             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
497
498             /* Update potential sum for this i atom from the interaction with this j atom. */
499             velecsum         = _fjsp_add_v2r8(velecsum,velec);
500
501             fscal            = felec;
502
503             /* Update vectorial force */
504             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
505             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
506             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
507             
508             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
509             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
510             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
511
512             /**************************
513              * CALCULATE INTERACTIONS *
514              **************************/
515
516             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
517
518             /* EWALD ELECTROSTATICS */
519
520             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
522             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
523             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
524             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
525
526             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
527             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
528             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
529             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
530             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
531             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
532             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
533             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
534             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
535             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
536
537             /* Update potential sum for this i atom from the interaction with this j atom. */
538             velecsum         = _fjsp_add_v2r8(velecsum,velec);
539
540             fscal            = felec;
541
542             /* Update vectorial force */
543             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
544             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
545             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
546             
547             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
548             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
549             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
550
551             /**************************
552              * CALCULATE INTERACTIONS *
553              **************************/
554
555             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
556
557             /* EWALD ELECTROSTATICS */
558
559             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
560             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
561             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
562             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
563             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
564
565             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
566             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
567             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
568             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
569             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
570             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
571             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
572             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
573             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
574             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
575
576             /* Update potential sum for this i atom from the interaction with this j atom. */
577             velecsum         = _fjsp_add_v2r8(velecsum,velec);
578
579             fscal            = felec;
580
581             /* Update vectorial force */
582             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
583             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
584             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
585             
586             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
587             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
588             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
589
590             /**************************
591              * CALCULATE INTERACTIONS *
592              **************************/
593
594             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
595
596             /* EWALD ELECTROSTATICS */
597
598             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
599             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
600             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
601             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
602             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
603
604             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
605             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
606             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
607             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
608             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
609             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
610             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
611             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
612             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
613             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
614
615             /* Update potential sum for this i atom from the interaction with this j atom. */
616             velecsum         = _fjsp_add_v2r8(velecsum,velec);
617
618             fscal            = felec;
619
620             /* Update vectorial force */
621             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
622             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
623             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
624             
625             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
626             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
627             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
628
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
634
635             /* EWALD ELECTROSTATICS */
636
637             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
638             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
639             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
640             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
641             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
642
643             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
644             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
645             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
646             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
647             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
648             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
649             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
650             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
651             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
652             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velecsum         = _fjsp_add_v2r8(velecsum,velec);
656
657             fscal            = felec;
658
659             /* Update vectorial force */
660             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
661             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
662             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
663             
664             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
665             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
666             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
667
668             /**************************
669              * CALCULATE INTERACTIONS *
670              **************************/
671
672             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
673
674             /* EWALD ELECTROSTATICS */
675
676             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
677             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
678             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
679             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
680             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
681
682             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
683             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
684             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
685             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
686             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
687             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
688             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
689             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
690             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
691             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
692
693             /* Update potential sum for this i atom from the interaction with this j atom. */
694             velecsum         = _fjsp_add_v2r8(velecsum,velec);
695
696             fscal            = felec;
697
698             /* Update vectorial force */
699             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
700             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
701             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
702             
703             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
704             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
705             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
706
707             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
708
709             /* Inner loop uses 449 flops */
710         }
711
712         if(jidx<j_index_end)
713         {
714
715             jnrA             = jjnr[jidx];
716             j_coord_offsetA  = DIM*jnrA;
717
718             /* load j atom coordinates */
719             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
720                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
721                                               &jy2,&jz2,&jx3,&jy3,&jz3);
722
723             /* Calculate displacement vector */
724             dx00             = _fjsp_sub_v2r8(ix0,jx0);
725             dy00             = _fjsp_sub_v2r8(iy0,jy0);
726             dz00             = _fjsp_sub_v2r8(iz0,jz0);
727             dx11             = _fjsp_sub_v2r8(ix1,jx1);
728             dy11             = _fjsp_sub_v2r8(iy1,jy1);
729             dz11             = _fjsp_sub_v2r8(iz1,jz1);
730             dx12             = _fjsp_sub_v2r8(ix1,jx2);
731             dy12             = _fjsp_sub_v2r8(iy1,jy2);
732             dz12             = _fjsp_sub_v2r8(iz1,jz2);
733             dx13             = _fjsp_sub_v2r8(ix1,jx3);
734             dy13             = _fjsp_sub_v2r8(iy1,jy3);
735             dz13             = _fjsp_sub_v2r8(iz1,jz3);
736             dx21             = _fjsp_sub_v2r8(ix2,jx1);
737             dy21             = _fjsp_sub_v2r8(iy2,jy1);
738             dz21             = _fjsp_sub_v2r8(iz2,jz1);
739             dx22             = _fjsp_sub_v2r8(ix2,jx2);
740             dy22             = _fjsp_sub_v2r8(iy2,jy2);
741             dz22             = _fjsp_sub_v2r8(iz2,jz2);
742             dx23             = _fjsp_sub_v2r8(ix2,jx3);
743             dy23             = _fjsp_sub_v2r8(iy2,jy3);
744             dz23             = _fjsp_sub_v2r8(iz2,jz3);
745             dx31             = _fjsp_sub_v2r8(ix3,jx1);
746             dy31             = _fjsp_sub_v2r8(iy3,jy1);
747             dz31             = _fjsp_sub_v2r8(iz3,jz1);
748             dx32             = _fjsp_sub_v2r8(ix3,jx2);
749             dy32             = _fjsp_sub_v2r8(iy3,jy2);
750             dz32             = _fjsp_sub_v2r8(iz3,jz2);
751             dx33             = _fjsp_sub_v2r8(ix3,jx3);
752             dy33             = _fjsp_sub_v2r8(iy3,jy3);
753             dz33             = _fjsp_sub_v2r8(iz3,jz3);
754
755             /* Calculate squared distance and things based on it */
756             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
757             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
758             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
759             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
760             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
761             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
762             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
763             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
764             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
765             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
766
767             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
768             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
769             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
770             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
771             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
772             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
773             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
774             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
775             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
776             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
777
778             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
779             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
780             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
781             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
782             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
783             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
784             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
785             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
786             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
787             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
788
789             fjx0             = _fjsp_setzero_v2r8();
790             fjy0             = _fjsp_setzero_v2r8();
791             fjz0             = _fjsp_setzero_v2r8();
792             fjx1             = _fjsp_setzero_v2r8();
793             fjy1             = _fjsp_setzero_v2r8();
794             fjz1             = _fjsp_setzero_v2r8();
795             fjx2             = _fjsp_setzero_v2r8();
796             fjy2             = _fjsp_setzero_v2r8();
797             fjz2             = _fjsp_setzero_v2r8();
798             fjx3             = _fjsp_setzero_v2r8();
799             fjy3             = _fjsp_setzero_v2r8();
800             fjz3             = _fjsp_setzero_v2r8();
801
802             /**************************
803              * CALCULATE INTERACTIONS *
804              **************************/
805
806             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
807
808             /* Analytical LJ-PME */
809             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
810             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
811             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
812             exponent         = gmx_simd_exp_d(-ewcljrsq);
813             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
814             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
815             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
816             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
817             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
818             vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
819             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
820             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
821
822             /* Update potential sum for this i atom from the interaction with this j atom. */
823             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
824             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
825
826             fscal            = fvdw;
827
828             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
829
830             /* Update vectorial force */
831             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
832             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
833             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
834             
835             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
836             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
837             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
838
839             /**************************
840              * CALCULATE INTERACTIONS *
841              **************************/
842
843             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
844
845             /* EWALD ELECTROSTATICS */
846
847             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
848             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
849             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
850             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
851             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
852
853             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
854             ewtabD           = _fjsp_setzero_v2r8();
855             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
856             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
857             ewtabFn          = _fjsp_setzero_v2r8();
858             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
859             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
860             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
861             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
862             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
863
864             /* Update potential sum for this i atom from the interaction with this j atom. */
865             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
866             velecsum         = _fjsp_add_v2r8(velecsum,velec);
867
868             fscal            = felec;
869
870             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
871
872             /* Update vectorial force */
873             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
874             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
875             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
876             
877             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
878             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
879             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
880
881             /**************************
882              * CALCULATE INTERACTIONS *
883              **************************/
884
885             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
886
887             /* EWALD ELECTROSTATICS */
888
889             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
890             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
891             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
892             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
893             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
894
895             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
896             ewtabD           = _fjsp_setzero_v2r8();
897             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
898             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
899             ewtabFn          = _fjsp_setzero_v2r8();
900             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
901             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
902             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
903             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
904             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
905
906             /* Update potential sum for this i atom from the interaction with this j atom. */
907             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
908             velecsum         = _fjsp_add_v2r8(velecsum,velec);
909
910             fscal            = felec;
911
912             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
913
914             /* Update vectorial force */
915             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
916             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
917             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
918             
919             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
920             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
921             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
922
923             /**************************
924              * CALCULATE INTERACTIONS *
925              **************************/
926
927             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
928
929             /* EWALD ELECTROSTATICS */
930
931             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
932             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
933             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
934             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
935             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
936
937             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
938             ewtabD           = _fjsp_setzero_v2r8();
939             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
940             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
941             ewtabFn          = _fjsp_setzero_v2r8();
942             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
943             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
944             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
945             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
946             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
947
948             /* Update potential sum for this i atom from the interaction with this j atom. */
949             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
950             velecsum         = _fjsp_add_v2r8(velecsum,velec);
951
952             fscal            = felec;
953
954             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
955
956             /* Update vectorial force */
957             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
958             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
959             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
960             
961             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
962             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
963             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
970
971             /* EWALD ELECTROSTATICS */
972
973             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
974             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
975             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
976             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
977             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
978
979             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
980             ewtabD           = _fjsp_setzero_v2r8();
981             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
982             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
983             ewtabFn          = _fjsp_setzero_v2r8();
984             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
985             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
986             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
987             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
988             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
989
990             /* Update potential sum for this i atom from the interaction with this j atom. */
991             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
992             velecsum         = _fjsp_add_v2r8(velecsum,velec);
993
994             fscal            = felec;
995
996             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
997
998             /* Update vectorial force */
999             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1000             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1001             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1002             
1003             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1004             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1005             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1006
1007             /**************************
1008              * CALCULATE INTERACTIONS *
1009              **************************/
1010
1011             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1012
1013             /* EWALD ELECTROSTATICS */
1014
1015             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1017             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1018             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1019             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1020
1021             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1022             ewtabD           = _fjsp_setzero_v2r8();
1023             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1024             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1025             ewtabFn          = _fjsp_setzero_v2r8();
1026             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1027             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1028             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1029             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1030             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1031
1032             /* Update potential sum for this i atom from the interaction with this j atom. */
1033             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1034             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1035
1036             fscal            = felec;
1037
1038             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1039
1040             /* Update vectorial force */
1041             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1042             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1043             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1044             
1045             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1046             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1047             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
1054
1055             /* EWALD ELECTROSTATICS */
1056
1057             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
1059             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1060             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1061             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1062
1063             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1064             ewtabD           = _fjsp_setzero_v2r8();
1065             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1066             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1067             ewtabFn          = _fjsp_setzero_v2r8();
1068             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1069             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1070             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1071             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
1072             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1073
1074             /* Update potential sum for this i atom from the interaction with this j atom. */
1075             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1076             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1077
1078             fscal            = felec;
1079
1080             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1081
1082             /* Update vectorial force */
1083             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
1084             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1085             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1086             
1087             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1088             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1089             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1090
1091             /**************************
1092              * CALCULATE INTERACTIONS *
1093              **************************/
1094
1095             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
1096
1097             /* EWALD ELECTROSTATICS */
1098
1099             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
1101             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1102             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1103             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1104
1105             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1106             ewtabD           = _fjsp_setzero_v2r8();
1107             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1108             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1109             ewtabFn          = _fjsp_setzero_v2r8();
1110             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1111             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1112             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1113             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
1114             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1115
1116             /* Update potential sum for this i atom from the interaction with this j atom. */
1117             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1118             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1119
1120             fscal            = felec;
1121
1122             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1123
1124             /* Update vectorial force */
1125             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
1126             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1127             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1128             
1129             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1130             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1131             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1132
1133             /**************************
1134              * CALCULATE INTERACTIONS *
1135              **************************/
1136
1137             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
1138
1139             /* EWALD ELECTROSTATICS */
1140
1141             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1142             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
1143             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1144             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1145             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1146
1147             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1148             ewtabD           = _fjsp_setzero_v2r8();
1149             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1150             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1151             ewtabFn          = _fjsp_setzero_v2r8();
1152             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1153             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1154             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1155             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
1156             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1157
1158             /* Update potential sum for this i atom from the interaction with this j atom. */
1159             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1160             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1161
1162             fscal            = felec;
1163
1164             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1165
1166             /* Update vectorial force */
1167             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
1168             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1169             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1170             
1171             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1172             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1173             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1174
1175             /**************************
1176              * CALCULATE INTERACTIONS *
1177              **************************/
1178
1179             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
1180
1181             /* EWALD ELECTROSTATICS */
1182
1183             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1184             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
1185             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1186             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1187             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1188
1189             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1190             ewtabD           = _fjsp_setzero_v2r8();
1191             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1192             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1193             ewtabFn          = _fjsp_setzero_v2r8();
1194             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1195             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1196             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1197             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
1198             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1199
1200             /* Update potential sum for this i atom from the interaction with this j atom. */
1201             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1202             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1203
1204             fscal            = felec;
1205
1206             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1207
1208             /* Update vectorial force */
1209             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
1210             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1211             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1212             
1213             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1214             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1215             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1216
1217             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1218
1219             /* Inner loop uses 449 flops */
1220         }
1221
1222         /* End of innermost loop */
1223
1224         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1225                                               f+i_coord_offset,fshift+i_shift_offset);
1226
1227         ggid                        = gid[iidx];
1228         /* Update potential energies */
1229         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1230         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1231
1232         /* Increment number of inner iterations */
1233         inneriter                  += j_index_end - j_index_start;
1234
1235         /* Outer loop uses 26 flops */
1236     }
1237
1238     /* Increment number of outer iterations */
1239     outeriter        += nri;
1240
1241     /* Update outer/inner flops */
1242
1243     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*449);
1244 }
1245 /*
1246  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double
1247  * Electrostatics interaction: Ewald
1248  * VdW interaction:            LJEwald
1249  * Geometry:                   Water4-Water4
1250  * Calculate force/pot:        Force
1251  */
1252 void
1253 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double
1254                     (t_nblist                    * gmx_restrict       nlist,
1255                      rvec                        * gmx_restrict          xx,
1256                      rvec                        * gmx_restrict          ff,
1257                      t_forcerec                  * gmx_restrict          fr,
1258                      t_mdatoms                   * gmx_restrict     mdatoms,
1259                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1260                      t_nrnb                      * gmx_restrict        nrnb)
1261 {
1262     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1263      * just 0 for non-waters.
1264      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1265      * jnr indices corresponding to data put in the four positions in the SIMD register.
1266      */
1267     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1268     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1269     int              jnrA,jnrB;
1270     int              j_coord_offsetA,j_coord_offsetB;
1271     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1272     real             rcutoff_scalar;
1273     real             *shiftvec,*fshift,*x,*f;
1274     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1275     int              vdwioffset0;
1276     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1277     int              vdwioffset1;
1278     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1279     int              vdwioffset2;
1280     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1281     int              vdwioffset3;
1282     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1283     int              vdwjidx0A,vdwjidx0B;
1284     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1285     int              vdwjidx1A,vdwjidx1B;
1286     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1287     int              vdwjidx2A,vdwjidx2B;
1288     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1289     int              vdwjidx3A,vdwjidx3B;
1290     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1291     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1292     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1293     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1294     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1295     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1296     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1297     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1298     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1299     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1300     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1301     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
1302     real             *charge;
1303     int              nvdwtype;
1304     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1305     int              *vdwtype;
1306     real             *vdwparam;
1307     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
1308     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1309     _fjsp_v2r8           c6grid_00;
1310     _fjsp_v2r8           c6grid_11;
1311     _fjsp_v2r8           c6grid_12;
1312     _fjsp_v2r8           c6grid_13;
1313     _fjsp_v2r8           c6grid_21;
1314     _fjsp_v2r8           c6grid_22;
1315     _fjsp_v2r8           c6grid_23;
1316     _fjsp_v2r8           c6grid_31;
1317     _fjsp_v2r8           c6grid_32;
1318     _fjsp_v2r8           c6grid_33;
1319     real                 *vdwgridparam;
1320     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1321     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
1322     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
1323     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1324     real             *ewtab;
1325     _fjsp_v2r8       itab_tmp;
1326     _fjsp_v2r8       dummy_mask,cutoff_mask;
1327     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
1328     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
1329     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1330
1331     x                = xx[0];
1332     f                = ff[0];
1333
1334     nri              = nlist->nri;
1335     iinr             = nlist->iinr;
1336     jindex           = nlist->jindex;
1337     jjnr             = nlist->jjnr;
1338     shiftidx         = nlist->shift;
1339     gid              = nlist->gid;
1340     shiftvec         = fr->shift_vec[0];
1341     fshift           = fr->fshift[0];
1342     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
1343     charge           = mdatoms->chargeA;
1344     nvdwtype         = fr->ntype;
1345     vdwparam         = fr->nbfp;
1346     vdwtype          = mdatoms->typeA;
1347     vdwgridparam     = fr->ljpme_c6grid;
1348     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
1349     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
1350     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
1351
1352     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1353     ewtab            = fr->ic->tabq_coul_F;
1354     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1355     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1356
1357     /* Setup water-specific parameters */
1358     inr              = nlist->iinr[0];
1359     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1360     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1361     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
1362     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1363
1364     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
1365     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
1366     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
1367     vdwjidx0A        = 2*vdwtype[inr+0];
1368     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1369     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1370     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
1371     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1372     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1373     qq13             = _fjsp_mul_v2r8(iq1,jq3);
1374     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1375     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1376     qq23             = _fjsp_mul_v2r8(iq2,jq3);
1377     qq31             = _fjsp_mul_v2r8(iq3,jq1);
1378     qq32             = _fjsp_mul_v2r8(iq3,jq2);
1379     qq33             = _fjsp_mul_v2r8(iq3,jq3);
1380
1381     /* Avoid stupid compiler warnings */
1382     jnrA = jnrB = 0;
1383     j_coord_offsetA = 0;
1384     j_coord_offsetB = 0;
1385
1386     outeriter        = 0;
1387     inneriter        = 0;
1388
1389     /* Start outer loop over neighborlists */
1390     for(iidx=0; iidx<nri; iidx++)
1391     {
1392         /* Load shift vector for this list */
1393         i_shift_offset   = DIM*shiftidx[iidx];
1394
1395         /* Load limits for loop over neighbors */
1396         j_index_start    = jindex[iidx];
1397         j_index_end      = jindex[iidx+1];
1398
1399         /* Get outer coordinate index */
1400         inr              = iinr[iidx];
1401         i_coord_offset   = DIM*inr;
1402
1403         /* Load i particle coords and add shift vector */
1404         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1405                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1406
1407         fix0             = _fjsp_setzero_v2r8();
1408         fiy0             = _fjsp_setzero_v2r8();
1409         fiz0             = _fjsp_setzero_v2r8();
1410         fix1             = _fjsp_setzero_v2r8();
1411         fiy1             = _fjsp_setzero_v2r8();
1412         fiz1             = _fjsp_setzero_v2r8();
1413         fix2             = _fjsp_setzero_v2r8();
1414         fiy2             = _fjsp_setzero_v2r8();
1415         fiz2             = _fjsp_setzero_v2r8();
1416         fix3             = _fjsp_setzero_v2r8();
1417         fiy3             = _fjsp_setzero_v2r8();
1418         fiz3             = _fjsp_setzero_v2r8();
1419
1420         /* Start inner kernel loop */
1421         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1422         {
1423
1424             /* Get j neighbor index, and coordinate index */
1425             jnrA             = jjnr[jidx];
1426             jnrB             = jjnr[jidx+1];
1427             j_coord_offsetA  = DIM*jnrA;
1428             j_coord_offsetB  = DIM*jnrB;
1429
1430             /* load j atom coordinates */
1431             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1432                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1433                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1434
1435             /* Calculate displacement vector */
1436             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1437             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1438             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1439             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1440             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1441             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1442             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1443             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1444             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1445             dx13             = _fjsp_sub_v2r8(ix1,jx3);
1446             dy13             = _fjsp_sub_v2r8(iy1,jy3);
1447             dz13             = _fjsp_sub_v2r8(iz1,jz3);
1448             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1449             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1450             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1451             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1452             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1453             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1454             dx23             = _fjsp_sub_v2r8(ix2,jx3);
1455             dy23             = _fjsp_sub_v2r8(iy2,jy3);
1456             dz23             = _fjsp_sub_v2r8(iz2,jz3);
1457             dx31             = _fjsp_sub_v2r8(ix3,jx1);
1458             dy31             = _fjsp_sub_v2r8(iy3,jy1);
1459             dz31             = _fjsp_sub_v2r8(iz3,jz1);
1460             dx32             = _fjsp_sub_v2r8(ix3,jx2);
1461             dy32             = _fjsp_sub_v2r8(iy3,jy2);
1462             dz32             = _fjsp_sub_v2r8(iz3,jz2);
1463             dx33             = _fjsp_sub_v2r8(ix3,jx3);
1464             dy33             = _fjsp_sub_v2r8(iy3,jy3);
1465             dz33             = _fjsp_sub_v2r8(iz3,jz3);
1466
1467             /* Calculate squared distance and things based on it */
1468             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1469             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1470             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1471             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1472             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1473             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1474             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1475             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1476             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1477             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1478
1479             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1480             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1481             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1482             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
1483             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1484             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1485             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
1486             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
1487             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
1488             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
1489
1490             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1491             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1492             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1493             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
1494             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1495             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1496             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
1497             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
1498             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
1499             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
1500
1501             fjx0             = _fjsp_setzero_v2r8();
1502             fjy0             = _fjsp_setzero_v2r8();
1503             fjz0             = _fjsp_setzero_v2r8();
1504             fjx1             = _fjsp_setzero_v2r8();
1505             fjy1             = _fjsp_setzero_v2r8();
1506             fjz1             = _fjsp_setzero_v2r8();
1507             fjx2             = _fjsp_setzero_v2r8();
1508             fjy2             = _fjsp_setzero_v2r8();
1509             fjz2             = _fjsp_setzero_v2r8();
1510             fjx3             = _fjsp_setzero_v2r8();
1511             fjy3             = _fjsp_setzero_v2r8();
1512             fjz3             = _fjsp_setzero_v2r8();
1513
1514             /**************************
1515              * CALCULATE INTERACTIONS *
1516              **************************/
1517
1518             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1519
1520             /* Analytical LJ-PME */
1521             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1522             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
1523             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
1524             exponent         = gmx_simd_exp_d(-ewcljrsq);
1525             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1526             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
1527             /* f6A = 6 * C6grid * (1 - poly) */
1528             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
1529             /* f6B = C6grid * exponent * beta^6 */
1530             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
1531             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1532             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1533
1534             fscal            = fvdw;
1535
1536             /* Update vectorial force */
1537             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1538             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1539             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1540             
1541             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1542             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1543             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1544
1545             /**************************
1546              * CALCULATE INTERACTIONS *
1547              **************************/
1548
1549             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1550
1551             /* EWALD ELECTROSTATICS */
1552
1553             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1554             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1555             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1556             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1557             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1558
1559             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1560                                          &ewtabF,&ewtabFn);
1561             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1562             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1563
1564             fscal            = felec;
1565
1566             /* Update vectorial force */
1567             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1568             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1569             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1570             
1571             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1572             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1573             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1580
1581             /* EWALD ELECTROSTATICS */
1582
1583             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1584             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1585             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1586             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1587             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1588
1589             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1590                                          &ewtabF,&ewtabFn);
1591             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1592             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1593
1594             fscal            = felec;
1595
1596             /* Update vectorial force */
1597             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1598             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1599             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1600             
1601             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1602             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1603             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1604
1605             /**************************
1606              * CALCULATE INTERACTIONS *
1607              **************************/
1608
1609             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
1610
1611             /* EWALD ELECTROSTATICS */
1612
1613             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1614             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
1615             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1616             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1617             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1618
1619             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1620                                          &ewtabF,&ewtabFn);
1621             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1622             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
1623
1624             fscal            = felec;
1625
1626             /* Update vectorial force */
1627             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
1628             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
1629             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
1630             
1631             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
1632             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
1633             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
1634
1635             /**************************
1636              * CALCULATE INTERACTIONS *
1637              **************************/
1638
1639             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1640
1641             /* EWALD ELECTROSTATICS */
1642
1643             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1644             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1645             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1646             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1647             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1648
1649             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1650                                          &ewtabF,&ewtabFn);
1651             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1652             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1653
1654             fscal            = felec;
1655
1656             /* Update vectorial force */
1657             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1658             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1659             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1660             
1661             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1662             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1663             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1664
1665             /**************************
1666              * CALCULATE INTERACTIONS *
1667              **************************/
1668
1669             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1670
1671             /* EWALD ELECTROSTATICS */
1672
1673             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1674             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1675             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1676             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1677             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1678
1679             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1680                                          &ewtabF,&ewtabFn);
1681             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1682             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1683
1684             fscal            = felec;
1685
1686             /* Update vectorial force */
1687             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1688             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1689             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1690             
1691             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1692             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1693             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1694
1695             /**************************
1696              * CALCULATE INTERACTIONS *
1697              **************************/
1698
1699             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
1700
1701             /* EWALD ELECTROSTATICS */
1702
1703             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1704             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
1705             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1706             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1707             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1708
1709             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1710                                          &ewtabF,&ewtabFn);
1711             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1712             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1713
1714             fscal            = felec;
1715
1716             /* Update vectorial force */
1717             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
1718             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1719             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1720             
1721             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1722             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1723             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1724
1725             /**************************
1726              * CALCULATE INTERACTIONS *
1727              **************************/
1728
1729             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
1730
1731             /* EWALD ELECTROSTATICS */
1732
1733             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1734             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
1735             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1736             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1737             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1738
1739             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1740                                          &ewtabF,&ewtabFn);
1741             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1742             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1743
1744             fscal            = felec;
1745
1746             /* Update vectorial force */
1747             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
1748             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1749             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1750             
1751             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1752             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1753             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1754
1755             /**************************
1756              * CALCULATE INTERACTIONS *
1757              **************************/
1758
1759             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
1760
1761             /* EWALD ELECTROSTATICS */
1762
1763             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1764             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
1765             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1766             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1767             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1768
1769             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1770                                          &ewtabF,&ewtabFn);
1771             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1772             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1773
1774             fscal            = felec;
1775
1776             /* Update vectorial force */
1777             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
1778             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1779             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1780             
1781             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1782             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1783             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1784
1785             /**************************
1786              * CALCULATE INTERACTIONS *
1787              **************************/
1788
1789             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
1790
1791             /* EWALD ELECTROSTATICS */
1792
1793             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1794             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
1795             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1796             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1797             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1798
1799             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1800                                          &ewtabF,&ewtabFn);
1801             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1802             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1803
1804             fscal            = felec;
1805
1806             /* Update vectorial force */
1807             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
1808             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1809             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1810             
1811             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1812             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1813             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1814
1815             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1816
1817             /* Inner loop uses 402 flops */
1818         }
1819
1820         if(jidx<j_index_end)
1821         {
1822
1823             jnrA             = jjnr[jidx];
1824             j_coord_offsetA  = DIM*jnrA;
1825
1826             /* load j atom coordinates */
1827             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
1828                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1829                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1830
1831             /* Calculate displacement vector */
1832             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1833             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1834             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1835             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1836             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1837             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1838             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1839             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1840             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1841             dx13             = _fjsp_sub_v2r8(ix1,jx3);
1842             dy13             = _fjsp_sub_v2r8(iy1,jy3);
1843             dz13             = _fjsp_sub_v2r8(iz1,jz3);
1844             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1845             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1846             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1847             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1848             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1849             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1850             dx23             = _fjsp_sub_v2r8(ix2,jx3);
1851             dy23             = _fjsp_sub_v2r8(iy2,jy3);
1852             dz23             = _fjsp_sub_v2r8(iz2,jz3);
1853             dx31             = _fjsp_sub_v2r8(ix3,jx1);
1854             dy31             = _fjsp_sub_v2r8(iy3,jy1);
1855             dz31             = _fjsp_sub_v2r8(iz3,jz1);
1856             dx32             = _fjsp_sub_v2r8(ix3,jx2);
1857             dy32             = _fjsp_sub_v2r8(iy3,jy2);
1858             dz32             = _fjsp_sub_v2r8(iz3,jz2);
1859             dx33             = _fjsp_sub_v2r8(ix3,jx3);
1860             dy33             = _fjsp_sub_v2r8(iy3,jy3);
1861             dz33             = _fjsp_sub_v2r8(iz3,jz3);
1862
1863             /* Calculate squared distance and things based on it */
1864             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1865             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1866             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1867             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1868             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1869             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1870             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1871             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1872             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1873             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1874
1875             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1876             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1877             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1878             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
1879             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1880             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1881             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
1882             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
1883             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
1884             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
1885
1886             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1887             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1888             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1889             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
1890             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1891             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1892             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
1893             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
1894             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
1895             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
1896
1897             fjx0             = _fjsp_setzero_v2r8();
1898             fjy0             = _fjsp_setzero_v2r8();
1899             fjz0             = _fjsp_setzero_v2r8();
1900             fjx1             = _fjsp_setzero_v2r8();
1901             fjy1             = _fjsp_setzero_v2r8();
1902             fjz1             = _fjsp_setzero_v2r8();
1903             fjx2             = _fjsp_setzero_v2r8();
1904             fjy2             = _fjsp_setzero_v2r8();
1905             fjz2             = _fjsp_setzero_v2r8();
1906             fjx3             = _fjsp_setzero_v2r8();
1907             fjy3             = _fjsp_setzero_v2r8();
1908             fjz3             = _fjsp_setzero_v2r8();
1909
1910             /**************************
1911              * CALCULATE INTERACTIONS *
1912              **************************/
1913
1914             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1915
1916             /* Analytical LJ-PME */
1917             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1918             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
1919             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
1920             exponent         = gmx_simd_exp_d(-ewcljrsq);
1921             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1922             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
1923             /* f6A = 6 * C6grid * (1 - poly) */
1924             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
1925             /* f6B = C6grid * exponent * beta^6 */
1926             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
1927             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1928             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1929
1930             fscal            = fvdw;
1931
1932             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1933
1934             /* Update vectorial force */
1935             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1936             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1937             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1938             
1939             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1940             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1941             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1942
1943             /**************************
1944              * CALCULATE INTERACTIONS *
1945              **************************/
1946
1947             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1948
1949             /* EWALD ELECTROSTATICS */
1950
1951             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1952             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1953             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1954             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1955             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1956
1957             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1958             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1959             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1960
1961             fscal            = felec;
1962
1963             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1964
1965             /* Update vectorial force */
1966             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1967             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1968             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1969             
1970             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1971             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1972             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1973
1974             /**************************
1975              * CALCULATE INTERACTIONS *
1976              **************************/
1977
1978             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1979
1980             /* EWALD ELECTROSTATICS */
1981
1982             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1983             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1984             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1985             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1986             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1987
1988             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1989             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1990             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1991
1992             fscal            = felec;
1993
1994             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1995
1996             /* Update vectorial force */
1997             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1998             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1999             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2000             
2001             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2002             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2003             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2004
2005             /**************************
2006              * CALCULATE INTERACTIONS *
2007              **************************/
2008
2009             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
2010
2011             /* EWALD ELECTROSTATICS */
2012
2013             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2014             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
2015             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2016             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2017             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2018
2019             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2020             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2021             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
2022
2023             fscal            = felec;
2024
2025             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2026
2027             /* Update vectorial force */
2028             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
2029             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
2030             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
2031             
2032             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
2033             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
2034             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
2035
2036             /**************************
2037              * CALCULATE INTERACTIONS *
2038              **************************/
2039
2040             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2041
2042             /* EWALD ELECTROSTATICS */
2043
2044             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2045             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2046             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2047             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2048             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2049
2050             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2051             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2052             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2053
2054             fscal            = felec;
2055
2056             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2057
2058             /* Update vectorial force */
2059             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2060             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2061             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2062             
2063             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2064             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2065             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2066
2067             /**************************
2068              * CALCULATE INTERACTIONS *
2069              **************************/
2070
2071             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2072
2073             /* EWALD ELECTROSTATICS */
2074
2075             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2076             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2077             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2078             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2079             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2080
2081             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2082             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2083             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2084
2085             fscal            = felec;
2086
2087             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2088
2089             /* Update vectorial force */
2090             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2091             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2092             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2093             
2094             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2095             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2096             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2097
2098             /**************************
2099              * CALCULATE INTERACTIONS *
2100              **************************/
2101
2102             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
2103
2104             /* EWALD ELECTROSTATICS */
2105
2106             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2107             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
2108             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2109             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2110             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2111
2112             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2113             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2114             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
2115
2116             fscal            = felec;
2117
2118             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2119
2120             /* Update vectorial force */
2121             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
2122             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
2123             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
2124             
2125             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
2126             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
2127             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
2128
2129             /**************************
2130              * CALCULATE INTERACTIONS *
2131              **************************/
2132
2133             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
2134
2135             /* EWALD ELECTROSTATICS */
2136
2137             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2138             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
2139             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2140             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2141             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2142
2143             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2144             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2145             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
2146
2147             fscal            = felec;
2148
2149             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2150
2151             /* Update vectorial force */
2152             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
2153             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
2154             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
2155             
2156             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
2157             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
2158             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
2159
2160             /**************************
2161              * CALCULATE INTERACTIONS *
2162              **************************/
2163
2164             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
2165
2166             /* EWALD ELECTROSTATICS */
2167
2168             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2169             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
2170             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2171             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2172             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2173
2174             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2175             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2176             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
2177
2178             fscal            = felec;
2179
2180             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2181
2182             /* Update vectorial force */
2183             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
2184             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
2185             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
2186             
2187             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
2188             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
2189             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
2190
2191             /**************************
2192              * CALCULATE INTERACTIONS *
2193              **************************/
2194
2195             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
2196
2197             /* EWALD ELECTROSTATICS */
2198
2199             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2200             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
2201             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2202             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2203             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2204
2205             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2206             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2207             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
2208
2209             fscal            = felec;
2210
2211             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2212
2213             /* Update vectorial force */
2214             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
2215             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
2216             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
2217             
2218             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
2219             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
2220             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
2221
2222             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2223
2224             /* Inner loop uses 402 flops */
2225         }
2226
2227         /* End of innermost loop */
2228
2229         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2230                                               f+i_coord_offset,fshift+i_shift_offset);
2231
2232         /* Increment number of inner iterations */
2233         inneriter                  += j_index_end - j_index_start;
2234
2235         /* Outer loop uses 24 flops */
2236     }
2237
2238     /* Increment number of outer iterations */
2239     outeriter        += nri;
2240
2241     /* Update outer/inner flops */
2242
2243     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*402);
2244 }