Remove no-inline-max-size and suppress remark
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/legacyheaders/vec.h"
47 #include "nrnb.h"
48
49 #include "kernelutil_sparc64_hpc_ace_double.h"
50
51 /*
52  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sparc64_hpc_ace_double
53  * Electrostatics interaction: Ewald
54  * VdW interaction:            LJEwald
55  * Geometry:                   Water3-Water3
56  * Calculate force/pot:        PotentialAndForce
57  */
58 void
59 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sparc64_hpc_ace_double
60                     (t_nblist                    * gmx_restrict       nlist,
61                      rvec                        * gmx_restrict          xx,
62                      rvec                        * gmx_restrict          ff,
63                      t_forcerec                  * gmx_restrict          fr,
64                      t_mdatoms                   * gmx_restrict     mdatoms,
65                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66                      t_nrnb                      * gmx_restrict        nrnb)
67 {
68     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69      * just 0 for non-waters.
70      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
71      * jnr indices corresponding to data put in the two positions of the SIMD register.
72      */
73     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
74     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75     int              jnrA,jnrB;
76     int              j_coord_offsetA,j_coord_offsetB;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81     int              vdwioffset0;
82     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83     int              vdwioffset1;
84     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     int              vdwioffset2;
86     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     int              vdwjidx0A,vdwjidx0B;
88     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     int              vdwjidx1A,vdwjidx1B;
90     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B;
92     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
95     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
96     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
100     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
103     real             *charge;
104     int              nvdwtype;
105     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
106     int              *vdwtype;
107     real             *vdwparam;
108     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
109     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
110     _fjsp_v2r8           c6grid_00;
111     _fjsp_v2r8           c6grid_01;
112     _fjsp_v2r8           c6grid_02;
113     _fjsp_v2r8           c6grid_10;
114     _fjsp_v2r8           c6grid_11;
115     _fjsp_v2r8           c6grid_12;
116     _fjsp_v2r8           c6grid_20;
117     _fjsp_v2r8           c6grid_21;
118     _fjsp_v2r8           c6grid_22;
119     real                 *vdwgridparam;
120     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
121     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
122     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
123     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
124     real             *ewtab;
125     _fjsp_v2r8       itab_tmp;
126     _fjsp_v2r8       dummy_mask,cutoff_mask;
127     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
128     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
129     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
130
131     x                = xx[0];
132     f                = ff[0];
133
134     nri              = nlist->nri;
135     iinr             = nlist->iinr;
136     jindex           = nlist->jindex;
137     jjnr             = nlist->jjnr;
138     shiftidx         = nlist->shift;
139     gid              = nlist->gid;
140     shiftvec         = fr->shift_vec[0];
141     fshift           = fr->fshift[0];
142     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
143     charge           = mdatoms->chargeA;
144     nvdwtype         = fr->ntype;
145     vdwparam         = fr->nbfp;
146     vdwtype          = mdatoms->typeA;
147     vdwgridparam     = fr->ljpme_c6grid;
148     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
149     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
150     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
151
152     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
153     ewtab            = fr->ic->tabq_coul_FDV0;
154     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
155     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
156
157     /* Setup water-specific parameters */
158     inr              = nlist->iinr[0];
159     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
160     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
161     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
162     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
163
164     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
165     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
166     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
167     vdwjidx0A        = 2*vdwtype[inr+0];
168     qq00             = _fjsp_mul_v2r8(iq0,jq0);
169     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
170     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
171     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
172     qq01             = _fjsp_mul_v2r8(iq0,jq1);
173     qq02             = _fjsp_mul_v2r8(iq0,jq2);
174     qq10             = _fjsp_mul_v2r8(iq1,jq0);
175     qq11             = _fjsp_mul_v2r8(iq1,jq1);
176     qq12             = _fjsp_mul_v2r8(iq1,jq2);
177     qq20             = _fjsp_mul_v2r8(iq2,jq0);
178     qq21             = _fjsp_mul_v2r8(iq2,jq1);
179     qq22             = _fjsp_mul_v2r8(iq2,jq2);
180
181     /* Avoid stupid compiler warnings */
182     jnrA = jnrB = 0;
183     j_coord_offsetA = 0;
184     j_coord_offsetB = 0;
185
186     outeriter        = 0;
187     inneriter        = 0;
188
189     /* Start outer loop over neighborlists */
190     for(iidx=0; iidx<nri; iidx++)
191     {
192         /* Load shift vector for this list */
193         i_shift_offset   = DIM*shiftidx[iidx];
194
195         /* Load limits for loop over neighbors */
196         j_index_start    = jindex[iidx];
197         j_index_end      = jindex[iidx+1];
198
199         /* Get outer coordinate index */
200         inr              = iinr[iidx];
201         i_coord_offset   = DIM*inr;
202
203         /* Load i particle coords and add shift vector */
204         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
205                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206
207         fix0             = _fjsp_setzero_v2r8();
208         fiy0             = _fjsp_setzero_v2r8();
209         fiz0             = _fjsp_setzero_v2r8();
210         fix1             = _fjsp_setzero_v2r8();
211         fiy1             = _fjsp_setzero_v2r8();
212         fiz1             = _fjsp_setzero_v2r8();
213         fix2             = _fjsp_setzero_v2r8();
214         fiy2             = _fjsp_setzero_v2r8();
215         fiz2             = _fjsp_setzero_v2r8();
216
217         /* Reset potential sums */
218         velecsum         = _fjsp_setzero_v2r8();
219         vvdwsum          = _fjsp_setzero_v2r8();
220
221         /* Start inner kernel loop */
222         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
223         {
224
225             /* Get j neighbor index, and coordinate index */
226             jnrA             = jjnr[jidx];
227             jnrB             = jjnr[jidx+1];
228             j_coord_offsetA  = DIM*jnrA;
229             j_coord_offsetB  = DIM*jnrB;
230
231             /* load j atom coordinates */
232             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
233                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
234
235             /* Calculate displacement vector */
236             dx00             = _fjsp_sub_v2r8(ix0,jx0);
237             dy00             = _fjsp_sub_v2r8(iy0,jy0);
238             dz00             = _fjsp_sub_v2r8(iz0,jz0);
239             dx01             = _fjsp_sub_v2r8(ix0,jx1);
240             dy01             = _fjsp_sub_v2r8(iy0,jy1);
241             dz01             = _fjsp_sub_v2r8(iz0,jz1);
242             dx02             = _fjsp_sub_v2r8(ix0,jx2);
243             dy02             = _fjsp_sub_v2r8(iy0,jy2);
244             dz02             = _fjsp_sub_v2r8(iz0,jz2);
245             dx10             = _fjsp_sub_v2r8(ix1,jx0);
246             dy10             = _fjsp_sub_v2r8(iy1,jy0);
247             dz10             = _fjsp_sub_v2r8(iz1,jz0);
248             dx11             = _fjsp_sub_v2r8(ix1,jx1);
249             dy11             = _fjsp_sub_v2r8(iy1,jy1);
250             dz11             = _fjsp_sub_v2r8(iz1,jz1);
251             dx12             = _fjsp_sub_v2r8(ix1,jx2);
252             dy12             = _fjsp_sub_v2r8(iy1,jy2);
253             dz12             = _fjsp_sub_v2r8(iz1,jz2);
254             dx20             = _fjsp_sub_v2r8(ix2,jx0);
255             dy20             = _fjsp_sub_v2r8(iy2,jy0);
256             dz20             = _fjsp_sub_v2r8(iz2,jz0);
257             dx21             = _fjsp_sub_v2r8(ix2,jx1);
258             dy21             = _fjsp_sub_v2r8(iy2,jy1);
259             dz21             = _fjsp_sub_v2r8(iz2,jz1);
260             dx22             = _fjsp_sub_v2r8(ix2,jx2);
261             dy22             = _fjsp_sub_v2r8(iy2,jy2);
262             dz22             = _fjsp_sub_v2r8(iz2,jz2);
263
264             /* Calculate squared distance and things based on it */
265             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
266             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
267             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
268             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
269             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
270             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
271             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
272             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
273             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
274
275             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
276             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
277             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
278             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
279             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
280             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
281             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
282             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
283             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
284
285             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
286             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
287             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
288             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
289             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
290             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
291             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
292             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
293             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
294
295             fjx0             = _fjsp_setzero_v2r8();
296             fjy0             = _fjsp_setzero_v2r8();
297             fjz0             = _fjsp_setzero_v2r8();
298             fjx1             = _fjsp_setzero_v2r8();
299             fjy1             = _fjsp_setzero_v2r8();
300             fjz1             = _fjsp_setzero_v2r8();
301             fjx2             = _fjsp_setzero_v2r8();
302             fjy2             = _fjsp_setzero_v2r8();
303             fjz2             = _fjsp_setzero_v2r8();
304
305             /**************************
306              * CALCULATE INTERACTIONS *
307              **************************/
308
309             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
310
311             /* EWALD ELECTROSTATICS */
312
313             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
314             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
315             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
316             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
317             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
318
319             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
320             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
321             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
322             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
323             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
324             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
325             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
326             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
327             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
328             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
329
330             /* Analytical LJ-PME */
331             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
332             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
333             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
334             exponent         = gmx_simd_exp_d(ewcljrsq);
335             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
336             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
337             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
338             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(c6grid_00,_fjsp_sub_v2r8(poly,one),c6_00),rinvsix);
339             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
340             vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
341             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
342             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
343
344             /* Update potential sum for this i atom from the interaction with this j atom. */
345             velecsum         = _fjsp_add_v2r8(velecsum,velec);
346             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
347
348             fscal            = _fjsp_add_v2r8(felec,fvdw);
349
350             /* Update vectorial force */
351             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
352             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
353             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
354             
355             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
356             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
357             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
358
359             /**************************
360              * CALCULATE INTERACTIONS *
361              **************************/
362
363             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
364
365             /* EWALD ELECTROSTATICS */
366
367             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
368             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
369             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
370             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
371             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
372
373             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
374             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
375             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
376             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
377             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
378             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
379             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
380             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
381             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
382             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
383
384             /* Update potential sum for this i atom from the interaction with this j atom. */
385             velecsum         = _fjsp_add_v2r8(velecsum,velec);
386
387             fscal            = felec;
388
389             /* Update vectorial force */
390             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
391             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
392             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
393             
394             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
395             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
396             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
397
398             /**************************
399              * CALCULATE INTERACTIONS *
400              **************************/
401
402             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
403
404             /* EWALD ELECTROSTATICS */
405
406             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
407             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
408             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
409             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
410             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
411
412             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
413             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
414             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
415             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
416             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
417             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
418             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
419             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
420             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
421             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
422
423             /* Update potential sum for this i atom from the interaction with this j atom. */
424             velecsum         = _fjsp_add_v2r8(velecsum,velec);
425
426             fscal            = felec;
427
428             /* Update vectorial force */
429             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
430             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
431             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
432             
433             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
434             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
435             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
442
443             /* EWALD ELECTROSTATICS */
444
445             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
446             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
447             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
448             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
449             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
450
451             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
452             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
453             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
454             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
455             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
456             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
457             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
458             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
459             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
460             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
461
462             /* Update potential sum for this i atom from the interaction with this j atom. */
463             velecsum         = _fjsp_add_v2r8(velecsum,velec);
464
465             fscal            = felec;
466
467             /* Update vectorial force */
468             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
469             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
470             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
471             
472             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
473             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
474             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
475
476             /**************************
477              * CALCULATE INTERACTIONS *
478              **************************/
479
480             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
481
482             /* EWALD ELECTROSTATICS */
483
484             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
485             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
486             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
487             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
488             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
489
490             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
491             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
492             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
493             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
494             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
495             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
496             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
497             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
498             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
499             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
500
501             /* Update potential sum for this i atom from the interaction with this j atom. */
502             velecsum         = _fjsp_add_v2r8(velecsum,velec);
503
504             fscal            = felec;
505
506             /* Update vectorial force */
507             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
508             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
509             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
510             
511             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
512             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
513             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
514
515             /**************************
516              * CALCULATE INTERACTIONS *
517              **************************/
518
519             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
520
521             /* EWALD ELECTROSTATICS */
522
523             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
524             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
525             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
526             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
527             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
528
529             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
530             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
531             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
532             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
533             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
534             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
535             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
536             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
537             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
538             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
539
540             /* Update potential sum for this i atom from the interaction with this j atom. */
541             velecsum         = _fjsp_add_v2r8(velecsum,velec);
542
543             fscal            = felec;
544
545             /* Update vectorial force */
546             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
547             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
548             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
549             
550             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
551             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
552             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
553
554             /**************************
555              * CALCULATE INTERACTIONS *
556              **************************/
557
558             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
559
560             /* EWALD ELECTROSTATICS */
561
562             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
564             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
565             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
566             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
567
568             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
569             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
570             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
571             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
572             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
573             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
574             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
575             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
576             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
577             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
578
579             /* Update potential sum for this i atom from the interaction with this j atom. */
580             velecsum         = _fjsp_add_v2r8(velecsum,velec);
581
582             fscal            = felec;
583
584             /* Update vectorial force */
585             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
586             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
587             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
588             
589             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
590             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
591             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
592
593             /**************************
594              * CALCULATE INTERACTIONS *
595              **************************/
596
597             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
598
599             /* EWALD ELECTROSTATICS */
600
601             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
602             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
603             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
604             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
605             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
606
607             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
608             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
609             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
610             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
611             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
612             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
613             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
614             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
615             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
616             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
617
618             /* Update potential sum for this i atom from the interaction with this j atom. */
619             velecsum         = _fjsp_add_v2r8(velecsum,velec);
620
621             fscal            = felec;
622
623             /* Update vectorial force */
624             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
625             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
626             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
627             
628             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
629             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
630             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
631
632             /**************************
633              * CALCULATE INTERACTIONS *
634              **************************/
635
636             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
637
638             /* EWALD ELECTROSTATICS */
639
640             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
641             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
642             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
643             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
644             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
645
646             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
647             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
648             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
649             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
650             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
651             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
652             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
653             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
654             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
655             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
656
657             /* Update potential sum for this i atom from the interaction with this j atom. */
658             velecsum         = _fjsp_add_v2r8(velecsum,velec);
659
660             fscal            = felec;
661
662             /* Update vectorial force */
663             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
664             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
665             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
666             
667             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
668             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
669             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
670
671             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
672
673             /* Inner loop uses 420 flops */
674         }
675
676         if(jidx<j_index_end)
677         {
678
679             jnrA             = jjnr[jidx];
680             j_coord_offsetA  = DIM*jnrA;
681
682             /* load j atom coordinates */
683             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
684                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
685
686             /* Calculate displacement vector */
687             dx00             = _fjsp_sub_v2r8(ix0,jx0);
688             dy00             = _fjsp_sub_v2r8(iy0,jy0);
689             dz00             = _fjsp_sub_v2r8(iz0,jz0);
690             dx01             = _fjsp_sub_v2r8(ix0,jx1);
691             dy01             = _fjsp_sub_v2r8(iy0,jy1);
692             dz01             = _fjsp_sub_v2r8(iz0,jz1);
693             dx02             = _fjsp_sub_v2r8(ix0,jx2);
694             dy02             = _fjsp_sub_v2r8(iy0,jy2);
695             dz02             = _fjsp_sub_v2r8(iz0,jz2);
696             dx10             = _fjsp_sub_v2r8(ix1,jx0);
697             dy10             = _fjsp_sub_v2r8(iy1,jy0);
698             dz10             = _fjsp_sub_v2r8(iz1,jz0);
699             dx11             = _fjsp_sub_v2r8(ix1,jx1);
700             dy11             = _fjsp_sub_v2r8(iy1,jy1);
701             dz11             = _fjsp_sub_v2r8(iz1,jz1);
702             dx12             = _fjsp_sub_v2r8(ix1,jx2);
703             dy12             = _fjsp_sub_v2r8(iy1,jy2);
704             dz12             = _fjsp_sub_v2r8(iz1,jz2);
705             dx20             = _fjsp_sub_v2r8(ix2,jx0);
706             dy20             = _fjsp_sub_v2r8(iy2,jy0);
707             dz20             = _fjsp_sub_v2r8(iz2,jz0);
708             dx21             = _fjsp_sub_v2r8(ix2,jx1);
709             dy21             = _fjsp_sub_v2r8(iy2,jy1);
710             dz21             = _fjsp_sub_v2r8(iz2,jz1);
711             dx22             = _fjsp_sub_v2r8(ix2,jx2);
712             dy22             = _fjsp_sub_v2r8(iy2,jy2);
713             dz22             = _fjsp_sub_v2r8(iz2,jz2);
714
715             /* Calculate squared distance and things based on it */
716             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
717             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
718             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
719             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
720             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
721             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
722             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
723             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
724             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
725
726             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
727             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
728             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
729             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
730             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
731             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
732             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
733             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
734             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
735
736             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
737             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
738             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
739             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
740             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
741             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
742             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
743             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
744             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
745
746             fjx0             = _fjsp_setzero_v2r8();
747             fjy0             = _fjsp_setzero_v2r8();
748             fjz0             = _fjsp_setzero_v2r8();
749             fjx1             = _fjsp_setzero_v2r8();
750             fjy1             = _fjsp_setzero_v2r8();
751             fjz1             = _fjsp_setzero_v2r8();
752             fjx2             = _fjsp_setzero_v2r8();
753             fjy2             = _fjsp_setzero_v2r8();
754             fjz2             = _fjsp_setzero_v2r8();
755
756             /**************************
757              * CALCULATE INTERACTIONS *
758              **************************/
759
760             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
761
762             /* EWALD ELECTROSTATICS */
763
764             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
765             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
766             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
767             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
768             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
769
770             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
771             ewtabD           = _fjsp_setzero_v2r8();
772             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
773             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
774             ewtabFn          = _fjsp_setzero_v2r8();
775             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
776             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
777             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
778             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
779             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
780
781             /* Analytical LJ-PME */
782             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
783             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
784             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
785             exponent         = gmx_simd_exp_d(ewcljrsq);
786             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
787             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
788             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
789             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(c6grid_00,_fjsp_sub_v2r8(poly,one),c6_00),rinvsix);
790             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
791             vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
792             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
793             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
794
795             /* Update potential sum for this i atom from the interaction with this j atom. */
796             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
797             velecsum         = _fjsp_add_v2r8(velecsum,velec);
798             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
799             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
800
801             fscal            = _fjsp_add_v2r8(felec,fvdw);
802
803             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
804
805             /* Update vectorial force */
806             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
807             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
808             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
809             
810             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
811             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
812             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
813
814             /**************************
815              * CALCULATE INTERACTIONS *
816              **************************/
817
818             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
819
820             /* EWALD ELECTROSTATICS */
821
822             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
823             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
824             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
825             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
826             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
827
828             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
829             ewtabD           = _fjsp_setzero_v2r8();
830             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
831             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
832             ewtabFn          = _fjsp_setzero_v2r8();
833             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
834             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
835             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
836             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
837             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
838
839             /* Update potential sum for this i atom from the interaction with this j atom. */
840             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
841             velecsum         = _fjsp_add_v2r8(velecsum,velec);
842
843             fscal            = felec;
844
845             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
846
847             /* Update vectorial force */
848             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
849             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
850             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
851             
852             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
853             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
854             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
861
862             /* EWALD ELECTROSTATICS */
863
864             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
865             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
866             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
867             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
868             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
869
870             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
871             ewtabD           = _fjsp_setzero_v2r8();
872             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
873             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
874             ewtabFn          = _fjsp_setzero_v2r8();
875             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
876             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
877             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
878             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
879             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
880
881             /* Update potential sum for this i atom from the interaction with this j atom. */
882             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
883             velecsum         = _fjsp_add_v2r8(velecsum,velec);
884
885             fscal            = felec;
886
887             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
888
889             /* Update vectorial force */
890             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
891             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
892             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
893             
894             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
895             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
896             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
897
898             /**************************
899              * CALCULATE INTERACTIONS *
900              **************************/
901
902             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
903
904             /* EWALD ELECTROSTATICS */
905
906             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
908             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
909             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
910             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
911
912             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
913             ewtabD           = _fjsp_setzero_v2r8();
914             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
915             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
916             ewtabFn          = _fjsp_setzero_v2r8();
917             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
918             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
919             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
920             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
921             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
922
923             /* Update potential sum for this i atom from the interaction with this j atom. */
924             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
925             velecsum         = _fjsp_add_v2r8(velecsum,velec);
926
927             fscal            = felec;
928
929             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
930
931             /* Update vectorial force */
932             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
933             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
934             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
935             
936             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
937             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
938             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
939
940             /**************************
941              * CALCULATE INTERACTIONS *
942              **************************/
943
944             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
945
946             /* EWALD ELECTROSTATICS */
947
948             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
949             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
950             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
951             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
952             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
953
954             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
955             ewtabD           = _fjsp_setzero_v2r8();
956             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
957             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
958             ewtabFn          = _fjsp_setzero_v2r8();
959             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
960             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
961             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
962             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
963             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
964
965             /* Update potential sum for this i atom from the interaction with this j atom. */
966             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
967             velecsum         = _fjsp_add_v2r8(velecsum,velec);
968
969             fscal            = felec;
970
971             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
972
973             /* Update vectorial force */
974             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
975             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
976             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
977             
978             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
979             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
980             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
981
982             /**************************
983              * CALCULATE INTERACTIONS *
984              **************************/
985
986             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
987
988             /* EWALD ELECTROSTATICS */
989
990             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
991             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
992             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
993             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
994             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
995
996             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
997             ewtabD           = _fjsp_setzero_v2r8();
998             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
999             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1000             ewtabFn          = _fjsp_setzero_v2r8();
1001             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1002             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1003             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1004             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
1005             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1006
1007             /* Update potential sum for this i atom from the interaction with this j atom. */
1008             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1009             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1010
1011             fscal            = felec;
1012
1013             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1014
1015             /* Update vectorial force */
1016             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1017             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1018             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1019             
1020             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1021             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1022             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1023
1024             /**************************
1025              * CALCULATE INTERACTIONS *
1026              **************************/
1027
1028             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
1029
1030             /* EWALD ELECTROSTATICS */
1031
1032             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1033             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
1034             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1035             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1036             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1037
1038             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1039             ewtabD           = _fjsp_setzero_v2r8();
1040             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1041             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1042             ewtabFn          = _fjsp_setzero_v2r8();
1043             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1044             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1045             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1046             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
1047             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1048
1049             /* Update potential sum for this i atom from the interaction with this j atom. */
1050             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1051             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1052
1053             fscal            = felec;
1054
1055             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1056
1057             /* Update vectorial force */
1058             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
1059             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1060             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1061             
1062             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1063             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1064             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1065
1066             /**************************
1067              * CALCULATE INTERACTIONS *
1068              **************************/
1069
1070             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1071
1072             /* EWALD ELECTROSTATICS */
1073
1074             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1075             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1076             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1077             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1078             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1079
1080             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1081             ewtabD           = _fjsp_setzero_v2r8();
1082             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1083             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1084             ewtabFn          = _fjsp_setzero_v2r8();
1085             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1086             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1087             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1088             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
1089             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1090
1091             /* Update potential sum for this i atom from the interaction with this j atom. */
1092             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1093             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1094
1095             fscal            = felec;
1096
1097             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1098
1099             /* Update vectorial force */
1100             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1101             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1102             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1103             
1104             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1105             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1106             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1107
1108             /**************************
1109              * CALCULATE INTERACTIONS *
1110              **************************/
1111
1112             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1113
1114             /* EWALD ELECTROSTATICS */
1115
1116             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1117             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1118             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1119             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1120             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1121
1122             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1123             ewtabD           = _fjsp_setzero_v2r8();
1124             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1125             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1126             ewtabFn          = _fjsp_setzero_v2r8();
1127             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1128             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1129             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1130             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1131             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1132
1133             /* Update potential sum for this i atom from the interaction with this j atom. */
1134             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1135             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1136
1137             fscal            = felec;
1138
1139             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1140
1141             /* Update vectorial force */
1142             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1143             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1144             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1145             
1146             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1147             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1148             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1149
1150             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1151
1152             /* Inner loop uses 420 flops */
1153         }
1154
1155         /* End of innermost loop */
1156
1157         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1158                                               f+i_coord_offset,fshift+i_shift_offset);
1159
1160         ggid                        = gid[iidx];
1161         /* Update potential energies */
1162         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1163         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1164
1165         /* Increment number of inner iterations */
1166         inneriter                  += j_index_end - j_index_start;
1167
1168         /* Outer loop uses 20 flops */
1169     }
1170
1171     /* Increment number of outer iterations */
1172     outeriter        += nri;
1173
1174     /* Update outer/inner flops */
1175
1176     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*420);
1177 }
1178 /*
1179  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sparc64_hpc_ace_double
1180  * Electrostatics interaction: Ewald
1181  * VdW interaction:            LJEwald
1182  * Geometry:                   Water3-Water3
1183  * Calculate force/pot:        Force
1184  */
1185 void
1186 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sparc64_hpc_ace_double
1187                     (t_nblist                    * gmx_restrict       nlist,
1188                      rvec                        * gmx_restrict          xx,
1189                      rvec                        * gmx_restrict          ff,
1190                      t_forcerec                  * gmx_restrict          fr,
1191                      t_mdatoms                   * gmx_restrict     mdatoms,
1192                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1193                      t_nrnb                      * gmx_restrict        nrnb)
1194 {
1195     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1196      * just 0 for non-waters.
1197      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1198      * jnr indices corresponding to data put in the two positions in the SIMD register.
1199      */
1200     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1201     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1202     int              jnrA,jnrB;
1203     int              j_coord_offsetA,j_coord_offsetB;
1204     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1205     real             rcutoff_scalar;
1206     real             *shiftvec,*fshift,*x,*f;
1207     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1208     int              vdwioffset0;
1209     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1210     int              vdwioffset1;
1211     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1212     int              vdwioffset2;
1213     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1214     int              vdwjidx0A,vdwjidx0B;
1215     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1216     int              vdwjidx1A,vdwjidx1B;
1217     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1218     int              vdwjidx2A,vdwjidx2B;
1219     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1220     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1221     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1222     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1223     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1224     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1225     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1226     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1227     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1228     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1229     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
1230     real             *charge;
1231     int              nvdwtype;
1232     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1233     int              *vdwtype;
1234     real             *vdwparam;
1235     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
1236     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1237     _fjsp_v2r8           c6grid_00;
1238     _fjsp_v2r8           c6grid_01;
1239     _fjsp_v2r8           c6grid_02;
1240     _fjsp_v2r8           c6grid_10;
1241     _fjsp_v2r8           c6grid_11;
1242     _fjsp_v2r8           c6grid_12;
1243     _fjsp_v2r8           c6grid_20;
1244     _fjsp_v2r8           c6grid_21;
1245     _fjsp_v2r8           c6grid_22;
1246     real                 *vdwgridparam;
1247     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1248     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
1249     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
1250     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1251     real             *ewtab;
1252     _fjsp_v2r8       itab_tmp;
1253     _fjsp_v2r8       dummy_mask,cutoff_mask;
1254     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
1255     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
1256     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1257
1258     x                = xx[0];
1259     f                = ff[0];
1260
1261     nri              = nlist->nri;
1262     iinr             = nlist->iinr;
1263     jindex           = nlist->jindex;
1264     jjnr             = nlist->jjnr;
1265     shiftidx         = nlist->shift;
1266     gid              = nlist->gid;
1267     shiftvec         = fr->shift_vec[0];
1268     fshift           = fr->fshift[0];
1269     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
1270     charge           = mdatoms->chargeA;
1271     nvdwtype         = fr->ntype;
1272     vdwparam         = fr->nbfp;
1273     vdwtype          = mdatoms->typeA;
1274     vdwgridparam     = fr->ljpme_c6grid;
1275     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
1276     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
1277     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
1278
1279     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1280     ewtab            = fr->ic->tabq_coul_F;
1281     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1282     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1283
1284     /* Setup water-specific parameters */
1285     inr              = nlist->iinr[0];
1286     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
1287     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1288     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1289     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1290
1291     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
1292     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
1293     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
1294     vdwjidx0A        = 2*vdwtype[inr+0];
1295     qq00             = _fjsp_mul_v2r8(iq0,jq0);
1296     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1297     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1298     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
1299     qq01             = _fjsp_mul_v2r8(iq0,jq1);
1300     qq02             = _fjsp_mul_v2r8(iq0,jq2);
1301     qq10             = _fjsp_mul_v2r8(iq1,jq0);
1302     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1303     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1304     qq20             = _fjsp_mul_v2r8(iq2,jq0);
1305     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1306     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1307
1308     /* Avoid stupid compiler warnings */
1309     jnrA = jnrB = 0;
1310     j_coord_offsetA = 0;
1311     j_coord_offsetB = 0;
1312
1313     outeriter        = 0;
1314     inneriter        = 0;
1315
1316     /* Start outer loop over neighborlists */
1317     for(iidx=0; iidx<nri; iidx++)
1318     {
1319         /* Load shift vector for this list */
1320         i_shift_offset   = DIM*shiftidx[iidx];
1321
1322         /* Load limits for loop over neighbors */
1323         j_index_start    = jindex[iidx];
1324         j_index_end      = jindex[iidx+1];
1325
1326         /* Get outer coordinate index */
1327         inr              = iinr[iidx];
1328         i_coord_offset   = DIM*inr;
1329
1330         /* Load i particle coords and add shift vector */
1331         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1332                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1333
1334         fix0             = _fjsp_setzero_v2r8();
1335         fiy0             = _fjsp_setzero_v2r8();
1336         fiz0             = _fjsp_setzero_v2r8();
1337         fix1             = _fjsp_setzero_v2r8();
1338         fiy1             = _fjsp_setzero_v2r8();
1339         fiz1             = _fjsp_setzero_v2r8();
1340         fix2             = _fjsp_setzero_v2r8();
1341         fiy2             = _fjsp_setzero_v2r8();
1342         fiz2             = _fjsp_setzero_v2r8();
1343
1344         /* Start inner kernel loop */
1345         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1346         {
1347
1348             /* Get j neighbor index, and coordinate index */
1349             jnrA             = jjnr[jidx];
1350             jnrB             = jjnr[jidx+1];
1351             j_coord_offsetA  = DIM*jnrA;
1352             j_coord_offsetB  = DIM*jnrB;
1353
1354             /* load j atom coordinates */
1355             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1356                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1357
1358             /* Calculate displacement vector */
1359             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1360             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1361             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1362             dx01             = _fjsp_sub_v2r8(ix0,jx1);
1363             dy01             = _fjsp_sub_v2r8(iy0,jy1);
1364             dz01             = _fjsp_sub_v2r8(iz0,jz1);
1365             dx02             = _fjsp_sub_v2r8(ix0,jx2);
1366             dy02             = _fjsp_sub_v2r8(iy0,jy2);
1367             dz02             = _fjsp_sub_v2r8(iz0,jz2);
1368             dx10             = _fjsp_sub_v2r8(ix1,jx0);
1369             dy10             = _fjsp_sub_v2r8(iy1,jy0);
1370             dz10             = _fjsp_sub_v2r8(iz1,jz0);
1371             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1372             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1373             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1374             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1375             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1376             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1377             dx20             = _fjsp_sub_v2r8(ix2,jx0);
1378             dy20             = _fjsp_sub_v2r8(iy2,jy0);
1379             dz20             = _fjsp_sub_v2r8(iz2,jz0);
1380             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1381             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1382             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1383             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1384             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1385             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1386
1387             /* Calculate squared distance and things based on it */
1388             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1389             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1390             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1391             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1392             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1393             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1394             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1395             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1396             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1397
1398             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1399             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
1400             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
1401             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
1402             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1403             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1404             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
1405             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1406             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1407
1408             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1409             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
1410             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
1411             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
1412             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1413             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1414             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
1415             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1416             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1417
1418             fjx0             = _fjsp_setzero_v2r8();
1419             fjy0             = _fjsp_setzero_v2r8();
1420             fjz0             = _fjsp_setzero_v2r8();
1421             fjx1             = _fjsp_setzero_v2r8();
1422             fjy1             = _fjsp_setzero_v2r8();
1423             fjz1             = _fjsp_setzero_v2r8();
1424             fjx2             = _fjsp_setzero_v2r8();
1425             fjy2             = _fjsp_setzero_v2r8();
1426             fjz2             = _fjsp_setzero_v2r8();
1427
1428             /**************************
1429              * CALCULATE INTERACTIONS *
1430              **************************/
1431
1432             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1433
1434             /* EWALD ELECTROSTATICS */
1435
1436             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1437             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
1438             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1439             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1440             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1441
1442             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1443                                          &ewtabF,&ewtabFn);
1444             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1445             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
1446
1447             /* Analytical LJ-PME */
1448             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1449             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
1450             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
1451             exponent         = gmx_simd_exp_d(ewcljrsq);
1452             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1453             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
1454             /* f6A = 6 * C6grid * (1 - poly) */
1455             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_sub_v2r8(one,poly));
1456             /* f6B = C6grid * exponent * beta^6 */
1457             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
1458             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1459             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1460
1461             fscal            = _fjsp_add_v2r8(felec,fvdw);
1462
1463             /* Update vectorial force */
1464             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1465             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1466             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1467             
1468             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1469             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1470             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1471
1472             /**************************
1473              * CALCULATE INTERACTIONS *
1474              **************************/
1475
1476             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
1477
1478             /* EWALD ELECTROSTATICS */
1479
1480             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1481             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
1482             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1483             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1484             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1485
1486             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1487                                          &ewtabF,&ewtabFn);
1488             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1489             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1490
1491             fscal            = felec;
1492
1493             /* Update vectorial force */
1494             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
1495             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1496             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1497             
1498             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1499             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1500             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1501
1502             /**************************
1503              * CALCULATE INTERACTIONS *
1504              **************************/
1505
1506             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
1507
1508             /* EWALD ELECTROSTATICS */
1509
1510             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1511             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
1512             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1513             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1514             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1515
1516             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1517                                          &ewtabF,&ewtabFn);
1518             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1519             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1520
1521             fscal            = felec;
1522
1523             /* Update vectorial force */
1524             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1525             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1526             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1527             
1528             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1529             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1530             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1531
1532             /**************************
1533              * CALCULATE INTERACTIONS *
1534              **************************/
1535
1536             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
1537
1538             /* EWALD ELECTROSTATICS */
1539
1540             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1541             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
1542             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1543             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1544             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1545
1546             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1547                                          &ewtabF,&ewtabFn);
1548             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1549             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1550
1551             fscal            = felec;
1552
1553             /* Update vectorial force */
1554             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1555             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1556             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1557             
1558             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1559             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1560             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1561
1562             /**************************
1563              * CALCULATE INTERACTIONS *
1564              **************************/
1565
1566             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1567
1568             /* EWALD ELECTROSTATICS */
1569
1570             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1571             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1572             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1573             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1574             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1575
1576             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1577                                          &ewtabF,&ewtabFn);
1578             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1579             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1580
1581             fscal            = felec;
1582
1583             /* Update vectorial force */
1584             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1585             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1586             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1587             
1588             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1589             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1590             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1597
1598             /* EWALD ELECTROSTATICS */
1599
1600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1601             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1602             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1603             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1604             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1605
1606             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1607                                          &ewtabF,&ewtabFn);
1608             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1609             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1610
1611             fscal            = felec;
1612
1613             /* Update vectorial force */
1614             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1615             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1616             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1617             
1618             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1619             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1620             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1621
1622             /**************************
1623              * CALCULATE INTERACTIONS *
1624              **************************/
1625
1626             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
1627
1628             /* EWALD ELECTROSTATICS */
1629
1630             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1631             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
1632             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1633             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1634             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1635
1636             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1637                                          &ewtabF,&ewtabFn);
1638             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1639             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1640
1641             fscal            = felec;
1642
1643             /* Update vectorial force */
1644             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
1645             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1646             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1647             
1648             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1649             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1650             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1651
1652             /**************************
1653              * CALCULATE INTERACTIONS *
1654              **************************/
1655
1656             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1657
1658             /* EWALD ELECTROSTATICS */
1659
1660             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1661             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1662             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1663             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1664             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1665
1666             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1667                                          &ewtabF,&ewtabFn);
1668             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1669             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1670
1671             fscal            = felec;
1672
1673             /* Update vectorial force */
1674             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1675             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1676             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1677             
1678             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1679             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1680             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1681
1682             /**************************
1683              * CALCULATE INTERACTIONS *
1684              **************************/
1685
1686             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1687
1688             /* EWALD ELECTROSTATICS */
1689
1690             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1691             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1692             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1693             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1694             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1695
1696             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1697                                          &ewtabF,&ewtabFn);
1698             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1699             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1700
1701             fscal            = felec;
1702
1703             /* Update vectorial force */
1704             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1705             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1706             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1707             
1708             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1709             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1710             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1711
1712             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1713
1714             /* Inner loop uses 373 flops */
1715         }
1716
1717         if(jidx<j_index_end)
1718         {
1719
1720             jnrA             = jjnr[jidx];
1721             j_coord_offsetA  = DIM*jnrA;
1722
1723             /* load j atom coordinates */
1724             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
1725                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1726
1727             /* Calculate displacement vector */
1728             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1729             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1730             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1731             dx01             = _fjsp_sub_v2r8(ix0,jx1);
1732             dy01             = _fjsp_sub_v2r8(iy0,jy1);
1733             dz01             = _fjsp_sub_v2r8(iz0,jz1);
1734             dx02             = _fjsp_sub_v2r8(ix0,jx2);
1735             dy02             = _fjsp_sub_v2r8(iy0,jy2);
1736             dz02             = _fjsp_sub_v2r8(iz0,jz2);
1737             dx10             = _fjsp_sub_v2r8(ix1,jx0);
1738             dy10             = _fjsp_sub_v2r8(iy1,jy0);
1739             dz10             = _fjsp_sub_v2r8(iz1,jz0);
1740             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1741             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1742             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1743             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1744             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1745             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1746             dx20             = _fjsp_sub_v2r8(ix2,jx0);
1747             dy20             = _fjsp_sub_v2r8(iy2,jy0);
1748             dz20             = _fjsp_sub_v2r8(iz2,jz0);
1749             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1750             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1751             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1752             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1753             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1754             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1755
1756             /* Calculate squared distance and things based on it */
1757             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1758             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1759             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1760             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1761             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1762             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1763             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1764             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1765             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1766
1767             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1768             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
1769             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
1770             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
1771             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1772             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1773             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
1774             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1775             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1776
1777             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1778             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
1779             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
1780             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
1781             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1782             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1783             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
1784             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1785             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1786
1787             fjx0             = _fjsp_setzero_v2r8();
1788             fjy0             = _fjsp_setzero_v2r8();
1789             fjz0             = _fjsp_setzero_v2r8();
1790             fjx1             = _fjsp_setzero_v2r8();
1791             fjy1             = _fjsp_setzero_v2r8();
1792             fjz1             = _fjsp_setzero_v2r8();
1793             fjx2             = _fjsp_setzero_v2r8();
1794             fjy2             = _fjsp_setzero_v2r8();
1795             fjz2             = _fjsp_setzero_v2r8();
1796
1797             /**************************
1798              * CALCULATE INTERACTIONS *
1799              **************************/
1800
1801             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1802
1803             /* EWALD ELECTROSTATICS */
1804
1805             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1806             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
1807             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1808             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1809             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1810
1811             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1812             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1813             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
1814
1815             /* Analytical LJ-PME */
1816             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1817             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
1818             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
1819             exponent         = gmx_simd_exp_d(ewcljrsq);
1820             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1821             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
1822             /* f6A = 6 * C6grid * (1 - poly) */
1823             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_sub_v2r8(one,poly));
1824             /* f6B = C6grid * exponent * beta^6 */
1825             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
1826             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1827             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1828
1829             fscal            = _fjsp_add_v2r8(felec,fvdw);
1830
1831             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1832
1833             /* Update vectorial force */
1834             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1835             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1836             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1837             
1838             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1839             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1840             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1841
1842             /**************************
1843              * CALCULATE INTERACTIONS *
1844              **************************/
1845
1846             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
1847
1848             /* EWALD ELECTROSTATICS */
1849
1850             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1851             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
1852             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1853             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1854             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1855
1856             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1857             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1858             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1859
1860             fscal            = felec;
1861
1862             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1863
1864             /* Update vectorial force */
1865             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
1866             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1867             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1868             
1869             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1870             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1871             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1872
1873             /**************************
1874              * CALCULATE INTERACTIONS *
1875              **************************/
1876
1877             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
1878
1879             /* EWALD ELECTROSTATICS */
1880
1881             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1882             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
1883             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1884             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1885             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1886
1887             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1888             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1889             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1890
1891             fscal            = felec;
1892
1893             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1894
1895             /* Update vectorial force */
1896             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1897             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1898             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1899             
1900             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1901             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1902             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1903
1904             /**************************
1905              * CALCULATE INTERACTIONS *
1906              **************************/
1907
1908             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
1909
1910             /* EWALD ELECTROSTATICS */
1911
1912             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1913             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
1914             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1915             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1916             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1917
1918             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1919             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1920             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1921
1922             fscal            = felec;
1923
1924             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1925
1926             /* Update vectorial force */
1927             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1928             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1929             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1930             
1931             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1932             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1933             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1934
1935             /**************************
1936              * CALCULATE INTERACTIONS *
1937              **************************/
1938
1939             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1940
1941             /* EWALD ELECTROSTATICS */
1942
1943             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1944             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1945             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1946             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1947             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1948
1949             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1950             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1951             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1952
1953             fscal            = felec;
1954
1955             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1956
1957             /* Update vectorial force */
1958             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1959             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1960             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1961             
1962             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1963             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1964             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1965
1966             /**************************
1967              * CALCULATE INTERACTIONS *
1968              **************************/
1969
1970             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1971
1972             /* EWALD ELECTROSTATICS */
1973
1974             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1975             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1976             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1977             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1978             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1979
1980             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1981             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1982             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1983
1984             fscal            = felec;
1985
1986             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1987
1988             /* Update vectorial force */
1989             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1990             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1991             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1992             
1993             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1994             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1995             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1996
1997             /**************************
1998              * CALCULATE INTERACTIONS *
1999              **************************/
2000
2001             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
2002
2003             /* EWALD ELECTROSTATICS */
2004
2005             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2006             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
2007             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2008             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2009             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2010
2011             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2012             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2013             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2014
2015             fscal            = felec;
2016
2017             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2018
2019             /* Update vectorial force */
2020             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
2021             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2022             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2023             
2024             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2025             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2026             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2027
2028             /**************************
2029              * CALCULATE INTERACTIONS *
2030              **************************/
2031
2032             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2033
2034             /* EWALD ELECTROSTATICS */
2035
2036             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2037             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2038             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2039             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2040             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2041
2042             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2043             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2044             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2045
2046             fscal            = felec;
2047
2048             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2049
2050             /* Update vectorial force */
2051             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2052             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2053             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2054             
2055             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2056             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2057             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2058
2059             /**************************
2060              * CALCULATE INTERACTIONS *
2061              **************************/
2062
2063             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2064
2065             /* EWALD ELECTROSTATICS */
2066
2067             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2068             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2069             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2070             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2071             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2072
2073             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2074             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2075             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2076
2077             fscal            = felec;
2078
2079             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2080
2081             /* Update vectorial force */
2082             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2083             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2084             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2085             
2086             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2087             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2088             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2089
2090             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2091
2092             /* Inner loop uses 373 flops */
2093         }
2094
2095         /* End of innermost loop */
2096
2097         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2098                                               f+i_coord_offset,fshift+i_shift_offset);
2099
2100         /* Increment number of inner iterations */
2101         inneriter                  += j_index_end - j_index_start;
2102
2103         /* Outer loop uses 18 flops */
2104     }
2105
2106     /* Increment number of outer iterations */
2107     outeriter        += nri;
2108
2109     /* Update outer/inner flops */
2110
2111     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*373);
2112 }