Remove no-inline-max-size and suppress remark
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/legacyheaders/vec.h"
47 #include "nrnb.h"
48
49 #include "kernelutil_sparc64_hpc_ace_double.h"
50
51 /*
52  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sparc64_hpc_ace_double
53  * Electrostatics interaction: Ewald
54  * VdW interaction:            LJEwald
55  * Geometry:                   Water3-Water3
56  * Calculate force/pot:        PotentialAndForce
57  */
58 void
59 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sparc64_hpc_ace_double
60                     (t_nblist                    * gmx_restrict       nlist,
61                      rvec                        * gmx_restrict          xx,
62                      rvec                        * gmx_restrict          ff,
63                      t_forcerec                  * gmx_restrict          fr,
64                      t_mdatoms                   * gmx_restrict     mdatoms,
65                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66                      t_nrnb                      * gmx_restrict        nrnb)
67 {
68     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69      * just 0 for non-waters.
70      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
71      * jnr indices corresponding to data put in the two positions in the SIMD register.
72      */
73     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
74     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75     int              jnrA,jnrB;
76     int              j_coord_offsetA,j_coord_offsetB;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81     int              vdwioffset0;
82     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83     int              vdwioffset1;
84     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     int              vdwioffset2;
86     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     int              vdwjidx0A,vdwjidx0B;
88     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     int              vdwjidx1A,vdwjidx1B;
90     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B;
92     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
95     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
96     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
100     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
103     real             *charge;
104     int              nvdwtype;
105     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
106     int              *vdwtype;
107     real             *vdwparam;
108     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
109     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
110     _fjsp_v2r8           c6grid_00;
111     _fjsp_v2r8           c6grid_01;
112     _fjsp_v2r8           c6grid_02;
113     _fjsp_v2r8           c6grid_10;
114     _fjsp_v2r8           c6grid_11;
115     _fjsp_v2r8           c6grid_12;
116     _fjsp_v2r8           c6grid_20;
117     _fjsp_v2r8           c6grid_21;
118     _fjsp_v2r8           c6grid_22;
119     real                 *vdwgridparam;
120     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
121     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
122     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
123     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
124     real             *ewtab;
125     _fjsp_v2r8       itab_tmp;
126     _fjsp_v2r8       dummy_mask,cutoff_mask;
127     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
128     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
129     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
130
131     x                = xx[0];
132     f                = ff[0];
133
134     nri              = nlist->nri;
135     iinr             = nlist->iinr;
136     jindex           = nlist->jindex;
137     jjnr             = nlist->jjnr;
138     shiftidx         = nlist->shift;
139     gid              = nlist->gid;
140     shiftvec         = fr->shift_vec[0];
141     fshift           = fr->fshift[0];
142     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
143     charge           = mdatoms->chargeA;
144     nvdwtype         = fr->ntype;
145     vdwparam         = fr->nbfp;
146     vdwtype          = mdatoms->typeA;
147     vdwgridparam     = fr->ljpme_c6grid;
148     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
149     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
150     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
151
152     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
153     ewtab            = fr->ic->tabq_coul_FDV0;
154     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
155     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
156
157     /* Setup water-specific parameters */
158     inr              = nlist->iinr[0];
159     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
160     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
161     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
162     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
163
164     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
165     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
166     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
167     vdwjidx0A        = 2*vdwtype[inr+0];
168     qq00             = _fjsp_mul_v2r8(iq0,jq0);
169     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
170     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
171     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
172     qq01             = _fjsp_mul_v2r8(iq0,jq1);
173     qq02             = _fjsp_mul_v2r8(iq0,jq2);
174     qq10             = _fjsp_mul_v2r8(iq1,jq0);
175     qq11             = _fjsp_mul_v2r8(iq1,jq1);
176     qq12             = _fjsp_mul_v2r8(iq1,jq2);
177     qq20             = _fjsp_mul_v2r8(iq2,jq0);
178     qq21             = _fjsp_mul_v2r8(iq2,jq1);
179     qq22             = _fjsp_mul_v2r8(iq2,jq2);
180
181     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
182     rcutoff_scalar   = fr->rcoulomb;
183     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
184     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
185
186     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
187     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
188
189     /* Avoid stupid compiler warnings */
190     jnrA = jnrB = 0;
191     j_coord_offsetA = 0;
192     j_coord_offsetB = 0;
193
194     outeriter        = 0;
195     inneriter        = 0;
196
197     /* Start outer loop over neighborlists */
198     for(iidx=0; iidx<nri; iidx++)
199     {
200         /* Load shift vector for this list */
201         i_shift_offset   = DIM*shiftidx[iidx];
202
203         /* Load limits for loop over neighbors */
204         j_index_start    = jindex[iidx];
205         j_index_end      = jindex[iidx+1];
206
207         /* Get outer coordinate index */
208         inr              = iinr[iidx];
209         i_coord_offset   = DIM*inr;
210
211         /* Load i particle coords and add shift vector */
212         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
213                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
214
215         fix0             = _fjsp_setzero_v2r8();
216         fiy0             = _fjsp_setzero_v2r8();
217         fiz0             = _fjsp_setzero_v2r8();
218         fix1             = _fjsp_setzero_v2r8();
219         fiy1             = _fjsp_setzero_v2r8();
220         fiz1             = _fjsp_setzero_v2r8();
221         fix2             = _fjsp_setzero_v2r8();
222         fiy2             = _fjsp_setzero_v2r8();
223         fiz2             = _fjsp_setzero_v2r8();
224
225         /* Reset potential sums */
226         velecsum         = _fjsp_setzero_v2r8();
227         vvdwsum          = _fjsp_setzero_v2r8();
228
229         /* Start inner kernel loop */
230         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
231         {
232
233             /* Get j neighbor index, and coordinate index */
234             jnrA             = jjnr[jidx];
235             jnrB             = jjnr[jidx+1];
236             j_coord_offsetA  = DIM*jnrA;
237             j_coord_offsetB  = DIM*jnrB;
238
239             /* load j atom coordinates */
240             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
241                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
242
243             /* Calculate displacement vector */
244             dx00             = _fjsp_sub_v2r8(ix0,jx0);
245             dy00             = _fjsp_sub_v2r8(iy0,jy0);
246             dz00             = _fjsp_sub_v2r8(iz0,jz0);
247             dx01             = _fjsp_sub_v2r8(ix0,jx1);
248             dy01             = _fjsp_sub_v2r8(iy0,jy1);
249             dz01             = _fjsp_sub_v2r8(iz0,jz1);
250             dx02             = _fjsp_sub_v2r8(ix0,jx2);
251             dy02             = _fjsp_sub_v2r8(iy0,jy2);
252             dz02             = _fjsp_sub_v2r8(iz0,jz2);
253             dx10             = _fjsp_sub_v2r8(ix1,jx0);
254             dy10             = _fjsp_sub_v2r8(iy1,jy0);
255             dz10             = _fjsp_sub_v2r8(iz1,jz0);
256             dx11             = _fjsp_sub_v2r8(ix1,jx1);
257             dy11             = _fjsp_sub_v2r8(iy1,jy1);
258             dz11             = _fjsp_sub_v2r8(iz1,jz1);
259             dx12             = _fjsp_sub_v2r8(ix1,jx2);
260             dy12             = _fjsp_sub_v2r8(iy1,jy2);
261             dz12             = _fjsp_sub_v2r8(iz1,jz2);
262             dx20             = _fjsp_sub_v2r8(ix2,jx0);
263             dy20             = _fjsp_sub_v2r8(iy2,jy0);
264             dz20             = _fjsp_sub_v2r8(iz2,jz0);
265             dx21             = _fjsp_sub_v2r8(ix2,jx1);
266             dy21             = _fjsp_sub_v2r8(iy2,jy1);
267             dz21             = _fjsp_sub_v2r8(iz2,jz1);
268             dx22             = _fjsp_sub_v2r8(ix2,jx2);
269             dy22             = _fjsp_sub_v2r8(iy2,jy2);
270             dz22             = _fjsp_sub_v2r8(iz2,jz2);
271
272             /* Calculate squared distance and things based on it */
273             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
274             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
275             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
276             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
277             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
278             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
279             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
280             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
281             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
282
283             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
284             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
285             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
286             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
287             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
288             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
289             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
290             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
291             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
292
293             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
294             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
295             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
296             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
297             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
298             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
299             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
300             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
301             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
302
303             fjx0             = _fjsp_setzero_v2r8();
304             fjy0             = _fjsp_setzero_v2r8();
305             fjz0             = _fjsp_setzero_v2r8();
306             fjx1             = _fjsp_setzero_v2r8();
307             fjy1             = _fjsp_setzero_v2r8();
308             fjz1             = _fjsp_setzero_v2r8();
309             fjx2             = _fjsp_setzero_v2r8();
310             fjy2             = _fjsp_setzero_v2r8();
311             fjz2             = _fjsp_setzero_v2r8();
312
313             /**************************
314              * CALCULATE INTERACTIONS *
315              **************************/
316
317             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
318             {
319
320             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
321
322             /* EWALD ELECTROSTATICS */
323
324             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
325             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
326             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
327             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
328             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
329
330             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
331             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
332             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
333             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
334             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
335             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
336             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
337             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
338             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
339             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
340
341             /* Analytical LJ-PME */
342             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
343             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
344             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
345             exponent         = gmx_simd_exp_d(ewcljrsq);
346             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
347             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
348             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
349             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(c6grid_00,_fjsp_sub_v2r8(poly,one),c6_00),rinvsix);
350             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
351             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
352                                _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
353             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
354             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
355
356             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
357
358             /* Update potential sum for this i atom from the interaction with this j atom. */
359             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
360             velecsum         = _fjsp_add_v2r8(velecsum,velec);
361             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
362             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
363
364             fscal            = _fjsp_add_v2r8(felec,fvdw);
365
366             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
367
368             /* Update vectorial force */
369             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
370             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
371             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
372             
373             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
374             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
375             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
376
377             }
378
379             /**************************
380              * CALCULATE INTERACTIONS *
381              **************************/
382
383             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
384             {
385
386             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
387
388             /* EWALD ELECTROSTATICS */
389
390             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
392             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
393             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
394             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
395
396             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
397             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
398             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
399             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
400             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
401             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
402             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
403             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
404             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
405             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
406
407             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
411             velecsum         = _fjsp_add_v2r8(velecsum,velec);
412
413             fscal            = felec;
414
415             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
416
417             /* Update vectorial force */
418             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
419             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
420             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
421             
422             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
423             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
424             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
425
426             }
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
433             {
434
435             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
436
437             /* EWALD ELECTROSTATICS */
438
439             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
440             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
441             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
442             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
443             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
444
445             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
446             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
447             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
448             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
449             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
450             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
451             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
452             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
453             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
454             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
455
456             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
457
458             /* Update potential sum for this i atom from the interaction with this j atom. */
459             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
460             velecsum         = _fjsp_add_v2r8(velecsum,velec);
461
462             fscal            = felec;
463
464             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
465
466             /* Update vectorial force */
467             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
468             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
469             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
470             
471             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
472             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
473             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
474
475             }
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
482             {
483
484             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
485
486             /* EWALD ELECTROSTATICS */
487
488             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
489             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
490             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
491             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
492             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
493
494             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
495             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
496             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
497             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
498             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
499             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
500             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
501             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
502             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
503             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
504
505             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
506
507             /* Update potential sum for this i atom from the interaction with this j atom. */
508             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
509             velecsum         = _fjsp_add_v2r8(velecsum,velec);
510
511             fscal            = felec;
512
513             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
514
515             /* Update vectorial force */
516             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
517             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
518             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
519             
520             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
521             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
522             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
523
524             }
525
526             /**************************
527              * CALCULATE INTERACTIONS *
528              **************************/
529
530             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
531             {
532
533             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
534
535             /* EWALD ELECTROSTATICS */
536
537             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
538             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
539             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
540             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
541             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
542
543             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
544             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
545             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
546             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
547             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
548             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
549             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
550             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
551             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
552             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
553
554             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
555
556             /* Update potential sum for this i atom from the interaction with this j atom. */
557             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
558             velecsum         = _fjsp_add_v2r8(velecsum,velec);
559
560             fscal            = felec;
561
562             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
563
564             /* Update vectorial force */
565             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
566             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
567             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
568             
569             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
570             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
571             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
572
573             }
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
580             {
581
582             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
583
584             /* EWALD ELECTROSTATICS */
585
586             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
587             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
588             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
589             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
590             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
591
592             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
593             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
594             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
595             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
596             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
597             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
598             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
599             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
600             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
601             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
602
603             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
604
605             /* Update potential sum for this i atom from the interaction with this j atom. */
606             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
607             velecsum         = _fjsp_add_v2r8(velecsum,velec);
608
609             fscal            = felec;
610
611             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
612
613             /* Update vectorial force */
614             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
615             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
616             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
617             
618             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
619             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
620             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
621
622             }
623
624             /**************************
625              * CALCULATE INTERACTIONS *
626              **************************/
627
628             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
629             {
630
631             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
632
633             /* EWALD ELECTROSTATICS */
634
635             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
636             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
637             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
638             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
639             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
640
641             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
642             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
643             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
644             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
645             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
646             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
647             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
648             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
649             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
650             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
651
652             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
656             velecsum         = _fjsp_add_v2r8(velecsum,velec);
657
658             fscal            = felec;
659
660             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
661
662             /* Update vectorial force */
663             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
664             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
665             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
666             
667             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
668             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
669             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
670
671             }
672
673             /**************************
674              * CALCULATE INTERACTIONS *
675              **************************/
676
677             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
678             {
679
680             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
681
682             /* EWALD ELECTROSTATICS */
683
684             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
685             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
686             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
687             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
688             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
689
690             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
691             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
692             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
693             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
694             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
695             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
696             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
697             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
698             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
699             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
700
701             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
702
703             /* Update potential sum for this i atom from the interaction with this j atom. */
704             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
705             velecsum         = _fjsp_add_v2r8(velecsum,velec);
706
707             fscal            = felec;
708
709             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
710
711             /* Update vectorial force */
712             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
713             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
714             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
715             
716             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
717             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
718             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
719
720             }
721
722             /**************************
723              * CALCULATE INTERACTIONS *
724              **************************/
725
726             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
727             {
728
729             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
730
731             /* EWALD ELECTROSTATICS */
732
733             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
734             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
735             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
736             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
737             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
738
739             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
740             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
741             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
742             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
743             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
744             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
745             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
746             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
747             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
748             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
749
750             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
751
752             /* Update potential sum for this i atom from the interaction with this j atom. */
753             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
754             velecsum         = _fjsp_add_v2r8(velecsum,velec);
755
756             fscal            = felec;
757
758             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
759
760             /* Update vectorial force */
761             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
762             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
763             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
764             
765             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
766             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
767             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
768
769             }
770
771             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
772
773             /* Inner loop uses 471 flops */
774         }
775
776         if(jidx<j_index_end)
777         {
778
779             jnrA             = jjnr[jidx];
780             j_coord_offsetA  = DIM*jnrA;
781
782             /* load j atom coordinates */
783             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
784                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
785
786             /* Calculate displacement vector */
787             dx00             = _fjsp_sub_v2r8(ix0,jx0);
788             dy00             = _fjsp_sub_v2r8(iy0,jy0);
789             dz00             = _fjsp_sub_v2r8(iz0,jz0);
790             dx01             = _fjsp_sub_v2r8(ix0,jx1);
791             dy01             = _fjsp_sub_v2r8(iy0,jy1);
792             dz01             = _fjsp_sub_v2r8(iz0,jz1);
793             dx02             = _fjsp_sub_v2r8(ix0,jx2);
794             dy02             = _fjsp_sub_v2r8(iy0,jy2);
795             dz02             = _fjsp_sub_v2r8(iz0,jz2);
796             dx10             = _fjsp_sub_v2r8(ix1,jx0);
797             dy10             = _fjsp_sub_v2r8(iy1,jy0);
798             dz10             = _fjsp_sub_v2r8(iz1,jz0);
799             dx11             = _fjsp_sub_v2r8(ix1,jx1);
800             dy11             = _fjsp_sub_v2r8(iy1,jy1);
801             dz11             = _fjsp_sub_v2r8(iz1,jz1);
802             dx12             = _fjsp_sub_v2r8(ix1,jx2);
803             dy12             = _fjsp_sub_v2r8(iy1,jy2);
804             dz12             = _fjsp_sub_v2r8(iz1,jz2);
805             dx20             = _fjsp_sub_v2r8(ix2,jx0);
806             dy20             = _fjsp_sub_v2r8(iy2,jy0);
807             dz20             = _fjsp_sub_v2r8(iz2,jz0);
808             dx21             = _fjsp_sub_v2r8(ix2,jx1);
809             dy21             = _fjsp_sub_v2r8(iy2,jy1);
810             dz21             = _fjsp_sub_v2r8(iz2,jz1);
811             dx22             = _fjsp_sub_v2r8(ix2,jx2);
812             dy22             = _fjsp_sub_v2r8(iy2,jy2);
813             dz22             = _fjsp_sub_v2r8(iz2,jz2);
814
815             /* Calculate squared distance and things based on it */
816             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
817             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
818             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
819             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
820             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
821             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
822             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
823             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
824             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
825
826             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
827             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
828             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
829             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
830             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
831             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
832             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
833             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
834             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
835
836             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
837             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
838             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
839             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
840             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
841             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
842             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
843             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
844             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
845
846             fjx0             = _fjsp_setzero_v2r8();
847             fjy0             = _fjsp_setzero_v2r8();
848             fjz0             = _fjsp_setzero_v2r8();
849             fjx1             = _fjsp_setzero_v2r8();
850             fjy1             = _fjsp_setzero_v2r8();
851             fjz1             = _fjsp_setzero_v2r8();
852             fjx2             = _fjsp_setzero_v2r8();
853             fjy2             = _fjsp_setzero_v2r8();
854             fjz2             = _fjsp_setzero_v2r8();
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
861             {
862
863             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
864
865             /* EWALD ELECTROSTATICS */
866
867             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
868             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
869             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
870             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
871             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
872
873             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
874             ewtabD           = _fjsp_setzero_v2r8();
875             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
876             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
877             ewtabFn          = _fjsp_setzero_v2r8();
878             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
879             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
880             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
881             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
882             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
883
884             /* Analytical LJ-PME */
885             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
886             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
887             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
888             exponent         = gmx_simd_exp_d(ewcljrsq);
889             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
890             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
891             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
892             vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(c6grid_00,_fjsp_sub_v2r8(poly,one),c6_00),rinvsix);
893             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
894             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
895                                _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
896             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
897             fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
898
899             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
900
901             /* Update potential sum for this i atom from the interaction with this j atom. */
902             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
903             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
904             velecsum         = _fjsp_add_v2r8(velecsum,velec);
905             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
906             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
907             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
908
909             fscal            = _fjsp_add_v2r8(felec,fvdw);
910
911             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
912
913             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
914
915             /* Update vectorial force */
916             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
917             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
918             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
919             
920             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
921             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
922             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
923
924             }
925
926             /**************************
927              * CALCULATE INTERACTIONS *
928              **************************/
929
930             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
931             {
932
933             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
934
935             /* EWALD ELECTROSTATICS */
936
937             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
938             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
939             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
940             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
941             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
942
943             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
944             ewtabD           = _fjsp_setzero_v2r8();
945             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
946             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
947             ewtabFn          = _fjsp_setzero_v2r8();
948             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
949             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
950             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
951             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
952             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
953
954             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
955
956             /* Update potential sum for this i atom from the interaction with this j atom. */
957             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
958             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
959             velecsum         = _fjsp_add_v2r8(velecsum,velec);
960
961             fscal            = felec;
962
963             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
964
965             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
966
967             /* Update vectorial force */
968             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
969             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
970             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
971             
972             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
973             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
974             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
975
976             }
977
978             /**************************
979              * CALCULATE INTERACTIONS *
980              **************************/
981
982             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
983             {
984
985             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
986
987             /* EWALD ELECTROSTATICS */
988
989             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
990             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
991             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
992             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
993             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
994
995             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
996             ewtabD           = _fjsp_setzero_v2r8();
997             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
998             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
999             ewtabFn          = _fjsp_setzero_v2r8();
1000             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1001             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1002             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1003             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
1004             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1005
1006             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1007
1008             /* Update potential sum for this i atom from the interaction with this j atom. */
1009             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1010             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1011             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1012
1013             fscal            = felec;
1014
1015             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1016
1017             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1018
1019             /* Update vectorial force */
1020             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1021             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1022             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1023             
1024             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1025             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1026             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1027
1028             }
1029
1030             /**************************
1031              * CALCULATE INTERACTIONS *
1032              **************************/
1033
1034             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1035             {
1036
1037             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
1038
1039             /* EWALD ELECTROSTATICS */
1040
1041             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1042             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
1043             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1044             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1045             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1046
1047             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1048             ewtabD           = _fjsp_setzero_v2r8();
1049             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1050             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1051             ewtabFn          = _fjsp_setzero_v2r8();
1052             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1053             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1054             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1055             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
1056             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1057
1058             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1059
1060             /* Update potential sum for this i atom from the interaction with this j atom. */
1061             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1062             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1063             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1064
1065             fscal            = felec;
1066
1067             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1068
1069             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1070
1071             /* Update vectorial force */
1072             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1073             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1074             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1075             
1076             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1077             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1078             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1079
1080             }
1081
1082             /**************************
1083              * CALCULATE INTERACTIONS *
1084              **************************/
1085
1086             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1087             {
1088
1089             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1090
1091             /* EWALD ELECTROSTATICS */
1092
1093             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1094             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1095             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1096             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1097             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1098
1099             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1100             ewtabD           = _fjsp_setzero_v2r8();
1101             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1102             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1103             ewtabFn          = _fjsp_setzero_v2r8();
1104             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1105             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1106             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1107             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
1108             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1109
1110             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1111
1112             /* Update potential sum for this i atom from the interaction with this j atom. */
1113             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1114             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1115             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1116
1117             fscal            = felec;
1118
1119             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1120
1121             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1122
1123             /* Update vectorial force */
1124             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1125             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1126             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1127             
1128             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1129             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1130             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1131
1132             }
1133
1134             /**************************
1135              * CALCULATE INTERACTIONS *
1136              **************************/
1137
1138             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1139             {
1140
1141             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1142
1143             /* EWALD ELECTROSTATICS */
1144
1145             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1146             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1147             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1148             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1149             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1150
1151             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1152             ewtabD           = _fjsp_setzero_v2r8();
1153             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1154             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1155             ewtabFn          = _fjsp_setzero_v2r8();
1156             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1157             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1158             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1159             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
1160             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1161
1162             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1163
1164             /* Update potential sum for this i atom from the interaction with this j atom. */
1165             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1166             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1167             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1168
1169             fscal            = felec;
1170
1171             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1172
1173             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1174
1175             /* Update vectorial force */
1176             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1177             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1178             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1179             
1180             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1181             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1182             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1183
1184             }
1185
1186             /**************************
1187              * CALCULATE INTERACTIONS *
1188              **************************/
1189
1190             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
1191             {
1192
1193             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
1194
1195             /* EWALD ELECTROSTATICS */
1196
1197             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1198             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
1199             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1200             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1201             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1202
1203             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1204             ewtabD           = _fjsp_setzero_v2r8();
1205             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1206             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1207             ewtabFn          = _fjsp_setzero_v2r8();
1208             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1209             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1210             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1211             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
1212             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1213
1214             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
1215
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1218             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1219             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1220
1221             fscal            = felec;
1222
1223             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1224
1225             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1226
1227             /* Update vectorial force */
1228             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
1229             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1230             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1231             
1232             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1233             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1234             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1235
1236             }
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1243             {
1244
1245             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1246
1247             /* EWALD ELECTROSTATICS */
1248
1249             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1250             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1251             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1252             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1253             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1254
1255             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1256             ewtabD           = _fjsp_setzero_v2r8();
1257             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1258             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1259             ewtabFn          = _fjsp_setzero_v2r8();
1260             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1261             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1262             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1263             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
1264             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1265
1266             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1267
1268             /* Update potential sum for this i atom from the interaction with this j atom. */
1269             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1270             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1271             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1272
1273             fscal            = felec;
1274
1275             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1276
1277             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1278
1279             /* Update vectorial force */
1280             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1281             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1282             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1283             
1284             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1285             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1286             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1287
1288             }
1289
1290             /**************************
1291              * CALCULATE INTERACTIONS *
1292              **************************/
1293
1294             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1295             {
1296
1297             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1298
1299             /* EWALD ELECTROSTATICS */
1300
1301             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1302             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1303             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1304             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1305             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1306
1307             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1308             ewtabD           = _fjsp_setzero_v2r8();
1309             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1310             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1311             ewtabFn          = _fjsp_setzero_v2r8();
1312             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1313             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1314             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1315             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
1316             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1317
1318             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1319
1320             /* Update potential sum for this i atom from the interaction with this j atom. */
1321             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1322             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1323             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1324
1325             fscal            = felec;
1326
1327             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1328
1329             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1330
1331             /* Update vectorial force */
1332             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1333             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1334             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1335             
1336             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1337             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1338             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1339
1340             }
1341
1342             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1343
1344             /* Inner loop uses 471 flops */
1345         }
1346
1347         /* End of innermost loop */
1348
1349         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1350                                               f+i_coord_offset,fshift+i_shift_offset);
1351
1352         ggid                        = gid[iidx];
1353         /* Update potential energies */
1354         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1355         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1356
1357         /* Increment number of inner iterations */
1358         inneriter                  += j_index_end - j_index_start;
1359
1360         /* Outer loop uses 20 flops */
1361     }
1362
1363     /* Increment number of outer iterations */
1364     outeriter        += nri;
1365
1366     /* Update outer/inner flops */
1367
1368     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*471);
1369 }
1370 /*
1371  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sparc64_hpc_ace_double
1372  * Electrostatics interaction: Ewald
1373  * VdW interaction:            LJEwald
1374  * Geometry:                   Water3-Water3
1375  * Calculate force/pot:        Force
1376  */
1377 void
1378 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sparc64_hpc_ace_double
1379                     (t_nblist                    * gmx_restrict       nlist,
1380                      rvec                        * gmx_restrict          xx,
1381                      rvec                        * gmx_restrict          ff,
1382                      t_forcerec                  * gmx_restrict          fr,
1383                      t_mdatoms                   * gmx_restrict     mdatoms,
1384                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1385                      t_nrnb                      * gmx_restrict        nrnb)
1386 {
1387     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1388      * just 0 for non-waters.
1389      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
1390      * jnr indices (jnrA, jnrB) whose data are put in the two positions of the SIMD register.
1391      */
1392     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1393     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1394     int              jnrA,jnrB;
1395     int              j_coord_offsetA,j_coord_offsetB;
1396     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1397     real             rcutoff_scalar;
1398     real             *shiftvec,*fshift,*x,*f;
1399     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1400     int              vdwioffset0;
1401     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1402     int              vdwioffset1;
1403     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1404     int              vdwioffset2;
1405     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1406     int              vdwjidx0A,vdwjidx0B;
1407     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1408     int              vdwjidx1A,vdwjidx1B;
1409     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1410     int              vdwjidx2A,vdwjidx2B;
1411     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1412     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1413     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1414     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1415     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1416     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1417     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1418     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1419     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1420     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1421     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
1422     real             *charge;
1423     int              nvdwtype;
1424     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1425     int              *vdwtype;
1426     real             *vdwparam;
1427     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
1428     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1429     _fjsp_v2r8           c6grid_00;
1430     _fjsp_v2r8           c6grid_01;
1431     _fjsp_v2r8           c6grid_02;
1432     _fjsp_v2r8           c6grid_10;
1433     _fjsp_v2r8           c6grid_11;
1434     _fjsp_v2r8           c6grid_12;
1435     _fjsp_v2r8           c6grid_20;
1436     _fjsp_v2r8           c6grid_21;
1437     _fjsp_v2r8           c6grid_22;
1438     real                 *vdwgridparam;
1439     _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1440     _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
1441     _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
1442     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1443     real             *ewtab;
1444     _fjsp_v2r8       itab_tmp;
1445     _fjsp_v2r8       dummy_mask,cutoff_mask;
1446     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
1447     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
1448     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1449
1450     x                = xx[0];
1451     f                = ff[0];
1452
1453     nri              = nlist->nri;
1454     iinr             = nlist->iinr;
1455     jindex           = nlist->jindex;
1456     jjnr             = nlist->jjnr;
1457     shiftidx         = nlist->shift;
1458     gid              = nlist->gid;
1459     shiftvec         = fr->shift_vec[0];
1460     fshift           = fr->fshift[0];
1461     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
1462     charge           = mdatoms->chargeA;
1463     nvdwtype         = fr->ntype;
1464     vdwparam         = fr->nbfp;
1465     vdwtype          = mdatoms->typeA;
1466     vdwgridparam     = fr->ljpme_c6grid;
1467     sh_lj_ewald      = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
1468     ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
1469     ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
1470
1471     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1472     ewtab            = fr->ic->tabq_coul_F;
1473     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1474     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1475
1476     /* Setup water-specific parameters */
1477     inr              = nlist->iinr[0];
1478     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
1479     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1480     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1481     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1482
1483     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
1484     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
1485     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
1486     vdwjidx0A        = 2*vdwtype[inr+0];
1487     qq00             = _fjsp_mul_v2r8(iq0,jq0);
1488     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1489     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1490     c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
1491     qq01             = _fjsp_mul_v2r8(iq0,jq1);
1492     qq02             = _fjsp_mul_v2r8(iq0,jq2);
1493     qq10             = _fjsp_mul_v2r8(iq1,jq0);
1494     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1495     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1496     qq20             = _fjsp_mul_v2r8(iq2,jq0);
1497     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1498     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1499
1500     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1501     rcutoff_scalar   = fr->rcoulomb;
1502     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
1503     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
1504
1505     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
1506     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
1507
1508     /* Avoid stupid compiler warnings */
1509     jnrA = jnrB = 0;
1510     j_coord_offsetA = 0;
1511     j_coord_offsetB = 0;
1512
1513     outeriter        = 0;
1514     inneriter        = 0;
1515
1516     /* Start outer loop over neighborlists */
1517     for(iidx=0; iidx<nri; iidx++)
1518     {
1519         /* Load shift vector for this list */
1520         i_shift_offset   = DIM*shiftidx[iidx];
1521
1522         /* Load limits for loop over neighbors */
1523         j_index_start    = jindex[iidx];
1524         j_index_end      = jindex[iidx+1];
1525
1526         /* Get outer coordinate index */
1527         inr              = iinr[iidx];
1528         i_coord_offset   = DIM*inr;
1529
1530         /* Load i particle coords and add shift vector */
1531         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1532                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1533
1534         fix0             = _fjsp_setzero_v2r8();
1535         fiy0             = _fjsp_setzero_v2r8();
1536         fiz0             = _fjsp_setzero_v2r8();
1537         fix1             = _fjsp_setzero_v2r8();
1538         fiy1             = _fjsp_setzero_v2r8();
1539         fiz1             = _fjsp_setzero_v2r8();
1540         fix2             = _fjsp_setzero_v2r8();
1541         fiy2             = _fjsp_setzero_v2r8();
1542         fiz2             = _fjsp_setzero_v2r8();
1543
1544         /* Start inner kernel loop */
1545         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1546         {
1547
1548             /* Get j neighbor index, and coordinate index */
1549             jnrA             = jjnr[jidx];
1550             jnrB             = jjnr[jidx+1];
1551             j_coord_offsetA  = DIM*jnrA;
1552             j_coord_offsetB  = DIM*jnrB;
1553
1554             /* load j atom coordinates */
1555             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1556                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1557
1558             /* Calculate displacement vector */
1559             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1560             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1561             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1562             dx01             = _fjsp_sub_v2r8(ix0,jx1);
1563             dy01             = _fjsp_sub_v2r8(iy0,jy1);
1564             dz01             = _fjsp_sub_v2r8(iz0,jz1);
1565             dx02             = _fjsp_sub_v2r8(ix0,jx2);
1566             dy02             = _fjsp_sub_v2r8(iy0,jy2);
1567             dz02             = _fjsp_sub_v2r8(iz0,jz2);
1568             dx10             = _fjsp_sub_v2r8(ix1,jx0);
1569             dy10             = _fjsp_sub_v2r8(iy1,jy0);
1570             dz10             = _fjsp_sub_v2r8(iz1,jz0);
1571             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1572             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1573             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1574             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1575             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1576             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1577             dx20             = _fjsp_sub_v2r8(ix2,jx0);
1578             dy20             = _fjsp_sub_v2r8(iy2,jy0);
1579             dz20             = _fjsp_sub_v2r8(iz2,jz0);
1580             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1581             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1582             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1583             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1584             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1585             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1586
1587             /* Calculate squared distance and things based on it */
1588             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1589             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1590             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1591             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1592             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1593             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1594             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1595             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1596             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1597
1598             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1599             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
1600             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
1601             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
1602             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1603             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1604             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
1605             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1606             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1607
1608             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1609             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
1610             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
1611             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
1612             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1613             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1614             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
1615             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1616             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1617
1618             fjx0             = _fjsp_setzero_v2r8();
1619             fjy0             = _fjsp_setzero_v2r8();
1620             fjz0             = _fjsp_setzero_v2r8();
1621             fjx1             = _fjsp_setzero_v2r8();
1622             fjy1             = _fjsp_setzero_v2r8();
1623             fjz1             = _fjsp_setzero_v2r8();
1624             fjx2             = _fjsp_setzero_v2r8();
1625             fjy2             = _fjsp_setzero_v2r8();
1626             fjz2             = _fjsp_setzero_v2r8();
1627
1628             /**************************
1629              * CALCULATE INTERACTIONS *
1630              **************************/
1631
1632             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1633             {
1634
1635             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1636
1637             /* EWALD ELECTROSTATICS */
1638
1639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
1641             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1642             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1643             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1644
1645             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1646                                          &ewtabF,&ewtabFn);
1647             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1648             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
1649
1650             /* Analytical LJ-PME */
1651             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1652             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
1653             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
1654             exponent         = gmx_simd_exp_d(ewcljrsq);
1655             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1656             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
1657             /* f6A = 6 * C6grid * (1 - poly) */
1658             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_sub_v2r8(one,poly));
1659             /* f6B = C6grid * exponent * beta^6 */
1660             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
1661             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1662             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1663
1664             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1665
1666             fscal            = _fjsp_add_v2r8(felec,fvdw);
1667
1668             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1669
1670             /* Update vectorial force */
1671             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1672             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1673             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1674             
1675             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1676             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1677             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1678
1679             }
1680
1681             /**************************
1682              * CALCULATE INTERACTIONS *
1683              **************************/
1684
1685             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
1686             {
1687
1688             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
1689
1690             /* EWALD ELECTROSTATICS */
1691
1692             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1693             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
1694             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1695             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1696             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1697
1698             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1699                                          &ewtabF,&ewtabFn);
1700             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1701             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1702
1703             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
1704
1705             fscal            = felec;
1706
1707             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1708
1709             /* Update vectorial force */
1710             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
1711             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1712             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1713             
1714             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1715             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1716             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1717
1718             }
1719
1720             /**************************
1721              * CALCULATE INTERACTIONS *
1722              **************************/
1723
1724             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
1725             {
1726
1727             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
1728
1729             /* EWALD ELECTROSTATICS */
1730
1731             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1732             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
1733             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1734             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1735             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1736
1737             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1738                                          &ewtabF,&ewtabFn);
1739             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1740             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1741
1742             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1743
1744             fscal            = felec;
1745
1746             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1747
1748             /* Update vectorial force */
1749             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1750             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1751             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1752             
1753             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1754             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1755             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1756
1757             }
1758
1759             /**************************
1760              * CALCULATE INTERACTIONS *
1761              **************************/
1762
1763             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1764             {
1765
1766             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
1767
1768             /* EWALD ELECTROSTATICS */
1769
1770             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1771             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
1772             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1773             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1774             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1775
1776             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1777                                          &ewtabF,&ewtabFn);
1778             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1779             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1780
1781             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1782
1783             fscal            = felec;
1784
1785             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1786
1787             /* Update vectorial force */
1788             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1789             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1790             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1791             
1792             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1793             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1794             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1795
1796             }
1797
1798             /**************************
1799              * CALCULATE INTERACTIONS *
1800              **************************/
1801
1802             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1803             {
1804
1805             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1806
1807             /* EWALD ELECTROSTATICS */
1808
1809             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1810             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1811             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1812             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1813             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1814
1815             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1816                                          &ewtabF,&ewtabFn);
1817             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1818             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1819
1820             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1821
1822             fscal            = felec;
1823
1824             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1825
1826             /* Update vectorial force */
1827             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1828             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1829             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1830             
1831             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1832             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1833             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1834
1835             }
1836
1837             /**************************
1838              * CALCULATE INTERACTIONS *
1839              **************************/
1840
1841             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1842             {
1843
1844             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1845
1846             /* EWALD ELECTROSTATICS */
1847
1848             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1849             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1850             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1851             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1852             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1853
1854             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1855                                          &ewtabF,&ewtabFn);
1856             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1857             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1858
1859             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1860
1861             fscal            = felec;
1862
1863             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1864
1865             /* Update vectorial force */
1866             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1867             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1868             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1869             
1870             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1871             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1872             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1873
1874             }
1875
1876             /**************************
1877              * CALCULATE INTERACTIONS *
1878              **************************/
1879
1880             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
1881             {
1882
1883             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
1884
1885             /* EWALD ELECTROSTATICS */
1886
1887             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
1889             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1890             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1891             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1892
1893             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1894                                          &ewtabF,&ewtabFn);
1895             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1896             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1897
1898             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
1899
1900             fscal            = felec;
1901
1902             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1903
1904             /* Update vectorial force */
1905             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
1906             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1907             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1908             
1909             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1910             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1911             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1912
1913             }
1914
1915             /**************************
1916              * CALCULATE INTERACTIONS *
1917              **************************/
1918
1919             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1920             {
1921
1922             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1923
1924             /* EWALD ELECTROSTATICS */
1925
1926             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1927             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1928             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1929             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1930             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1931
1932             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1933                                          &ewtabF,&ewtabFn);
1934             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1935             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1936
1937             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1938
1939             fscal            = felec;
1940
1941             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1942
1943             /* Update vectorial force */
1944             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1945             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1946             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1947             
1948             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1949             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1950             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1951
1952             }
1953
1954             /**************************
1955              * CALCULATE INTERACTIONS *
1956              **************************/
1957
1958             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1959             {
1960
1961             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1962
1963             /* EWALD ELECTROSTATICS */
1964
1965             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1966             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1967             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1968             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1969             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1970
1971             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1972                                          &ewtabF,&ewtabFn);
1973             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1974             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1975
1976             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1977
1978             fscal            = felec;
1979
1980             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1981
1982             /* Update vectorial force */
1983             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1984             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1985             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1986             
1987             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1988             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1989             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1990
1991             }
1992
1993             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1994
1995             /* Inner loop uses 400 flops */
1996         }
1997
1998         if(jidx<j_index_end)
1999         {
2000
2001             jnrA             = jjnr[jidx];
2002             j_coord_offsetA  = DIM*jnrA;
2003
2004             /* load j atom coordinates */
2005             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
2006                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2007
2008             /* Calculate displacement vector */
2009             dx00             = _fjsp_sub_v2r8(ix0,jx0);
2010             dy00             = _fjsp_sub_v2r8(iy0,jy0);
2011             dz00             = _fjsp_sub_v2r8(iz0,jz0);
2012             dx01             = _fjsp_sub_v2r8(ix0,jx1);
2013             dy01             = _fjsp_sub_v2r8(iy0,jy1);
2014             dz01             = _fjsp_sub_v2r8(iz0,jz1);
2015             dx02             = _fjsp_sub_v2r8(ix0,jx2);
2016             dy02             = _fjsp_sub_v2r8(iy0,jy2);
2017             dz02             = _fjsp_sub_v2r8(iz0,jz2);
2018             dx10             = _fjsp_sub_v2r8(ix1,jx0);
2019             dy10             = _fjsp_sub_v2r8(iy1,jy0);
2020             dz10             = _fjsp_sub_v2r8(iz1,jz0);
2021             dx11             = _fjsp_sub_v2r8(ix1,jx1);
2022             dy11             = _fjsp_sub_v2r8(iy1,jy1);
2023             dz11             = _fjsp_sub_v2r8(iz1,jz1);
2024             dx12             = _fjsp_sub_v2r8(ix1,jx2);
2025             dy12             = _fjsp_sub_v2r8(iy1,jy2);
2026             dz12             = _fjsp_sub_v2r8(iz1,jz2);
2027             dx20             = _fjsp_sub_v2r8(ix2,jx0);
2028             dy20             = _fjsp_sub_v2r8(iy2,jy0);
2029             dz20             = _fjsp_sub_v2r8(iz2,jz0);
2030             dx21             = _fjsp_sub_v2r8(ix2,jx1);
2031             dy21             = _fjsp_sub_v2r8(iy2,jy1);
2032             dz21             = _fjsp_sub_v2r8(iz2,jz1);
2033             dx22             = _fjsp_sub_v2r8(ix2,jx2);
2034             dy22             = _fjsp_sub_v2r8(iy2,jy2);
2035             dz22             = _fjsp_sub_v2r8(iz2,jz2);
2036
2037             /* Calculate squared distance and things based on it */
2038             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
2039             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
2040             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
2041             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
2042             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
2043             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
2044             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
2045             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
2046             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
2047
2048             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
2049             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
2050             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
2051             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
2052             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
2053             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
2054             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
2055             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
2056             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
2057
2058             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
2059             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
2060             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
2061             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
2062             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
2063             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
2064             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
2065             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
2066             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
2067
2068             fjx0             = _fjsp_setzero_v2r8();
2069             fjy0             = _fjsp_setzero_v2r8();
2070             fjz0             = _fjsp_setzero_v2r8();
2071             fjx1             = _fjsp_setzero_v2r8();
2072             fjy1             = _fjsp_setzero_v2r8();
2073             fjz1             = _fjsp_setzero_v2r8();
2074             fjx2             = _fjsp_setzero_v2r8();
2075             fjy2             = _fjsp_setzero_v2r8();
2076             fjz2             = _fjsp_setzero_v2r8();
2077
2078             /**************************
2079              * CALCULATE INTERACTIONS *
2080              **************************/
2081
2082             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
2083             {
2084
2085             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
2086
2087             /* EWALD ELECTROSTATICS */
2088
2089             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2090             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
2091             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2092             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2093             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2094
2095             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2096             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2097             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
2098
2099             /* Analytical LJ-PME */
2100             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
2101             ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
2102             ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
2103             exponent         = gmx_simd_exp_d(ewcljrsq);
2104             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2105             poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
2106             /* f6A = 6 * C6grid * (1 - poly) */
2107             f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_sub_v2r8(one,poly));
2108             /* f6B = C6grid * exponent * beta^6 */
2109             f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
2110             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2111             fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
2112
2113             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
2114
2115             fscal            = _fjsp_add_v2r8(felec,fvdw);
2116
2117             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2118
2119             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2120
2121             /* Update vectorial force */
2122             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
2123             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
2124             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
2125             
2126             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
2127             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
2128             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
2129
2130             }
2131
2132             /**************************
2133              * CALCULATE INTERACTIONS *
2134              **************************/
2135
2136             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
2137             {
2138
2139             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
2140
2141             /* EWALD ELECTROSTATICS */
2142
2143             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2144             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
2145             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2146             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2147             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2148
2149             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2150             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2151             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
2152
2153             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
2154
2155             fscal            = felec;
2156
2157             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2158
2159             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2160
2161             /* Update vectorial force */
2162             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
2163             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
2164             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
2165             
2166             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
2167             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
2168             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
2169
2170             }
2171
2172             /**************************
2173              * CALCULATE INTERACTIONS *
2174              **************************/
2175
2176             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
2177             {
2178
2179             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
2180
2181             /* EWALD ELECTROSTATICS */
2182
2183             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2184             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
2185             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2186             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2187             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2188
2189             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2190             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2191             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
2192
2193             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
2194
2195             fscal            = felec;
2196
2197             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2198
2199             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2200
2201             /* Update vectorial force */
2202             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
2203             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
2204             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
2205             
2206             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
2207             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
2208             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
2209
2210             }
2211
2212             /**************************
2213              * CALCULATE INTERACTIONS *
2214              **************************/
2215
2216             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
2217             {
2218
2219             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
2220
2221             /* EWALD ELECTROSTATICS */
2222
2223             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2224             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
2225             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2226             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2227             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2228
2229             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2230             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2231             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
2232
2233             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
2234
2235             fscal            = felec;
2236
2237             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2238
2239             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2240
2241             /* Update vectorial force */
2242             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
2243             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
2244             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
2245             
2246             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
2247             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
2248             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
2249
2250             }
2251
2252             /**************************
2253              * CALCULATE INTERACTIONS *
2254              **************************/
2255
2256             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2257             {
2258
2259             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
2260
2261             /* EWALD ELECTROSTATICS */
2262
2263             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2264             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
2265             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2266             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2267             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2268
2269             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2270             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2271             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2272
2273             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2274
2275             fscal            = felec;
2276
2277             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2278
2279             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2280
2281             /* Update vectorial force */
2282             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
2283             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2284             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2285             
2286             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2287             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2288             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2289
2290             }
2291
2292             /**************************
2293              * CALCULATE INTERACTIONS *
2294              **************************/
2295
2296             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2297             {
2298
2299             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
2300
2301             /* EWALD ELECTROSTATICS */
2302
2303             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2304             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
2305             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2306             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2307             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2308
2309             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2310             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2311             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2312
2313             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2314
2315             fscal            = felec;
2316
2317             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2318
2319             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2320
2321             /* Update vectorial force */
2322             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
2323             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2324             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2325             
2326             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2327             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2328             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2329
2330             }
2331
2332             /**************************
2333              * CALCULATE INTERACTIONS *
2334              **************************/
2335
2336             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
2337             {
2338
2339             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
2340
2341             /* EWALD ELECTROSTATICS */
2342
2343             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2344             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
2345             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2346             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2347             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2348
2349             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2350             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2351             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2352
2353             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
2354
2355             fscal            = felec;
2356
2357             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2358
2359             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2360
2361             /* Update vectorial force */
2362             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
2363             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2364             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2365             
2366             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2367             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2368             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2369
2370             }
2371
2372             /**************************
2373              * CALCULATE INTERACTIONS *
2374              **************************/
2375
2376             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2377             {
2378
2379             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2380
2381             /* EWALD ELECTROSTATICS */
2382
2383             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2384             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2385             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2386             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2387             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2388
2389             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2390             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2391             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2392
2393             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2394
2395             fscal            = felec;
2396
2397             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2398
2399             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2400
2401             /* Update vectorial force */
2402             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2403             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2404             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2405             
2406             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2407             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2408             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2409
2410             }
2411
2412             /**************************
2413              * CALCULATE INTERACTIONS *
2414              **************************/
2415
2416             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2417             {
2418
2419             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2420
2421             /* EWALD ELECTROSTATICS */
2422
2423             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2424             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2425             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2426             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2427             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2428
2429             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2430             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2431             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2432
2433             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2434
2435             fscal            = felec;
2436
2437             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2438
2439             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2440
2441             /* Update vectorial force */
2442             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2443             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2444             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2445             
2446             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2447             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2448             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2449
2450             }
2451
2452             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2453
2454             /* Inner loop uses 400 flops */
2455         }
2456
2457         /* End of innermost loop */
2458
2459         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2460                                               f+i_coord_offset,fshift+i_shift_offset);
2461
2462         /* Increment number of inner iterations */
2463         inneriter                  += j_index_end - j_index_start;
2464
2465         /* Outer loop uses 18 flops */
2466     }
2467
2468     /* Increment number of outer iterations */
2469     outeriter        += nri;
2470
2471     /* Update outer/inner flops */
2472
2473     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);
2474 }