Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "kernelutil_sparc64_hpc_ace_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      t_forcerec                  * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
69      * jnr indices corresponding to data put in the two positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwioffset3;
86     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87     int              vdwjidx0A,vdwjidx0B;
88     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     int              vdwjidx1A,vdwjidx1B;
90     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B;
92     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     int              vdwjidx3A,vdwjidx3B;
94     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
106     real             *charge;
107     int              nvdwtype;
108     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109     int              *vdwtype;
110     real             *vdwparam;
111     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
112     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
113     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114     real             *ewtab;
115     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116     real             rswitch_scalar,d_scalar;
117     _fjsp_v2r8       itab_tmp;
118     _fjsp_v2r8       dummy_mask,cutoff_mask;
119     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
120     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
121     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
122
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
141     ewtab            = fr->ic->tabq_coul_FDV0;
142     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
143     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
148     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
149     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
150     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
153     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
154     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
157     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
158     qq11             = _fjsp_mul_v2r8(iq1,jq1);
159     qq12             = _fjsp_mul_v2r8(iq1,jq2);
160     qq13             = _fjsp_mul_v2r8(iq1,jq3);
161     qq21             = _fjsp_mul_v2r8(iq2,jq1);
162     qq22             = _fjsp_mul_v2r8(iq2,jq2);
163     qq23             = _fjsp_mul_v2r8(iq2,jq3);
164     qq31             = _fjsp_mul_v2r8(iq3,jq1);
165     qq32             = _fjsp_mul_v2r8(iq3,jq2);
166     qq33             = _fjsp_mul_v2r8(iq3,jq3);
167
168     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169     rcutoff_scalar   = fr->rcoulomb;
170     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
171     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
172
173     rswitch_scalar   = fr->rcoulomb_switch;
174     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
175     /* Setup switch parameters */
176     d_scalar         = rcutoff_scalar-rswitch_scalar;
177     d                = gmx_fjsp_set1_v2r8(d_scalar);
178     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
179     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
182     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
184
185     /* Avoid stupid compiler warnings */
186     jnrA = jnrB = 0;
187     j_coord_offsetA = 0;
188     j_coord_offsetB = 0;
189
190     outeriter        = 0;
191     inneriter        = 0;
192
193     /* Start outer loop over neighborlists */
194     for(iidx=0; iidx<nri; iidx++)
195     {
196         /* Load shift vector for this list */
197         i_shift_offset   = DIM*shiftidx[iidx];
198
199         /* Load limits for loop over neighbors */
200         j_index_start    = jindex[iidx];
201         j_index_end      = jindex[iidx+1];
202
203         /* Get outer coordinate index */
204         inr              = iinr[iidx];
205         i_coord_offset   = DIM*inr;
206
207         /* Load i particle coords and add shift vector */
208         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
209                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
210
211         fix0             = _fjsp_setzero_v2r8();
212         fiy0             = _fjsp_setzero_v2r8();
213         fiz0             = _fjsp_setzero_v2r8();
214         fix1             = _fjsp_setzero_v2r8();
215         fiy1             = _fjsp_setzero_v2r8();
216         fiz1             = _fjsp_setzero_v2r8();
217         fix2             = _fjsp_setzero_v2r8();
218         fiy2             = _fjsp_setzero_v2r8();
219         fiz2             = _fjsp_setzero_v2r8();
220         fix3             = _fjsp_setzero_v2r8();
221         fiy3             = _fjsp_setzero_v2r8();
222         fiz3             = _fjsp_setzero_v2r8();
223
224         /* Reset potential sums */
225         velecsum         = _fjsp_setzero_v2r8();
226         vvdwsum          = _fjsp_setzero_v2r8();
227
228         /* Start inner kernel loop */
229         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
230         {
231
232             /* Get j neighbor index, and coordinate index */
233             jnrA             = jjnr[jidx];
234             jnrB             = jjnr[jidx+1];
235             j_coord_offsetA  = DIM*jnrA;
236             j_coord_offsetB  = DIM*jnrB;
237
238             /* load j atom coordinates */
239             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241                                               &jy2,&jz2,&jx3,&jy3,&jz3);
242
243             /* Calculate displacement vector */
244             dx00             = _fjsp_sub_v2r8(ix0,jx0);
245             dy00             = _fjsp_sub_v2r8(iy0,jy0);
246             dz00             = _fjsp_sub_v2r8(iz0,jz0);
247             dx11             = _fjsp_sub_v2r8(ix1,jx1);
248             dy11             = _fjsp_sub_v2r8(iy1,jy1);
249             dz11             = _fjsp_sub_v2r8(iz1,jz1);
250             dx12             = _fjsp_sub_v2r8(ix1,jx2);
251             dy12             = _fjsp_sub_v2r8(iy1,jy2);
252             dz12             = _fjsp_sub_v2r8(iz1,jz2);
253             dx13             = _fjsp_sub_v2r8(ix1,jx3);
254             dy13             = _fjsp_sub_v2r8(iy1,jy3);
255             dz13             = _fjsp_sub_v2r8(iz1,jz3);
256             dx21             = _fjsp_sub_v2r8(ix2,jx1);
257             dy21             = _fjsp_sub_v2r8(iy2,jy1);
258             dz21             = _fjsp_sub_v2r8(iz2,jz1);
259             dx22             = _fjsp_sub_v2r8(ix2,jx2);
260             dy22             = _fjsp_sub_v2r8(iy2,jy2);
261             dz22             = _fjsp_sub_v2r8(iz2,jz2);
262             dx23             = _fjsp_sub_v2r8(ix2,jx3);
263             dy23             = _fjsp_sub_v2r8(iy2,jy3);
264             dz23             = _fjsp_sub_v2r8(iz2,jz3);
265             dx31             = _fjsp_sub_v2r8(ix3,jx1);
266             dy31             = _fjsp_sub_v2r8(iy3,jy1);
267             dz31             = _fjsp_sub_v2r8(iz3,jz1);
268             dx32             = _fjsp_sub_v2r8(ix3,jx2);
269             dy32             = _fjsp_sub_v2r8(iy3,jy2);
270             dz32             = _fjsp_sub_v2r8(iz3,jz2);
271             dx33             = _fjsp_sub_v2r8(ix3,jx3);
272             dy33             = _fjsp_sub_v2r8(iy3,jy3);
273             dz33             = _fjsp_sub_v2r8(iz3,jz3);
274
275             /* Calculate squared distance and things based on it */
276             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
277             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
278             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
279             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
280             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
281             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
282             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
283             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
284             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
285             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
286
287             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
288             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
289             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
290             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
291             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
292             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
293             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
294             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
295             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
296             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
297
298             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
299             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
300             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
301             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
302             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
303             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
304             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
305             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
306             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
307             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
308
309             fjx0             = _fjsp_setzero_v2r8();
310             fjy0             = _fjsp_setzero_v2r8();
311             fjz0             = _fjsp_setzero_v2r8();
312             fjx1             = _fjsp_setzero_v2r8();
313             fjy1             = _fjsp_setzero_v2r8();
314             fjz1             = _fjsp_setzero_v2r8();
315             fjx2             = _fjsp_setzero_v2r8();
316             fjy2             = _fjsp_setzero_v2r8();
317             fjz2             = _fjsp_setzero_v2r8();
318             fjx3             = _fjsp_setzero_v2r8();
319             fjy3             = _fjsp_setzero_v2r8();
320             fjz3             = _fjsp_setzero_v2r8();
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
327             {
328
329             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
330
331             /* LENNARD-JONES DISPERSION/REPULSION */
332
333             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
334             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
335             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
336             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
337             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
338
339             d                = _fjsp_sub_v2r8(r00,rswitch);
340             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
341             d2               = _fjsp_mul_v2r8(d,d);
342             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
343
344             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
345
346             /* Evaluate switch function */
347             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
349             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
350             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
351
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
354             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
355
356             fscal            = fvdw;
357
358             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
359
360             /* Update vectorial force */
361             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
362             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
363             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
364             
365             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
366             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
367             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
368
369             }
370
371             /**************************
372              * CALCULATE INTERACTIONS *
373              **************************/
374
375             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
376             {
377
378             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
379
380             /* EWALD ELECTROSTATICS */
381
382             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
383             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
384             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
385             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
386             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
387
388             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
389             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
390             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
391             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
392             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
393             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
394             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
395             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
396             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
397             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
398
399             d                = _fjsp_sub_v2r8(r11,rswitch);
400             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
401             d2               = _fjsp_mul_v2r8(d,d);
402             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
403
404             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
405
406             /* Evaluate switch function */
407             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
408             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
409             velec            = _fjsp_mul_v2r8(velec,sw);
410             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
411
412             /* Update potential sum for this i atom from the interaction with this j atom. */
413             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
414             velecsum         = _fjsp_add_v2r8(velecsum,velec);
415
416             fscal            = felec;
417
418             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
419
420             /* Update vectorial force */
421             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
422             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
423             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
424             
425             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
426             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
427             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
428
429             }
430
431             /**************************
432              * CALCULATE INTERACTIONS *
433              **************************/
434
435             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
436             {
437
438             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
439
440             /* EWALD ELECTROSTATICS */
441
442             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
443             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
444             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
445             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
446             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
447
448             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
449             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
450             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
451             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
452             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
453             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
454             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
455             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
456             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
457             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
458
459             d                = _fjsp_sub_v2r8(r12,rswitch);
460             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
461             d2               = _fjsp_mul_v2r8(d,d);
462             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
463
464             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
465
466             /* Evaluate switch function */
467             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
468             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
469             velec            = _fjsp_mul_v2r8(velec,sw);
470             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
471
472             /* Update potential sum for this i atom from the interaction with this j atom. */
473             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
474             velecsum         = _fjsp_add_v2r8(velecsum,velec);
475
476             fscal            = felec;
477
478             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
479
480             /* Update vectorial force */
481             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
482             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
483             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
484             
485             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
486             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
487             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
488
489             }
490
491             /**************************
492              * CALCULATE INTERACTIONS *
493              **************************/
494
495             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
496             {
497
498             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
499
500             /* EWALD ELECTROSTATICS */
501
502             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
503             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
504             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
505             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
506             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
507
508             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
509             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
510             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
511             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
512             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
513             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
514             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
515             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
516             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
517             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
518
519             d                = _fjsp_sub_v2r8(r13,rswitch);
520             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
521             d2               = _fjsp_mul_v2r8(d,d);
522             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
523
524             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
525
526             /* Evaluate switch function */
527             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
528             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
529             velec            = _fjsp_mul_v2r8(velec,sw);
530             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
531
532             /* Update potential sum for this i atom from the interaction with this j atom. */
533             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
534             velecsum         = _fjsp_add_v2r8(velecsum,velec);
535
536             fscal            = felec;
537
538             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
539
540             /* Update vectorial force */
541             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
542             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
543             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
544             
545             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
546             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
547             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
548
549             }
550
551             /**************************
552              * CALCULATE INTERACTIONS *
553              **************************/
554
555             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
556             {
557
558             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
559
560             /* EWALD ELECTROSTATICS */
561
562             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
564             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
565             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
566             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
567
568             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
569             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
570             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
571             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
572             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
573             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
574             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
575             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
576             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
577             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
578
579             d                = _fjsp_sub_v2r8(r21,rswitch);
580             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
581             d2               = _fjsp_mul_v2r8(d,d);
582             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
583
584             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
585
586             /* Evaluate switch function */
587             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
588             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
589             velec            = _fjsp_mul_v2r8(velec,sw);
590             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
591
592             /* Update potential sum for this i atom from the interaction with this j atom. */
593             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
594             velecsum         = _fjsp_add_v2r8(velecsum,velec);
595
596             fscal            = felec;
597
598             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
599
600             /* Update vectorial force */
601             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
602             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
603             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
604             
605             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
606             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
607             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
608
609             }
610
611             /**************************
612              * CALCULATE INTERACTIONS *
613              **************************/
614
615             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
616             {
617
618             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
619
620             /* EWALD ELECTROSTATICS */
621
622             /* Calculate the Ewald table index by multiplying r by the scale and truncating to an integer */
623             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
624             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
625             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
626             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
627
628             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
629             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
630             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
631             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
632             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
633             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
634             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
635             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
636             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
637             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
638
639             d                = _fjsp_sub_v2r8(r22,rswitch);
640             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
641             d2               = _fjsp_mul_v2r8(d,d);
642             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
643
644             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
645
646             /* Evaluate switch function */
647             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
648             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
649             velec            = _fjsp_mul_v2r8(velec,sw);
650             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
651
652             /* Update potential sum for this i atom from the interaction with this j atom. */
653             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
654             velecsum         = _fjsp_add_v2r8(velecsum,velec);
655
656             fscal            = felec;
657
658             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
659
660             /* Update vectorial force */
661             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
662             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
663             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
664             
665             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
666             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
667             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
668
669             }
670
671             /**************************
672              * CALCULATE INTERACTIONS *
673              **************************/
674
675             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
676             {
677
678             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
679
680             /* EWALD ELECTROSTATICS */
681
682             /* Calculate the Ewald table index by multiplying r by the scale and truncating to an integer */
683             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
684             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
685             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
686             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
687
688             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
689             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
690             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
691             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
692             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
693             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
694             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
695             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
696             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
697             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
698
699             d                = _fjsp_sub_v2r8(r23,rswitch);
700             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
701             d2               = _fjsp_mul_v2r8(d,d);
702             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
703
704             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
705
706             /* Evaluate switch function */
707             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
708             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
709             velec            = _fjsp_mul_v2r8(velec,sw);
710             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
711
712             /* Update potential sum for this i atom from the interaction with this j atom. */
713             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
714             velecsum         = _fjsp_add_v2r8(velecsum,velec);
715
716             fscal            = felec;
717
718             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
719
720             /* Update vectorial force */
721             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
722             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
723             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
724             
725             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
726             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
727             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
728
729             }
730
731             /**************************
732              * CALCULATE INTERACTIONS *
733              **************************/
734
735             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
736             {
737
738             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
739
740             /* EWALD ELECTROSTATICS */
741
742             /* Calculate the Ewald table index by multiplying r by the scale and truncating to an integer */
743             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
744             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
745             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
746             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
747
748             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
749             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
750             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
751             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
752             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
753             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
754             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
755             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
756             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
757             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
758
759             d                = _fjsp_sub_v2r8(r31,rswitch);
760             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
761             d2               = _fjsp_mul_v2r8(d,d);
762             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
763
764             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
765
766             /* Evaluate switch function */
767             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
768             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
769             velec            = _fjsp_mul_v2r8(velec,sw);
770             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
771
772             /* Update potential sum for this i atom from the interaction with this j atom. */
773             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
774             velecsum         = _fjsp_add_v2r8(velecsum,velec);
775
776             fscal            = felec;
777
778             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
779
780             /* Update vectorial force */
781             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
782             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
783             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
784             
785             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
786             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
787             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
788
789             }
790
791             /**************************
792              * CALCULATE INTERACTIONS *
793              **************************/
794
795             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
796             {
797
798             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
799
800             /* EWALD ELECTROSTATICS */
801
802             /* Calculate the Ewald table index by multiplying r by the scale and truncating to an integer */
803             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
804             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
805             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
806             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
807
808             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
809             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
810             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
811             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
812             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
813             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
814             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
815             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
816             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
817             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
818
819             d                = _fjsp_sub_v2r8(r32,rswitch);
820             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
821             d2               = _fjsp_mul_v2r8(d,d);
822             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
823
824             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
825
826             /* Evaluate switch function */
827             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
828             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
829             velec            = _fjsp_mul_v2r8(velec,sw);
830             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
831
832             /* Update potential sum for this i atom from the interaction with this j atom. */
833             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
834             velecsum         = _fjsp_add_v2r8(velecsum,velec);
835
836             fscal            = felec;
837
838             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
839
840             /* Update vectorial force */
841             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
842             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
843             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
844             
845             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
846             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
847             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
848
849             }
850
851             /**************************
852              * CALCULATE INTERACTIONS *
853              **************************/
854
855             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
856             {
857
858             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
859
860             /* EWALD ELECTROSTATICS */
861
862             /* Calculate the Ewald table index by multiplying r by the scale and truncating to an integer */
863             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
864             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
865             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
866             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
867
868             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
869             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
870             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
871             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
872             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
873             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
874             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
875             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
876             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
877             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
878
879             d                = _fjsp_sub_v2r8(r33,rswitch);
880             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
881             d2               = _fjsp_mul_v2r8(d,d);
882             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
883
884             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
885
886             /* Evaluate switch function */
887             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
888             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
889             velec            = _fjsp_mul_v2r8(velec,sw);
890             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
891
892             /* Update potential sum for this i atom from the interaction with this j atom. */
893             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
894             velecsum         = _fjsp_add_v2r8(velecsum,velec);
895
896             fscal            = felec;
897
898             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
899
900             /* Update vectorial force */
901             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
902             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
903             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
904             
905             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
906             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
907             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
908
909             }
910
911             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
912
913             /* Inner loop uses 677 flops */
914         }
915
916         if(jidx<j_index_end)
917         {
918
919             jnrA             = jjnr[jidx];
920             j_coord_offsetA  = DIM*jnrA;
921
922             /* load j atom coordinates */
923             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
924                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
925                                               &jy2,&jz2,&jx3,&jy3,&jz3);
926
927             /* Calculate displacement vector */
928             dx00             = _fjsp_sub_v2r8(ix0,jx0);
929             dy00             = _fjsp_sub_v2r8(iy0,jy0);
930             dz00             = _fjsp_sub_v2r8(iz0,jz0);
931             dx11             = _fjsp_sub_v2r8(ix1,jx1);
932             dy11             = _fjsp_sub_v2r8(iy1,jy1);
933             dz11             = _fjsp_sub_v2r8(iz1,jz1);
934             dx12             = _fjsp_sub_v2r8(ix1,jx2);
935             dy12             = _fjsp_sub_v2r8(iy1,jy2);
936             dz12             = _fjsp_sub_v2r8(iz1,jz2);
937             dx13             = _fjsp_sub_v2r8(ix1,jx3);
938             dy13             = _fjsp_sub_v2r8(iy1,jy3);
939             dz13             = _fjsp_sub_v2r8(iz1,jz3);
940             dx21             = _fjsp_sub_v2r8(ix2,jx1);
941             dy21             = _fjsp_sub_v2r8(iy2,jy1);
942             dz21             = _fjsp_sub_v2r8(iz2,jz1);
943             dx22             = _fjsp_sub_v2r8(ix2,jx2);
944             dy22             = _fjsp_sub_v2r8(iy2,jy2);
945             dz22             = _fjsp_sub_v2r8(iz2,jz2);
946             dx23             = _fjsp_sub_v2r8(ix2,jx3);
947             dy23             = _fjsp_sub_v2r8(iy2,jy3);
948             dz23             = _fjsp_sub_v2r8(iz2,jz3);
949             dx31             = _fjsp_sub_v2r8(ix3,jx1);
950             dy31             = _fjsp_sub_v2r8(iy3,jy1);
951             dz31             = _fjsp_sub_v2r8(iz3,jz1);
952             dx32             = _fjsp_sub_v2r8(ix3,jx2);
953             dy32             = _fjsp_sub_v2r8(iy3,jy2);
954             dz32             = _fjsp_sub_v2r8(iz3,jz2);
955             dx33             = _fjsp_sub_v2r8(ix3,jx3);
956             dy33             = _fjsp_sub_v2r8(iy3,jy3);
957             dz33             = _fjsp_sub_v2r8(iz3,jz3);
958
959             /* Calculate squared distance and things based on it */
960             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
961             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
962             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
963             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
964             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
965             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
966             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
967             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
968             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
969             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
970
971             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
972             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
973             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
974             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
975             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
976             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
977             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
978             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
979             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
980             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
981
982             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
983             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
984             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
985             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
986             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
987             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
988             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
989             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
990             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
991             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
992
993             fjx0             = _fjsp_setzero_v2r8();
994             fjy0             = _fjsp_setzero_v2r8();
995             fjz0             = _fjsp_setzero_v2r8();
996             fjx1             = _fjsp_setzero_v2r8();
997             fjy1             = _fjsp_setzero_v2r8();
998             fjz1             = _fjsp_setzero_v2r8();
999             fjx2             = _fjsp_setzero_v2r8();
1000             fjy2             = _fjsp_setzero_v2r8();
1001             fjz2             = _fjsp_setzero_v2r8();
1002             fjx3             = _fjsp_setzero_v2r8();
1003             fjy3             = _fjsp_setzero_v2r8();
1004             fjz3             = _fjsp_setzero_v2r8();
1005
1006             /**************************
1007              * CALCULATE INTERACTIONS *
1008              **************************/
1009
1010             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1011             {
1012
1013             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1014
1015             /* LENNARD-JONES DISPERSION/REPULSION */
1016
1017             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1018             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
1019             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1020             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1021             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1022
1023             d                = _fjsp_sub_v2r8(r00,rswitch);
1024             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1025             d2               = _fjsp_mul_v2r8(d,d);
1026             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1027
1028             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1029
1030             /* Evaluate switch function */
1031             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1032             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1033             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
1034             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1035
1036             /* Update potential sum for this i atom from the interaction with this j atom. */
1037             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
1038             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
1039             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
1040
1041             fscal            = fvdw;
1042
1043             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1044
1045             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1046
1047             /* Update vectorial force */
1048             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1049             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1050             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1051             
1052             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1053             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1054             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1055
1056             }
1057
1058             /**************************
1059              * CALCULATE INTERACTIONS *
1060              **************************/
1061
1062             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1063             {
1064
1065             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1066
1067             /* EWALD ELECTROSTATICS */
1068
1069             /* Calculate the Ewald table index by multiplying r by the scale and truncating to an integer */
1070             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1071             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1072             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1073             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1074
1075             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1076             ewtabD           = _fjsp_setzero_v2r8();
1077             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1078             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1079             ewtabFn          = _fjsp_setzero_v2r8();
1080             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1081             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1082             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1083             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1084             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1085
1086             d                = _fjsp_sub_v2r8(r11,rswitch);
1087             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1088             d2               = _fjsp_mul_v2r8(d,d);
1089             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1090
1091             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1092
1093             /* Evaluate switch function */
1094             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1095             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
1096             velec            = _fjsp_mul_v2r8(velec,sw);
1097             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1098
1099             /* Update potential sum for this i atom from the interaction with this j atom. */
1100             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1101             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1102             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1103
1104             fscal            = felec;
1105
1106             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1107
1108             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1109
1110             /* Update vectorial force */
1111             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1112             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1113             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1114             
1115             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1116             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1117             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1118
1119             }
1120
1121             /**************************
1122              * CALCULATE INTERACTIONS *
1123              **************************/
1124
1125             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1126             {
1127
1128             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1129
1130             /* EWALD ELECTROSTATICS */
1131
1132             /* Calculate the Ewald table index by multiplying r by the scale and truncating to an integer */
1133             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1134             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1135             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1136             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1137
1138             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1139             ewtabD           = _fjsp_setzero_v2r8();
1140             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1141             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1142             ewtabFn          = _fjsp_setzero_v2r8();
1143             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1144             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1145             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1146             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
1147             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1148
1149             d                = _fjsp_sub_v2r8(r12,rswitch);
1150             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1151             d2               = _fjsp_mul_v2r8(d,d);
1152             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1153
1154             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1155
1156             /* Evaluate switch function */
1157             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1158             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
1159             velec            = _fjsp_mul_v2r8(velec,sw);
1160             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1161
1162             /* Update potential sum for this i atom from the interaction with this j atom. */
1163             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1164             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1165             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1166
1167             fscal            = felec;
1168
1169             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1170
1171             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1172
1173             /* Update vectorial force */
1174             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1175             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1176             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1177             
1178             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1179             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1180             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1181
1182             }
1183
1184             /**************************
1185              * CALCULATE INTERACTIONS *
1186              **************************/
1187
1188             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
1189             {
1190
1191             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
1192
1193             /* EWALD ELECTROSTATICS */
1194
1195             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
1197             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1198             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1199             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1200
1201             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1202             ewtabD           = _fjsp_setzero_v2r8();
1203             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1204             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1205             ewtabFn          = _fjsp_setzero_v2r8();
1206             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1207             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1208             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1209             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
1210             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
1211
1212             d                = _fjsp_sub_v2r8(r13,rswitch);
1213             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1214             d2               = _fjsp_mul_v2r8(d,d);
1215             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1216
1217             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1218
1219             /* Evaluate switch function */
1220             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1221             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
1222             velec            = _fjsp_mul_v2r8(velec,sw);
1223             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
1224
1225             /* Update potential sum for this i atom from the interaction with this j atom. */
1226             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1227             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1228             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1229
1230             fscal            = felec;
1231
1232             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1233
1234             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1235
1236             /* Update vectorial force */
1237             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
1238             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
1239             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
1240             
1241             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
1242             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
1243             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
1244
1245             }
1246
1247             /**************************
1248              * CALCULATE INTERACTIONS *
1249              **************************/
1250
1251             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1252             {
1253
1254             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1255
1256             /* EWALD ELECTROSTATICS */
1257
1258             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1259             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1260             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1261             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1262             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1263
1264             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1265             ewtabD           = _fjsp_setzero_v2r8();
1266             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1267             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1268             ewtabFn          = _fjsp_setzero_v2r8();
1269             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1270             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1271             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1272             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
1273             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1274
1275             d                = _fjsp_sub_v2r8(r21,rswitch);
1276             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1277             d2               = _fjsp_mul_v2r8(d,d);
1278             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1279
1280             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1281
1282             /* Evaluate switch function */
1283             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1284             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
1285             velec            = _fjsp_mul_v2r8(velec,sw);
1286             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1287
1288             /* Update potential sum for this i atom from the interaction with this j atom. */
1289             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1290             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1291             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1292
1293             fscal            = felec;
1294
1295             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1296
1297             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1298
1299             /* Update vectorial force */
1300             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1301             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1302             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1303             
1304             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1305             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1306             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1307
1308             }
1309
1310             /**************************
1311              * CALCULATE INTERACTIONS *
1312              **************************/
1313
1314             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1315             {
1316
1317             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1318
1319             /* EWALD ELECTROSTATICS */
1320
1321             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1322             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1323             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1324             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1325             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1326
1327             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1328             ewtabD           = _fjsp_setzero_v2r8();
1329             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1330             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1331             ewtabFn          = _fjsp_setzero_v2r8();
1332             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1333             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1334             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1335             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1336             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1337
1338             d                = _fjsp_sub_v2r8(r22,rswitch);
1339             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1340             d2               = _fjsp_mul_v2r8(d,d);
1341             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1342
1343             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1344
1345             /* Evaluate switch function */
1346             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1347             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
1348             velec            = _fjsp_mul_v2r8(velec,sw);
1349             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1350
1351             /* Update potential sum for this i atom from the interaction with this j atom. */
1352             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1353             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1354             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1355
1356             fscal            = felec;
1357
1358             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1359
1360             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1361
1362             /* Update vectorial force */
1363             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1364             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1365             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1366             
1367             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1368             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1369             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1370
1371             }
1372
1373             /**************************
1374              * CALCULATE INTERACTIONS *
1375              **************************/
1376
1377             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
1378             {
1379
1380             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
1381
1382             /* EWALD ELECTROSTATICS */
1383
1384             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1385             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
1386             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1387             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1388             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1389
1390             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1391             ewtabD           = _fjsp_setzero_v2r8();
1392             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1393             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1394             ewtabFn          = _fjsp_setzero_v2r8();
1395             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1396             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1397             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1398             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
1399             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1400
1401             d                = _fjsp_sub_v2r8(r23,rswitch);
1402             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1403             d2               = _fjsp_mul_v2r8(d,d);
1404             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1405
1406             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1407
1408             /* Evaluate switch function */
1409             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1410             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
1411             velec            = _fjsp_mul_v2r8(velec,sw);
1412             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
1413
1414             /* Update potential sum for this i atom from the interaction with this j atom. */
1415             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1416             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1417             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1418
1419             fscal            = felec;
1420
1421             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1422
1423             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1424
1425             /* Update vectorial force */
1426             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
1427             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1428             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1429             
1430             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1431             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1432             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1433
1434             }
1435
1436             /**************************
1437              * CALCULATE INTERACTIONS *
1438              **************************/
1439
1440             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
1441             {
1442
1443             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
1444
1445             /* EWALD ELECTROSTATICS */
1446
1447             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1448             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
1449             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1450             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1451             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1452
1453             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1454             ewtabD           = _fjsp_setzero_v2r8();
1455             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1456             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1457             ewtabFn          = _fjsp_setzero_v2r8();
1458             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1459             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1460             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1461             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
1462             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1463
1464             d                = _fjsp_sub_v2r8(r31,rswitch);
1465             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1466             d2               = _fjsp_mul_v2r8(d,d);
1467             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1468
1469             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1470
1471             /* Evaluate switch function */
1472             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1473             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
1474             velec            = _fjsp_mul_v2r8(velec,sw);
1475             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
1476
1477             /* Update potential sum for this i atom from the interaction with this j atom. */
1478             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1479             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1480             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1481
1482             fscal            = felec;
1483
1484             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1485
1486             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1487
1488             /* Update vectorial force */
1489             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
1490             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1491             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1492             
1493             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1494             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1495             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1496
1497             }
1498
1499             /**************************
1500              * CALCULATE INTERACTIONS *
1501              **************************/
1502
1503             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
1504             {
1505
1506             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
1507
1508             /* EWALD ELECTROSTATICS */
1509
1510             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1511             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
1512             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1513             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1514             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1515
1516             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1517             ewtabD           = _fjsp_setzero_v2r8();
1518             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1519             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1520             ewtabFn          = _fjsp_setzero_v2r8();
1521             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1522             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1523             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1524             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
1525             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1526
1527             d                = _fjsp_sub_v2r8(r32,rswitch);
1528             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1529             d2               = _fjsp_mul_v2r8(d,d);
1530             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1531
1532             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1533
1534             /* Evaluate switch function */
1535             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1536             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
1537             velec            = _fjsp_mul_v2r8(velec,sw);
1538             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
1539
1540             /* Update potential sum for this i atom from the interaction with this j atom. */
1541             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1542             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1543             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1544
1545             fscal            = felec;
1546
1547             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1548
1549             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1550
1551             /* Update vectorial force */
1552             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
1553             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1554             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1555             
1556             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1557             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1558             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1559
1560             }
1561
1562             /**************************
1563              * CALCULATE INTERACTIONS *
1564              **************************/
1565
1566             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
1567             {
1568
1569             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
1570
1571             /* EWALD ELECTROSTATICS */
1572
1573             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1574             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
1575             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1576             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1577             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1578
1579             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1580             ewtabD           = _fjsp_setzero_v2r8();
1581             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1582             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1583             ewtabFn          = _fjsp_setzero_v2r8();
1584             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1585             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1586             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1587             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
1588             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1589
1590             d                = _fjsp_sub_v2r8(r33,rswitch);
1591             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1592             d2               = _fjsp_mul_v2r8(d,d);
1593             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1594
1595             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1596
1597             /* Evaluate switch function */
1598             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1599             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
1600             velec            = _fjsp_mul_v2r8(velec,sw);
1601             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
1602
1603             /* Update potential sum for this i atom from the interaction with this j atom. */
1604             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1605             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1606             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1607
1608             fscal            = felec;
1609
1610             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1611
1612             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1613
1614             /* Update vectorial force */
1615             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
1616             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1617             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1618             
1619             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1620             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1621             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1622
1623             }
1624
1625             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1626
1627             /* Inner loop uses 677 flops */
1628         }
1629
1630         /* End of innermost loop */
1631
1632         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1633                                               f+i_coord_offset,fshift+i_shift_offset);
1634
1635         ggid                        = gid[iidx];
1636         /* Update potential energies */
1637         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1638         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1639
1640         /* Increment number of inner iterations */
1641         inneriter                  += j_index_end - j_index_start;
1642
1643         /* Outer loop uses 26 flops */
1644     }
1645
1646     /* Increment number of outer iterations */
1647     outeriter        += nri;
1648
1649     /* Update outer/inner flops */
1650
1651     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
1652 }
1653 /*
1654  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
1655  * Electrostatics interaction: Ewald
1656  * VdW interaction:            LennardJones
1657  * Geometry:                   Water4-Water4
1658  * Calculate force/pot:        Force
1659  */
1660 void
1661 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
1662                     (t_nblist                    * gmx_restrict       nlist,
1663                      rvec                        * gmx_restrict          xx,
1664                      rvec                        * gmx_restrict          ff,
1665                      t_forcerec                  * gmx_restrict          fr,
1666                      t_mdatoms                   * gmx_restrict     mdatoms,
1667                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1668                      t_nrnb                      * gmx_restrict        nrnb)
1669 {
1670     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1671      * just 0 for non-waters.
1672      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1673      * jnr indices corresponding to data put in the two positions in the SIMD register.
1674      */
1675     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1676     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1677     int              jnrA,jnrB;
1678     int              j_coord_offsetA,j_coord_offsetB;
1679     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1680     real             rcutoff_scalar;
1681     real             *shiftvec,*fshift,*x,*f;
1682     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1683     int              vdwioffset0;
1684     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1685     int              vdwioffset1;
1686     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1687     int              vdwioffset2;
1688     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1689     int              vdwioffset3;
1690     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1691     int              vdwjidx0A,vdwjidx0B;
1692     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1693     int              vdwjidx1A,vdwjidx1B;
1694     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1695     int              vdwjidx2A,vdwjidx2B;
1696     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1697     int              vdwjidx3A,vdwjidx3B;
1698     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1699     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1700     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1701     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1702     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1703     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1704     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1705     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1706     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1707     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1708     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1709     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
1710     real             *charge;
1711     int              nvdwtype;
1712     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1713     int              *vdwtype;
1714     real             *vdwparam;
1715     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
1716     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1717     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1718     real             *ewtab;
1719     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1720     real             rswitch_scalar,d_scalar;
1721     _fjsp_v2r8       itab_tmp;
1722     _fjsp_v2r8       dummy_mask,cutoff_mask;
1723     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
1724     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
1725     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1726
1727     x                = xx[0];
1728     f                = ff[0];
1729
1730     nri              = nlist->nri;
1731     iinr             = nlist->iinr;
1732     jindex           = nlist->jindex;
1733     jjnr             = nlist->jjnr;
1734     shiftidx         = nlist->shift;
1735     gid              = nlist->gid;
1736     shiftvec         = fr->shift_vec[0];
1737     fshift           = fr->fshift[0];
1738     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
1739     charge           = mdatoms->chargeA;
1740     nvdwtype         = fr->ntype;
1741     vdwparam         = fr->nbfp;
1742     vdwtype          = mdatoms->typeA;
1743
1744     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1745     ewtab            = fr->ic->tabq_coul_FDV0;
1746     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1747     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1748
1749     /* Setup water-specific parameters */
1750     inr              = nlist->iinr[0];
1751     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1752     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1753     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
1754     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1755
1756     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
1757     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
1758     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
1759     vdwjidx0A        = 2*vdwtype[inr+0];
1760     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1761     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1762     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1763     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1764     qq13             = _fjsp_mul_v2r8(iq1,jq3);
1765     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1766     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1767     qq23             = _fjsp_mul_v2r8(iq2,jq3);
1768     qq31             = _fjsp_mul_v2r8(iq3,jq1);
1769     qq32             = _fjsp_mul_v2r8(iq3,jq2);
1770     qq33             = _fjsp_mul_v2r8(iq3,jq3);
1771
1772     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1773     rcutoff_scalar   = fr->rcoulomb;
1774     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
1775     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
1776
1777     rswitch_scalar   = fr->rcoulomb_switch;
1778     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
1779     /* Setup switch parameters */
1780     d_scalar         = rcutoff_scalar-rswitch_scalar;
1781     d                = gmx_fjsp_set1_v2r8(d_scalar);
1782     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
1783     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1784     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1785     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
1786     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1787     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1788
1789     /* Avoid stupid compiler warnings */
1790     jnrA = jnrB = 0;
1791     j_coord_offsetA = 0;
1792     j_coord_offsetB = 0;
1793
1794     outeriter        = 0;
1795     inneriter        = 0;
1796
1797     /* Start outer loop over neighborlists */
1798     for(iidx=0; iidx<nri; iidx++)
1799     {
1800         /* Load shift vector for this list */
1801         i_shift_offset   = DIM*shiftidx[iidx];
1802
1803         /* Load limits for loop over neighbors */
1804         j_index_start    = jindex[iidx];
1805         j_index_end      = jindex[iidx+1];
1806
1807         /* Get outer coordinate index */
1808         inr              = iinr[iidx];
1809         i_coord_offset   = DIM*inr;
1810
1811         /* Load i particle coords and add shift vector */
1812         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1813                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1814
1815         fix0             = _fjsp_setzero_v2r8();
1816         fiy0             = _fjsp_setzero_v2r8();
1817         fiz0             = _fjsp_setzero_v2r8();
1818         fix1             = _fjsp_setzero_v2r8();
1819         fiy1             = _fjsp_setzero_v2r8();
1820         fiz1             = _fjsp_setzero_v2r8();
1821         fix2             = _fjsp_setzero_v2r8();
1822         fiy2             = _fjsp_setzero_v2r8();
1823         fiz2             = _fjsp_setzero_v2r8();
1824         fix3             = _fjsp_setzero_v2r8();
1825         fiy3             = _fjsp_setzero_v2r8();
1826         fiz3             = _fjsp_setzero_v2r8();
1827
1828         /* Start inner kernel loop */
1829         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1830         {
1831
1832             /* Get j neighbor index, and coordinate index */
1833             jnrA             = jjnr[jidx];
1834             jnrB             = jjnr[jidx+1];
1835             j_coord_offsetA  = DIM*jnrA;
1836             j_coord_offsetB  = DIM*jnrB;
1837
1838             /* load j atom coordinates */
1839             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1840                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1841                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1842
1843             /* Calculate displacement vector */
1844             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1845             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1846             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1847             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1848             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1849             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1850             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1851             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1852             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1853             dx13             = _fjsp_sub_v2r8(ix1,jx3);
1854             dy13             = _fjsp_sub_v2r8(iy1,jy3);
1855             dz13             = _fjsp_sub_v2r8(iz1,jz3);
1856             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1857             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1858             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1859             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1860             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1861             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1862             dx23             = _fjsp_sub_v2r8(ix2,jx3);
1863             dy23             = _fjsp_sub_v2r8(iy2,jy3);
1864             dz23             = _fjsp_sub_v2r8(iz2,jz3);
1865             dx31             = _fjsp_sub_v2r8(ix3,jx1);
1866             dy31             = _fjsp_sub_v2r8(iy3,jy1);
1867             dz31             = _fjsp_sub_v2r8(iz3,jz1);
1868             dx32             = _fjsp_sub_v2r8(ix3,jx2);
1869             dy32             = _fjsp_sub_v2r8(iy3,jy2);
1870             dz32             = _fjsp_sub_v2r8(iz3,jz2);
1871             dx33             = _fjsp_sub_v2r8(ix3,jx3);
1872             dy33             = _fjsp_sub_v2r8(iy3,jy3);
1873             dz33             = _fjsp_sub_v2r8(iz3,jz3);
1874
1875             /* Calculate squared distance and things based on it */
1876             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1877             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1878             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1879             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1880             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1881             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1882             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1883             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1884             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1885             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1886
1887             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1888             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1889             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1890             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
1891             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1892             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1893             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
1894             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
1895             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
1896             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
1897
1898             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1899             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1900             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1901             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
1902             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1903             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1904             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
1905             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
1906             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
1907             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
1908
1909             fjx0             = _fjsp_setzero_v2r8();
1910             fjy0             = _fjsp_setzero_v2r8();
1911             fjz0             = _fjsp_setzero_v2r8();
1912             fjx1             = _fjsp_setzero_v2r8();
1913             fjy1             = _fjsp_setzero_v2r8();
1914             fjz1             = _fjsp_setzero_v2r8();
1915             fjx2             = _fjsp_setzero_v2r8();
1916             fjy2             = _fjsp_setzero_v2r8();
1917             fjz2             = _fjsp_setzero_v2r8();
1918             fjx3             = _fjsp_setzero_v2r8();
1919             fjy3             = _fjsp_setzero_v2r8();
1920             fjz3             = _fjsp_setzero_v2r8();
1921
1922             /**************************
1923              * CALCULATE INTERACTIONS *
1924              **************************/
1925
1926             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1927             {
1928
1929             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1930
1931             /* LENNARD-JONES DISPERSION/REPULSION */
1932
1933             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1934             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
1935             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1936             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1937             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1938
1939             d                = _fjsp_sub_v2r8(r00,rswitch);
1940             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1941             d2               = _fjsp_mul_v2r8(d,d);
1942             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1943
1944             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1945
1946             /* Evaluate switch function */
1947             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1948             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1949             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1950
1951             fscal            = fvdw;
1952
1953             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1954
1955             /* Update vectorial force */
1956             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1957             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1958             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1959             
1960             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1961             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1962             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1963
1964             }
1965
1966             /**************************
1967              * CALCULATE INTERACTIONS *
1968              **************************/
1969
1970             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1971             {
1972
1973             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1974
1975             /* EWALD ELECTROSTATICS */
1976
1977             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1978             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1979             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1980             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1981             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1982
1983             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1984             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1985             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1986             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1987             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1988             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1989             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1990             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1991             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1992             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1993
1994             d                = _fjsp_sub_v2r8(r11,rswitch);
1995             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1996             d2               = _fjsp_mul_v2r8(d,d);
1997             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1998
1999             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2000
2001             /* Evaluate switch function */
2002             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2003             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2004             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2005
2006             fscal            = felec;
2007
2008             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2009
2010             /* Update vectorial force */
2011             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
2012             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2013             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2014             
2015             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2016             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2017             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2018
2019             }
2020
2021             /**************************
2022              * CALCULATE INTERACTIONS *
2023              **************************/
2024
2025             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2026             {
2027
2028             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
2029
2030             /* EWALD ELECTROSTATICS */
2031
2032             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2033             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
2034             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2035             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2036             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2037
2038             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2039             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2040             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2041             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2042             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2043             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2044             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2045             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2046             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2047             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2048
2049             d                = _fjsp_sub_v2r8(r12,rswitch);
2050             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2051             d2               = _fjsp_mul_v2r8(d,d);
2052             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2053
2054             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2055
2056             /* Evaluate switch function */
2057             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2058             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2059             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2060
2061             fscal            = felec;
2062
2063             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2064
2065             /* Update vectorial force */
2066             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
2067             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2068             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2069             
2070             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2071             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2072             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2073
2074             }
2075
2076             /**************************
2077              * CALCULATE INTERACTIONS *
2078              **************************/
2079
2080             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
2081             {
2082
2083             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
2084
2085             /* EWALD ELECTROSTATICS */
2086
2087             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2088             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
2089             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2090             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2091             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2092
2093             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2094             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2095             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2096             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2097             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2098             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2099             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2100             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2101             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
2102             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
2103
2104             d                = _fjsp_sub_v2r8(r13,rswitch);
2105             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2106             d2               = _fjsp_mul_v2r8(d,d);
2107             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2108
2109             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2110
2111             /* Evaluate switch function */
2112             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2113             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
2114             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
2115
2116             fscal            = felec;
2117
2118             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2119
2120             /* Update vectorial force */
2121             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
2122             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
2123             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
2124             
2125             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
2126             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
2127             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
2128
2129             }
2130
2131             /**************************
2132              * CALCULATE INTERACTIONS *
2133              **************************/
2134
2135             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2136             {
2137
2138             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2139
2140             /* EWALD ELECTROSTATICS */
2141
2142             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2143             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2144             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2145             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2146             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2147
2148             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2149             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2150             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2151             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2152             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2153             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2154             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2155             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2156             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2157             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2158
2159             d                = _fjsp_sub_v2r8(r21,rswitch);
2160             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2161             d2               = _fjsp_mul_v2r8(d,d);
2162             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2163
2164             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2165
2166             /* Evaluate switch function */
2167             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2168             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2169             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2170
2171             fscal            = felec;
2172
2173             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2174
2175             /* Update vectorial force */
2176             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2177             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2178             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2179             
2180             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2181             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2182             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2183
2184             }
2185
2186             /**************************
2187              * CALCULATE INTERACTIONS *
2188              **************************/
2189
2190             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2191             {
2192
2193             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2194
2195             /* EWALD ELECTROSTATICS */
2196
2197             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2198             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2199             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2200             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2201             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2202
2203             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2204             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2205             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2206             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2207             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2208             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2209             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2210             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2211             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2212             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2213
2214             d                = _fjsp_sub_v2r8(r22,rswitch);
2215             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2216             d2               = _fjsp_mul_v2r8(d,d);
2217             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2218
2219             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2220
2221             /* Evaluate switch function */
2222             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2223             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2224             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2225
2226             fscal            = felec;
2227
2228             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2229
2230             /* Update vectorial force */
2231             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2232             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2233             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2234             
2235             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2236             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2237             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2238
2239             }
2240
2241             /**************************
2242              * CALCULATE INTERACTIONS *
2243              **************************/
2244
2245             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
2246             {
2247
2248             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
2249
2250             /* EWALD ELECTROSTATICS */
2251
2252             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2253             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
2254             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2255             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2256             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2257
2258             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2259             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2260             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2261             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2262             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2263             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2264             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2265             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2266             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
2267             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
2268
2269             d                = _fjsp_sub_v2r8(r23,rswitch);
2270             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2271             d2               = _fjsp_mul_v2r8(d,d);
2272             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2273
2274             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2275
2276             /* Evaluate switch function */
2277             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2278             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
2279             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
2280
2281             fscal            = felec;
2282
2283             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2284
2285             /* Update vectorial force */
2286             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
2287             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
2288             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
2289             
2290             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
2291             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
2292             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
2293
2294             }
2295
2296             /**************************
2297              * CALCULATE INTERACTIONS *
2298              **************************/
2299
2300             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
2301             {
2302
2303             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
2304
2305             /* EWALD ELECTROSTATICS */
2306
2307             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2308             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
2309             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2310             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2311             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2312
2313             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2314             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2315             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2316             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2317             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2318             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2319             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2320             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2321             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
2322             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
2323
2324             d                = _fjsp_sub_v2r8(r31,rswitch);
2325             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2326             d2               = _fjsp_mul_v2r8(d,d);
2327             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2328
2329             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2330
2331             /* Evaluate switch function */
2332             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2333             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
2334             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
2335
2336             fscal            = felec;
2337
2338             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2339
2340             /* Update vectorial force */
2341             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
2342             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
2343             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
2344             
2345             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
2346             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
2347             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
2348
2349             }
2350
2351             /**************************
2352              * CALCULATE INTERACTIONS *
2353              **************************/
2354
2355             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
2356             {
2357
2358             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
2359
2360             /* EWALD ELECTROSTATICS */
2361
2362             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2363             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
2364             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2365             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2366             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2367
2368             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2369             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2370             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2371             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2372             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2373             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2374             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2375             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2376             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
2377             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
2378
2379             d                = _fjsp_sub_v2r8(r32,rswitch);
2380             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2381             d2               = _fjsp_mul_v2r8(d,d);
2382             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2383
2384             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2385
2386             /* Evaluate switch function */
2387             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2388             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
2389             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
2390
2391             fscal            = felec;
2392
2393             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2394
2395             /* Update vectorial force */
2396             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
2397             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
2398             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
2399             
2400             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
2401             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
2402             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
2403
2404             }
2405
2406             /**************************
2407              * CALCULATE INTERACTIONS *
2408              **************************/
2409
2410             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
2411             {
2412
2413             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
2414
2415             /* EWALD ELECTROSTATICS */
2416
2417             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2418             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
2419             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2420             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2421             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2422
2423             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2424             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2425             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2426             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2427             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2428             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2429             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2430             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2431             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
2432             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
2433
2434             d                = _fjsp_sub_v2r8(r33,rswitch);
2435             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2436             d2               = _fjsp_mul_v2r8(d,d);
2437             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2438
2439             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2440
2441             /* Evaluate switch function */
2442             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2443             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
2444             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
2445
2446             fscal            = felec;
2447
2448             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2449
2450             /* Update vectorial force */
2451             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
2452             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
2453             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
2454             
2455             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
2456             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
2457             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
2458
2459             }
2460
2461             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2462
2463             /* Inner loop uses 647 flops */
2464         }
2465
2466         if(jidx<j_index_end)
2467         {
2468
2469             jnrA             = jjnr[jidx];
2470             j_coord_offsetA  = DIM*jnrA;
2471
2472             /* load j atom coordinates */
2473             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
2474                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2475                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2476
2477             /* Calculate displacement vector */
2478             dx00             = _fjsp_sub_v2r8(ix0,jx0);
2479             dy00             = _fjsp_sub_v2r8(iy0,jy0);
2480             dz00             = _fjsp_sub_v2r8(iz0,jz0);
2481             dx11             = _fjsp_sub_v2r8(ix1,jx1);
2482             dy11             = _fjsp_sub_v2r8(iy1,jy1);
2483             dz11             = _fjsp_sub_v2r8(iz1,jz1);
2484             dx12             = _fjsp_sub_v2r8(ix1,jx2);
2485             dy12             = _fjsp_sub_v2r8(iy1,jy2);
2486             dz12             = _fjsp_sub_v2r8(iz1,jz2);
2487             dx13             = _fjsp_sub_v2r8(ix1,jx3);
2488             dy13             = _fjsp_sub_v2r8(iy1,jy3);
2489             dz13             = _fjsp_sub_v2r8(iz1,jz3);
2490             dx21             = _fjsp_sub_v2r8(ix2,jx1);
2491             dy21             = _fjsp_sub_v2r8(iy2,jy1);
2492             dz21             = _fjsp_sub_v2r8(iz2,jz1);
2493             dx22             = _fjsp_sub_v2r8(ix2,jx2);
2494             dy22             = _fjsp_sub_v2r8(iy2,jy2);
2495             dz22             = _fjsp_sub_v2r8(iz2,jz2);
2496             dx23             = _fjsp_sub_v2r8(ix2,jx3);
2497             dy23             = _fjsp_sub_v2r8(iy2,jy3);
2498             dz23             = _fjsp_sub_v2r8(iz2,jz3);
2499             dx31             = _fjsp_sub_v2r8(ix3,jx1);
2500             dy31             = _fjsp_sub_v2r8(iy3,jy1);
2501             dz31             = _fjsp_sub_v2r8(iz3,jz1);
2502             dx32             = _fjsp_sub_v2r8(ix3,jx2);
2503             dy32             = _fjsp_sub_v2r8(iy3,jy2);
2504             dz32             = _fjsp_sub_v2r8(iz3,jz2);
2505             dx33             = _fjsp_sub_v2r8(ix3,jx3);
2506             dy33             = _fjsp_sub_v2r8(iy3,jy3);
2507             dz33             = _fjsp_sub_v2r8(iz3,jz3);
2508
2509             /* Calculate squared distance and things based on it */
2510             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
2511             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
2512             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
2513             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
2514             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
2515             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
2516             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
2517             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
2518             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
2519             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
2520
2521             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
2522             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
2523             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
2524             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
2525             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
2526             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
2527             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
2528             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
2529             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
2530             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
2531
2532             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
2533             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
2534             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
2535             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
2536             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
2537             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
2538             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
2539             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
2540             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
2541             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
2542
2543             fjx0             = _fjsp_setzero_v2r8();
2544             fjy0             = _fjsp_setzero_v2r8();
2545             fjz0             = _fjsp_setzero_v2r8();
2546             fjx1             = _fjsp_setzero_v2r8();
2547             fjy1             = _fjsp_setzero_v2r8();
2548             fjz1             = _fjsp_setzero_v2r8();
2549             fjx2             = _fjsp_setzero_v2r8();
2550             fjy2             = _fjsp_setzero_v2r8();
2551             fjz2             = _fjsp_setzero_v2r8();
2552             fjx3             = _fjsp_setzero_v2r8();
2553             fjy3             = _fjsp_setzero_v2r8();
2554             fjz3             = _fjsp_setzero_v2r8();
2555
2556             /**************************
2557              * CALCULATE INTERACTIONS *
2558              **************************/
2559
2560             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
2561             {
2562
2563             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
2564
2565             /* LENNARD-JONES DISPERSION/REPULSION */
2566
2567             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
2568             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
2569             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
2570             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
2571             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
2572
2573             d                = _fjsp_sub_v2r8(r00,rswitch);
2574             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2575             d2               = _fjsp_mul_v2r8(d,d);
2576             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2577
2578             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2579
2580             /* Evaluate switch function */
2581             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2582             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
2583             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
2584
2585             fscal            = fvdw;
2586
2587             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2588
2589             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2590
2591             /* Update vectorial force */
2592             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
2593             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
2594             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
2595             
2596             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
2597             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
2598             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
2599
2600             }
2601
2602             /**************************
2603              * CALCULATE INTERACTIONS *
2604              **************************/
2605
2606             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2607             {
2608
2609             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
2610
2611             /* EWALD ELECTROSTATICS */
2612
2613             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2614             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
2615             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2616             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2617             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2618
2619             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2620             ewtabD           = _fjsp_setzero_v2r8();
2621             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2622             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2623             ewtabFn          = _fjsp_setzero_v2r8();
2624             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2625             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2626             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2627             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
2628             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2629
2630             d                = _fjsp_sub_v2r8(r11,rswitch);
2631             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2632             d2               = _fjsp_mul_v2r8(d,d);
2633             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2634
2635             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2636
2637             /* Evaluate switch function */
2638             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2639             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2640             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2641
2642             fscal            = felec;
2643
2644             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2645
2646             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2647
2648             /* Update vectorial force */
2649             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
2650             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2651             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2652             
2653             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2654             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2655             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2656
2657             }
2658
2659             /**************************
2660              * CALCULATE INTERACTIONS *
2661              **************************/
2662
2663             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2664             {
2665
2666             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
2667
2668             /* EWALD ELECTROSTATICS */
2669
2670             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2671             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
2672             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2673             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2674             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2675
2676             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2677             ewtabD           = _fjsp_setzero_v2r8();
2678             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2679             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2680             ewtabFn          = _fjsp_setzero_v2r8();
2681             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2682             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2683             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2684             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2685             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2686
2687             d                = _fjsp_sub_v2r8(r12,rswitch);
2688             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2689             d2               = _fjsp_mul_v2r8(d,d);
2690             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2691
2692             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2693
2694             /* Evaluate switch function */
2695             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2696             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2697             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2698
2699             fscal            = felec;
2700
2701             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2702
2703             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2704
2705             /* Update vectorial force */
2706             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
2707             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2708             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2709             
2710             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2711             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2712             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2713
2714             }
2715
2716             /**************************
2717              * CALCULATE INTERACTIONS *
2718              **************************/
2719
2720             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
2721             {
2722
2723             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
2724
2725             /* EWALD ELECTROSTATICS */
2726
2727             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2728             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
2729             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2730             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2731             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2732
2733             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2734             ewtabD           = _fjsp_setzero_v2r8();
2735             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2736             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2737             ewtabFn          = _fjsp_setzero_v2r8();
2738             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2739             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2740             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2741             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
2742             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
2743
2744             d                = _fjsp_sub_v2r8(r13,rswitch);
2745             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2746             d2               = _fjsp_mul_v2r8(d,d);
2747             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2748
2749             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2750
2751             /* Evaluate switch function */
2752             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2753             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
2754             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
2755
2756             fscal            = felec;
2757
2758             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2759
2760             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2761
2762             /* Update vectorial force */
2763             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
2764             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
2765             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
2766             
2767             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
2768             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
2769             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
2770
2771             }
2772
2773             /**************************
2774              * CALCULATE INTERACTIONS *
2775              **************************/
2776
2777             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2778             {
2779
2780             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2781
2782             /* EWALD ELECTROSTATICS */
2783
2784             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2785             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2786             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2787             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2788             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2789
2790             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2791             ewtabD           = _fjsp_setzero_v2r8();
2792             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2793             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2794             ewtabFn          = _fjsp_setzero_v2r8();
2795             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2796             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2797             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2798             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2799             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2800
2801             d                = _fjsp_sub_v2r8(r21,rswitch);
2802             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2803             d2               = _fjsp_mul_v2r8(d,d);
2804             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2805
2806             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2807
2808             /* Evaluate switch function */
2809             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2810             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2811             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2812
2813             fscal            = felec;
2814
2815             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2816
2817             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2818
2819             /* Update vectorial force */
2820             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2821             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2822             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2823             
2824             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2825             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2826             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2827
2828             }
2829
2830             /**************************
2831              * CALCULATE INTERACTIONS *
2832              **************************/
2833
2834             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2835             {
2836
2837             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2838
2839             /* EWALD ELECTROSTATICS */
2840
2841             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2842             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2843             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2844             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2845             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2846
2847             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2848             ewtabD           = _fjsp_setzero_v2r8();
2849             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2850             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2851             ewtabFn          = _fjsp_setzero_v2r8();
2852             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2853             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2854             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2855             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2856             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2857
2858             d                = _fjsp_sub_v2r8(r22,rswitch);
2859             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2860             d2               = _fjsp_mul_v2r8(d,d);
2861             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2862
2863             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2864
2865             /* Evaluate switch function */
2866             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2867             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2868             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2869
2870             fscal            = felec;
2871
2872             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2873
2874             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2875
2876             /* Update vectorial force */
2877             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2878             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2879             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2880             
2881             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2882             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2883             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2884
2885             }
2886
2887             /**************************
2888              * CALCULATE INTERACTIONS *
2889              **************************/
2890
2891             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
2892             {
2893
2894             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
2895
2896             /* EWALD ELECTROSTATICS */
2897
2898             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2899             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
2900             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2901             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2902             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2903
2904             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2905             ewtabD           = _fjsp_setzero_v2r8();
2906             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2907             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2908             ewtabFn          = _fjsp_setzero_v2r8();
2909             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2910             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2911             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2912             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
2913             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
2914
2915             d                = _fjsp_sub_v2r8(r23,rswitch);
2916             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2917             d2               = _fjsp_mul_v2r8(d,d);
2918             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2919
2920             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2921
2922             /* Evaluate switch function */
2923             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2924             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
2925             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
2926
2927             fscal            = felec;
2928
2929             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2930
2931             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2932
2933             /* Update vectorial force */
2934             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
2935             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
2936             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
2937             
2938             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
2939             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
2940             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
2941
2942             }
2943
2944             /**************************
2945              * CALCULATE INTERACTIONS *
2946              **************************/
2947
2948             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
2949             {
2950
2951             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
2952
2953             /* EWALD ELECTROSTATICS */
2954
2955             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2956             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
2957             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2958             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2959             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2960
2961             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2962             ewtabD           = _fjsp_setzero_v2r8();
2963             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2964             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2965             ewtabFn          = _fjsp_setzero_v2r8();
2966             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2967             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2968             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2969             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
2970             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
2971
2972             d                = _fjsp_sub_v2r8(r31,rswitch);
2973             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2974             d2               = _fjsp_mul_v2r8(d,d);
2975             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2976
2977             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2978
2979             /* Evaluate switch function */
2980             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2981             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
2982             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
2983
2984             fscal            = felec;
2985
2986             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2987
2988             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2989
2990             /* Update vectorial force */
2991             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
2992             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
2993             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
2994             
2995             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
2996             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
2997             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
2998
2999             }
3000
3001             /**************************
3002              * CALCULATE INTERACTIONS *
3003              **************************/
3004
3005             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
3006             {
3007
3008             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
3009
3010             /* EWALD ELECTROSTATICS */
3011
3012             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3013             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
3014             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
3015             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
3016             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
3017
3018             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
3019             ewtabD           = _fjsp_setzero_v2r8();
3020             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
3021             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
3022             ewtabFn          = _fjsp_setzero_v2r8();
3023             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
3024             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
3025             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
3026             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
3027             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
3028
3029             d                = _fjsp_sub_v2r8(r32,rswitch);
3030             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
3031             d2               = _fjsp_mul_v2r8(d,d);
3032             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
3033
3034             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
3035
3036             /* Evaluate switch function */
3037             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3038             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
3039             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
3040
3041             fscal            = felec;
3042
3043             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
3044
3045             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
3046
3047             /* Update vectorial force */
3048             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
3049             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
3050             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
3051             
3052             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
3053             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
3054             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
3055
3056             }
3057
3058             /**************************
3059              * CALCULATE INTERACTIONS *
3060              **************************/
3061
3062             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
3063             {
3064
3065             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
3066
3067             /* EWALD ELECTROSTATICS */
3068
3069             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3070             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
3071             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
3072             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
3073             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
3074
3075             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
3076             ewtabD           = _fjsp_setzero_v2r8();
3077             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
3078             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
3079             ewtabFn          = _fjsp_setzero_v2r8();
3080             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
3081             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
3082             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
3083             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
3084             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
3085
3086             d                = _fjsp_sub_v2r8(r33,rswitch);
3087             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
3088             d2               = _fjsp_mul_v2r8(d,d);
3089             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
3090
3091             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
3092
3093             /* Evaluate switch function */
3094             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3095             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
3096             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
3097
3098             fscal            = felec;
3099
3100             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
3101
3102             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
3103
3104             /* Update vectorial force */
3105             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
3106             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
3107             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
3108             
3109             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
3110             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
3111             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
3112
3113             }
3114
3115             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3116
3117             /* Inner loop uses 647 flops */
3118         }
3119
3120         /* End of innermost loop */
3121
3122         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3123                                               f+i_coord_offset,fshift+i_shift_offset);
3124
3125         /* Increment number of inner iterations */
3126         inneriter                  += j_index_end - j_index_start;
3127
3128         /* Outer loop uses 24 flops */
3129     }
3130
3131     /* Increment number of outer iterations */
3132     outeriter        += nri;
3133
3134     /* Update outer/inner flops */
3135
3136     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);
3137 }