Introduce gmxpre.h for truly global definitions
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "kernelutil_sparc64_hpc_ace_double.h"
50
51 /*
52  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
53  * Electrostatics interaction: Ewald
54  * VdW interaction:            None
55  * Geometry:                   Water4-Water4
56  * Calculate force/pot:        PotentialAndForce
57  */
58 void
59 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
60                     (t_nblist                    * gmx_restrict       nlist,
61                      rvec                        * gmx_restrict          xx,
62                      rvec                        * gmx_restrict          ff,
63                      t_forcerec                  * gmx_restrict          fr,
64                      t_mdatoms                   * gmx_restrict     mdatoms,
65                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66                      t_nrnb                      * gmx_restrict        nrnb)
67 {
68     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69      * just 0 for non-waters.
70      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
71      * jnr indices corresponding to data put in the two positions in the SIMD register.
72      */
73     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
74     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75     int              jnrA,jnrB;
76     int              j_coord_offsetA,j_coord_offsetB;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81     int              vdwioffset1;
82     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwioffset3;
86     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87     int              vdwjidx1A,vdwjidx1B;
88     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89     int              vdwjidx2A,vdwjidx2B;
90     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91     int              vdwjidx3A,vdwjidx3B;
92     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
93     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
94     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
95     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
96     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
97     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
98     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
99     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
100     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
101     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
102     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
103     real             *charge;
104     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105     real             *ewtab;
106     _fjsp_v2r8       itab_tmp;
107     _fjsp_v2r8       dummy_mask,cutoff_mask;
108     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
109     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
110     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
111
112     x                = xx[0];
113     f                = ff[0];
114
115     nri              = nlist->nri;
116     iinr             = nlist->iinr;
117     jindex           = nlist->jindex;
118     jjnr             = nlist->jjnr;
119     shiftidx         = nlist->shift;
120     gid              = nlist->gid;
121     shiftvec         = fr->shift_vec[0];
122     fshift           = fr->fshift[0];
123     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
124     charge           = mdatoms->chargeA;
125
126     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
127     ewtab            = fr->ic->tabq_coul_FDV0;
128     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
129     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
134     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
135     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
136
137     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
138     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
139     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
140     qq11             = _fjsp_mul_v2r8(iq1,jq1);
141     qq12             = _fjsp_mul_v2r8(iq1,jq2);
142     qq13             = _fjsp_mul_v2r8(iq1,jq3);
143     qq21             = _fjsp_mul_v2r8(iq2,jq1);
144     qq22             = _fjsp_mul_v2r8(iq2,jq2);
145     qq23             = _fjsp_mul_v2r8(iq2,jq3);
146     qq31             = _fjsp_mul_v2r8(iq3,jq1);
147     qq32             = _fjsp_mul_v2r8(iq3,jq2);
148     qq33             = _fjsp_mul_v2r8(iq3,jq3);
149
150     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
151     rcutoff_scalar   = fr->rcoulomb;
152     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
153     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
154
155     /* Avoid stupid compiler warnings */
156     jnrA = jnrB = 0;
157     j_coord_offsetA = 0;
158     j_coord_offsetB = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     /* Start outer loop over neighborlists */
164     for(iidx=0; iidx<nri; iidx++)
165     {
166         /* Load shift vector for this list */
167         i_shift_offset   = DIM*shiftidx[iidx];
168
169         /* Load limits for loop over neighbors */
170         j_index_start    = jindex[iidx];
171         j_index_end      = jindex[iidx+1];
172
173         /* Get outer coordinate index */
174         inr              = iinr[iidx];
175         i_coord_offset   = DIM*inr;
176
177         /* Load i particle coords and add shift vector */
178         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
179                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
180
181         fix1             = _fjsp_setzero_v2r8();
182         fiy1             = _fjsp_setzero_v2r8();
183         fiz1             = _fjsp_setzero_v2r8();
184         fix2             = _fjsp_setzero_v2r8();
185         fiy2             = _fjsp_setzero_v2r8();
186         fiz2             = _fjsp_setzero_v2r8();
187         fix3             = _fjsp_setzero_v2r8();
188         fiy3             = _fjsp_setzero_v2r8();
189         fiz3             = _fjsp_setzero_v2r8();
190
191         /* Reset potential sums */
192         velecsum         = _fjsp_setzero_v2r8();
193
194         /* Start inner kernel loop */
195         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
196         {
197
198             /* Get j neighbor index, and coordinate index */
199             jnrA             = jjnr[jidx];
200             jnrB             = jjnr[jidx+1];
201             j_coord_offsetA  = DIM*jnrA;
202             j_coord_offsetB  = DIM*jnrB;
203
204             /* load j atom coordinates */
205             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
206                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
207
208             /* Calculate displacement vector */
209             dx11             = _fjsp_sub_v2r8(ix1,jx1);
210             dy11             = _fjsp_sub_v2r8(iy1,jy1);
211             dz11             = _fjsp_sub_v2r8(iz1,jz1);
212             dx12             = _fjsp_sub_v2r8(ix1,jx2);
213             dy12             = _fjsp_sub_v2r8(iy1,jy2);
214             dz12             = _fjsp_sub_v2r8(iz1,jz2);
215             dx13             = _fjsp_sub_v2r8(ix1,jx3);
216             dy13             = _fjsp_sub_v2r8(iy1,jy3);
217             dz13             = _fjsp_sub_v2r8(iz1,jz3);
218             dx21             = _fjsp_sub_v2r8(ix2,jx1);
219             dy21             = _fjsp_sub_v2r8(iy2,jy1);
220             dz21             = _fjsp_sub_v2r8(iz2,jz1);
221             dx22             = _fjsp_sub_v2r8(ix2,jx2);
222             dy22             = _fjsp_sub_v2r8(iy2,jy2);
223             dz22             = _fjsp_sub_v2r8(iz2,jz2);
224             dx23             = _fjsp_sub_v2r8(ix2,jx3);
225             dy23             = _fjsp_sub_v2r8(iy2,jy3);
226             dz23             = _fjsp_sub_v2r8(iz2,jz3);
227             dx31             = _fjsp_sub_v2r8(ix3,jx1);
228             dy31             = _fjsp_sub_v2r8(iy3,jy1);
229             dz31             = _fjsp_sub_v2r8(iz3,jz1);
230             dx32             = _fjsp_sub_v2r8(ix3,jx2);
231             dy32             = _fjsp_sub_v2r8(iy3,jy2);
232             dz32             = _fjsp_sub_v2r8(iz3,jz2);
233             dx33             = _fjsp_sub_v2r8(ix3,jx3);
234             dy33             = _fjsp_sub_v2r8(iy3,jy3);
235             dz33             = _fjsp_sub_v2r8(iz3,jz3);
236
237             /* Calculate squared distance and things based on it */
238             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
239             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
240             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
241             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
242             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
243             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
244             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
245             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
246             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
247
248             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
249             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
250             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
251             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
252             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
253             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
254             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
255             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
256             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
257
258             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
259             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
260             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
261             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
262             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
263             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
264             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
265             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
266             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
267
268             fjx1             = _fjsp_setzero_v2r8();
269             fjy1             = _fjsp_setzero_v2r8();
270             fjz1             = _fjsp_setzero_v2r8();
271             fjx2             = _fjsp_setzero_v2r8();
272             fjy2             = _fjsp_setzero_v2r8();
273             fjz2             = _fjsp_setzero_v2r8();
274             fjx3             = _fjsp_setzero_v2r8();
275             fjy3             = _fjsp_setzero_v2r8();
276             fjz3             = _fjsp_setzero_v2r8();
277
278             /**************************
279              * CALCULATE INTERACTIONS *
280              **************************/
281
282             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
283             {
284
285             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
286
287             /* EWALD ELECTROSTATICS */
288
289             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
290             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
291             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
292             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
293             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
294
295             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
296             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
297             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
298             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
299             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
300             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
301             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
302             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
303             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
304             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
305
306             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
307
308             /* Update potential sum for this i atom from the interaction with this j atom. */
309             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
310             velecsum         = _fjsp_add_v2r8(velecsum,velec);
311
312             fscal            = felec;
313
314             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
315
316             /* Update vectorial force */
317             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
318             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
319             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
320             
321             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
322             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
323             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
324
325             }
326
327             /**************************
328              * CALCULATE INTERACTIONS *
329              **************************/
330
331             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
332             {
333
334             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
335
336             /* EWALD ELECTROSTATICS */
337
338             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
340             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
341             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
342             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
343
344             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
345             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
346             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
347             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
348             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
349             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
350             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
351             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
352             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
353             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
354
355             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
356
357             /* Update potential sum for this i atom from the interaction with this j atom. */
358             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
359             velecsum         = _fjsp_add_v2r8(velecsum,velec);
360
361             fscal            = felec;
362
363             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
364
365             /* Update vectorial force */
366             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
367             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
368             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
369             
370             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
371             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
372             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
373
374             }
375
376             /**************************
377              * CALCULATE INTERACTIONS *
378              **************************/
379
380             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
381             {
382
383             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
384
385             /* EWALD ELECTROSTATICS */
386
387             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
389             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
390             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
391             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
392
393             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
394             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
395             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
396             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
397             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
398             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
399             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
400             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
401             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
402             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
403
404             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
405
406             /* Update potential sum for this i atom from the interaction with this j atom. */
407             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
408             velecsum         = _fjsp_add_v2r8(velecsum,velec);
409
410             fscal            = felec;
411
412             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
413
414             /* Update vectorial force */
415             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
416             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
417             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
418             
419             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
420             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
421             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
422
423             }
424
425             /**************************
426              * CALCULATE INTERACTIONS *
427              **************************/
428
429             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
430             {
431
432             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
433
434             /* EWALD ELECTROSTATICS */
435
436             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
437             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
438             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
439             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
440             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
441
442             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
443             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
444             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
445             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
446             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
447             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
448             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
449             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
450             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
451             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
452
453             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
454
455             /* Update potential sum for this i atom from the interaction with this j atom. */
456             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
457             velecsum         = _fjsp_add_v2r8(velecsum,velec);
458
459             fscal            = felec;
460
461             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
462
463             /* Update vectorial force */
464             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
465             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
466             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
467             
468             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
469             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
470             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
471
472             }
473
474             /**************************
475              * CALCULATE INTERACTIONS *
476              **************************/
477
478             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
479             {
480
481             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
482
483             /* EWALD ELECTROSTATICS */
484
485             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
486             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
487             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
488             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
489             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
490
491             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
492             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
493             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
494             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
495             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
496             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
497             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
498             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
499             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
500             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
501
502             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
503
504             /* Update potential sum for this i atom from the interaction with this j atom. */
505             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
506             velecsum         = _fjsp_add_v2r8(velecsum,velec);
507
508             fscal            = felec;
509
510             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
511
512             /* Update vectorial force */
513             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
514             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
515             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
516             
517             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
518             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
519             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
520
521             }
522
523             /**************************
524              * CALCULATE INTERACTIONS *
525              **************************/
526
527             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
528             {
529
530             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
531
532             /* EWALD ELECTROSTATICS */
533
534             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
535             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
536             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
537             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
538             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
539
540             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
541             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
542             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
543             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
544             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
545             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
546             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
547             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
548             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
549             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
550
551             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
552
553             /* Update potential sum for this i atom from the interaction with this j atom. */
554             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
555             velecsum         = _fjsp_add_v2r8(velecsum,velec);
556
557             fscal            = felec;
558
559             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
560
561             /* Update vectorial force */
562             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
563             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
564             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
565             
566             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
567             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
568             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
569
570             }
571
572             /**************************
573              * CALCULATE INTERACTIONS *
574              **************************/
575
576             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
577             {
578
579             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
580
581             /* EWALD ELECTROSTATICS */
582
583             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
585             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
586             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
587             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
588
589             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
590             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
591             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
592             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
593             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
594             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
595             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
596             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
597             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
598             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
599
600             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
601
602             /* Update potential sum for this i atom from the interaction with this j atom. */
603             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
604             velecsum         = _fjsp_add_v2r8(velecsum,velec);
605
606             fscal            = felec;
607
608             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
609
610             /* Update vectorial force */
611             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
612             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
613             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
614             
615             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
616             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
617             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
618
619             }
620
621             /**************************
622              * CALCULATE INTERACTIONS *
623              **************************/
624
625             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
626             {
627
628             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
629
630             /* EWALD ELECTROSTATICS */
631
632             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
633             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
634             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
635             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
636             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
637
638             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
639             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
640             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
641             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
642             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
643             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
644             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
645             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
646             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
647             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
648
649             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
650
651             /* Update potential sum for this i atom from the interaction with this j atom. */
652             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
653             velecsum         = _fjsp_add_v2r8(velecsum,velec);
654
655             fscal            = felec;
656
657             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
658
659             /* Update vectorial force */
660             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
661             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
662             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
663             
664             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
665             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
666             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
667
668             }
669
670             /**************************
671              * CALCULATE INTERACTIONS *
672              **************************/
673
674             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
675             {
676
677             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
678
679             /* EWALD ELECTROSTATICS */
680
681             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
682             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
683             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
684             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
685             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
686
687             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
688             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
689             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
690             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
691             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
692             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
693             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
694             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
695             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
696             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
697
698             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
699
700             /* Update potential sum for this i atom from the interaction with this j atom. */
701             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
702             velecsum         = _fjsp_add_v2r8(velecsum,velec);
703
704             fscal            = felec;
705
706             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
707
708             /* Update vectorial force */
709             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
710             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
711             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
712             
713             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
714             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
715             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
716
717             }
718
719             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
720
721             /* Inner loop uses 441 flops */
722         }
723
724         if(jidx<j_index_end)
725         {
726
727             jnrA             = jjnr[jidx];
728             j_coord_offsetA  = DIM*jnrA;
729
730             /* load j atom coordinates */
731             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
732                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
733
734             /* Calculate displacement vector */
735             dx11             = _fjsp_sub_v2r8(ix1,jx1);
736             dy11             = _fjsp_sub_v2r8(iy1,jy1);
737             dz11             = _fjsp_sub_v2r8(iz1,jz1);
738             dx12             = _fjsp_sub_v2r8(ix1,jx2);
739             dy12             = _fjsp_sub_v2r8(iy1,jy2);
740             dz12             = _fjsp_sub_v2r8(iz1,jz2);
741             dx13             = _fjsp_sub_v2r8(ix1,jx3);
742             dy13             = _fjsp_sub_v2r8(iy1,jy3);
743             dz13             = _fjsp_sub_v2r8(iz1,jz3);
744             dx21             = _fjsp_sub_v2r8(ix2,jx1);
745             dy21             = _fjsp_sub_v2r8(iy2,jy1);
746             dz21             = _fjsp_sub_v2r8(iz2,jz1);
747             dx22             = _fjsp_sub_v2r8(ix2,jx2);
748             dy22             = _fjsp_sub_v2r8(iy2,jy2);
749             dz22             = _fjsp_sub_v2r8(iz2,jz2);
750             dx23             = _fjsp_sub_v2r8(ix2,jx3);
751             dy23             = _fjsp_sub_v2r8(iy2,jy3);
752             dz23             = _fjsp_sub_v2r8(iz2,jz3);
753             dx31             = _fjsp_sub_v2r8(ix3,jx1);
754             dy31             = _fjsp_sub_v2r8(iy3,jy1);
755             dz31             = _fjsp_sub_v2r8(iz3,jz1);
756             dx32             = _fjsp_sub_v2r8(ix3,jx2);
757             dy32             = _fjsp_sub_v2r8(iy3,jy2);
758             dz32             = _fjsp_sub_v2r8(iz3,jz2);
759             dx33             = _fjsp_sub_v2r8(ix3,jx3);
760             dy33             = _fjsp_sub_v2r8(iy3,jy3);
761             dz33             = _fjsp_sub_v2r8(iz3,jz3);
762
763             /* Calculate squared distance and things based on it */
764             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
765             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
766             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
767             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
768             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
769             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
770             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
771             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
772             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
773
774             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
775             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
776             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
777             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
778             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
779             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
780             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
781             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
782             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
783
784             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
785             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
786             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
787             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
788             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
789             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
790             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
791             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
792             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
793
794             fjx1             = _fjsp_setzero_v2r8();
795             fjy1             = _fjsp_setzero_v2r8();
796             fjz1             = _fjsp_setzero_v2r8();
797             fjx2             = _fjsp_setzero_v2r8();
798             fjy2             = _fjsp_setzero_v2r8();
799             fjz2             = _fjsp_setzero_v2r8();
800             fjx3             = _fjsp_setzero_v2r8();
801             fjy3             = _fjsp_setzero_v2r8();
802             fjz3             = _fjsp_setzero_v2r8();
803
804             /**************************
805              * CALCULATE INTERACTIONS *
806              **************************/
807
808             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
809             {
810
811             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
812
813             /* EWALD ELECTROSTATICS */
814
815             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
816             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
817             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
818             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
819             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
820
821             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
822             ewtabD           = _fjsp_setzero_v2r8();
823             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
824             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
825             ewtabFn          = _fjsp_setzero_v2r8();
826             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
827             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
828             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
829             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
830             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
831
832             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
833
834             /* Update potential sum for this i atom from the interaction with this j atom. */
835             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
836             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
837             velecsum         = _fjsp_add_v2r8(velecsum,velec);
838
839             fscal            = felec;
840
841             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
842
843             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
844
845             /* Update vectorial force */
846             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
847             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
848             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
849             
850             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
851             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
852             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
853
854             }
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
861             {
862
863             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
864
865             /* EWALD ELECTROSTATICS */
866
867             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
868             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
869             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
870             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
871             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
872
873             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
874             ewtabD           = _fjsp_setzero_v2r8();
875             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
876             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
877             ewtabFn          = _fjsp_setzero_v2r8();
878             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
879             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
880             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
881             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
882             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
883
884             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
885
886             /* Update potential sum for this i atom from the interaction with this j atom. */
887             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
888             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
889             velecsum         = _fjsp_add_v2r8(velecsum,velec);
890
891             fscal            = felec;
892
893             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
894
895             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
896
897             /* Update vectorial force */
898             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
899             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
900             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
901             
902             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
903             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
904             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
905
906             }
907
908             /**************************
909              * CALCULATE INTERACTIONS *
910              **************************/
911
912             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
913             {
914
915             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
916
917             /* EWALD ELECTROSTATICS */
918
919             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
920             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
921             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
922             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
923             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
924
925             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
926             ewtabD           = _fjsp_setzero_v2r8();
927             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
928             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
929             ewtabFn          = _fjsp_setzero_v2r8();
930             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
931             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
932             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
933             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
934             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
935
936             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
937
938             /* Update potential sum for this i atom from the interaction with this j atom. */
939             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
940             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
941             velecsum         = _fjsp_add_v2r8(velecsum,velec);
942
943             fscal            = felec;
944
945             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
946
947             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
948
949             /* Update vectorial force */
950             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
951             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
952             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
953             
954             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
955             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
956             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
957
958             }
959
960             /**************************
961              * CALCULATE INTERACTIONS *
962              **************************/
963
964             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
965             {
966
967             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
968
969             /* EWALD ELECTROSTATICS */
970
971             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
972             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
973             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
974             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
975             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
976
977             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
978             ewtabD           = _fjsp_setzero_v2r8();
979             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
980             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
981             ewtabFn          = _fjsp_setzero_v2r8();
982             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
983             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
984             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
985             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
986             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
987
988             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
989
990             /* Update potential sum for this i atom from the interaction with this j atom. */
991             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
992             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
993             velecsum         = _fjsp_add_v2r8(velecsum,velec);
994
995             fscal            = felec;
996
997             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
998
999             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1000
1001             /* Update vectorial force */
1002             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1003             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1004             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1005             
1006             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1007             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1008             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1009
1010             }
1011
1012             /**************************
1013              * CALCULATE INTERACTIONS *
1014              **************************/
1015
1016             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1017             {
1018
1019             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1020
1021             /* EWALD ELECTROSTATICS */
1022
1023             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1024             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1025             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1026             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1027             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1028
1029             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1030             ewtabD           = _fjsp_setzero_v2r8();
1031             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1032             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1033             ewtabFn          = _fjsp_setzero_v2r8();
1034             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1035             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1036             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1037             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
1038             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1039
1040             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1044             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1045             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1046
1047             fscal            = felec;
1048
1049             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1050
1051             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1052
1053             /* Update vectorial force */
1054             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1055             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1056             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1057             
1058             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1059             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1060             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1061
1062             }
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
1069             {
1070
1071             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
1072
1073             /* EWALD ELECTROSTATICS */
1074
1075             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1076             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
1077             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1078             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1079             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1080
1081             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1082             ewtabD           = _fjsp_setzero_v2r8();
1083             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1084             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1085             ewtabFn          = _fjsp_setzero_v2r8();
1086             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1087             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1088             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1089             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
1090             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1091
1092             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
1093
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1096             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1097             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1098
1099             fscal            = felec;
1100
1101             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1102
1103             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1104
1105             /* Update vectorial force */
1106             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
1107             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1108             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1109             
1110             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1111             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1112             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1113
1114             }
1115
1116             /**************************
1117              * CALCULATE INTERACTIONS *
1118              **************************/
1119
1120             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
1121             {
1122
1123             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
1124
1125             /* EWALD ELECTROSTATICS */
1126
1127             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1128             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
1129             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1130             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1131             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1132
1133             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1134             ewtabD           = _fjsp_setzero_v2r8();
1135             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1136             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1137             ewtabFn          = _fjsp_setzero_v2r8();
1138             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1139             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1140             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1141             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
1142             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1143
1144             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
1145
1146             /* Update potential sum for this i atom from the interaction with this j atom. */
1147             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1148             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1149             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1150
1151             fscal            = felec;
1152
1153             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1154
1155             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1156
1157             /* Update vectorial force */
1158             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
1159             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1160             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1161             
1162             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1163             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1164             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1165
1166             }
1167
1168             /**************************
1169              * CALCULATE INTERACTIONS *
1170              **************************/
1171
1172             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
1173             {
1174
1175             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
1176
1177             /* EWALD ELECTROSTATICS */
1178
1179             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1180             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
1181             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1182             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1183             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1184
1185             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1186             ewtabD           = _fjsp_setzero_v2r8();
1187             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1188             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1189             ewtabFn          = _fjsp_setzero_v2r8();
1190             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1191             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1192             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1193             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
1194             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1195
1196             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
1197
1198             /* Update potential sum for this i atom from the interaction with this j atom. */
1199             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1200             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1201             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1202
1203             fscal            = felec;
1204
1205             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1206
1207             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1208
1209             /* Update vectorial force */
1210             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
1211             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1212             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1213             
1214             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1215             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1216             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1217
1218             }
1219
1220             /**************************
1221              * CALCULATE INTERACTIONS *
1222              **************************/
1223
1224             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
1225             {
1226
1227             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
1228
1229             /* EWALD ELECTROSTATICS */
1230
1231             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1232             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
1233             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1234             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1235             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1236
1237             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1238             ewtabD           = _fjsp_setzero_v2r8();
1239             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1240             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1241             ewtabFn          = _fjsp_setzero_v2r8();
1242             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1243             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1244             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1245             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
1246             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1247
1248             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
1249
1250             /* Update potential sum for this i atom from the interaction with this j atom. */
1251             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
1252             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1253             velecsum         = _fjsp_add_v2r8(velecsum,velec);
1254
1255             fscal            = felec;
1256
1257             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1258
1259             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1260
1261             /* Update vectorial force */
1262             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
1263             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1264             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1265             
1266             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1267             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1268             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1269
1270             }
1271
1272             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1273
1274             /* Inner loop uses 441 flops */
1275         }
1276
1277         /* End of innermost loop */
1278
1279         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1280                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1281
1282         ggid                        = gid[iidx];
1283         /* Update potential energies */
1284         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1285
1286         /* Increment number of inner iterations */
1287         inneriter                  += j_index_end - j_index_start;
1288
1289         /* Outer loop uses 19 flops */
1290     }
1291
1292     /* Increment number of outer iterations */
1293     outeriter        += nri;
1294
1295     /* Update outer/inner flops */
1296
1297     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*441);
1298 }
1299 /*
1300  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
1301  * Electrostatics interaction: Ewald
1302  * VdW interaction:            None
1303  * Geometry:                   Water4-Water4
1304  * Calculate force/pot:        Force
1305  */
1306 void
1307 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
1308                     (t_nblist                    * gmx_restrict       nlist,
1309                      rvec                        * gmx_restrict          xx,
1310                      rvec                        * gmx_restrict          ff,
1311                      t_forcerec                  * gmx_restrict          fr,
1312                      t_mdatoms                   * gmx_restrict     mdatoms,
1313                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1314                      t_nrnb                      * gmx_restrict        nrnb)
1315 {
1316     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1317      * just 0 for non-waters.
1318      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1319      * jnr indices corresponding to data put in the two positions in the SIMD register.
1320      */
1321     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1322     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1323     int              jnrA,jnrB;
1324     int              j_coord_offsetA,j_coord_offsetB;
1325     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1326     real             rcutoff_scalar;
1327     real             *shiftvec,*fshift,*x,*f;
1328     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1329     int              vdwioffset1;
1330     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1331     int              vdwioffset2;
1332     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1333     int              vdwioffset3;
1334     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1335     int              vdwjidx1A,vdwjidx1B;
1336     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1337     int              vdwjidx2A,vdwjidx2B;
1338     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1339     int              vdwjidx3A,vdwjidx3B;
1340     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1341     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1342     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1343     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1344     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1345     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1346     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1347     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1348     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1349     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1350     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
1351     real             *charge;
1352     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1353     real             *ewtab;
1354     _fjsp_v2r8       itab_tmp;
1355     _fjsp_v2r8       dummy_mask,cutoff_mask;
1356     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
1357     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
1358     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1359
1360     x                = xx[0];
1361     f                = ff[0];
1362
1363     nri              = nlist->nri;
1364     iinr             = nlist->iinr;
1365     jindex           = nlist->jindex;
1366     jjnr             = nlist->jjnr;
1367     shiftidx         = nlist->shift;
1368     gid              = nlist->gid;
1369     shiftvec         = fr->shift_vec[0];
1370     fshift           = fr->fshift[0];
1371     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
1372     charge           = mdatoms->chargeA;
1373
1374     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1375     ewtab            = fr->ic->tabq_coul_F;
1376     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1377     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1378
1379     /* Setup water-specific parameters */
1380     inr              = nlist->iinr[0];
1381     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1382     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1383     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
1384
1385     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
1386     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
1387     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
1388     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1389     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1390     qq13             = _fjsp_mul_v2r8(iq1,jq3);
1391     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1392     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1393     qq23             = _fjsp_mul_v2r8(iq2,jq3);
1394     qq31             = _fjsp_mul_v2r8(iq3,jq1);
1395     qq32             = _fjsp_mul_v2r8(iq3,jq2);
1396     qq33             = _fjsp_mul_v2r8(iq3,jq3);
1397
1398     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1399     rcutoff_scalar   = fr->rcoulomb;
1400     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
1401     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
1402
1403     /* Avoid stupid compiler warnings */
1404     jnrA = jnrB = 0;
1405     j_coord_offsetA = 0;
1406     j_coord_offsetB = 0;
1407
1408     outeriter        = 0;
1409     inneriter        = 0;
1410
1411     /* Start outer loop over neighborlists */
1412     for(iidx=0; iidx<nri; iidx++)
1413     {
1414         /* Load shift vector for this list */
1415         i_shift_offset   = DIM*shiftidx[iidx];
1416
1417         /* Load limits for loop over neighbors */
1418         j_index_start    = jindex[iidx];
1419         j_index_end      = jindex[iidx+1];
1420
1421         /* Get outer coordinate index */
1422         inr              = iinr[iidx];
1423         i_coord_offset   = DIM*inr;
1424
1425         /* Load i particle coords and add shift vector */
1426         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1427                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1428
1429         fix1             = _fjsp_setzero_v2r8();
1430         fiy1             = _fjsp_setzero_v2r8();
1431         fiz1             = _fjsp_setzero_v2r8();
1432         fix2             = _fjsp_setzero_v2r8();
1433         fiy2             = _fjsp_setzero_v2r8();
1434         fiz2             = _fjsp_setzero_v2r8();
1435         fix3             = _fjsp_setzero_v2r8();
1436         fiy3             = _fjsp_setzero_v2r8();
1437         fiz3             = _fjsp_setzero_v2r8();
1438
1439         /* Start inner kernel loop */
1440         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1441         {
1442
1443             /* Get j neighbor index, and coordinate index */
1444             jnrA             = jjnr[jidx];
1445             jnrB             = jjnr[jidx+1];
1446             j_coord_offsetA  = DIM*jnrA;
1447             j_coord_offsetB  = DIM*jnrB;
1448
1449             /* load j atom coordinates */
1450             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1451                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1452
1453             /* Calculate displacement vector */
1454             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1455             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1456             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1457             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1458             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1459             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1460             dx13             = _fjsp_sub_v2r8(ix1,jx3);
1461             dy13             = _fjsp_sub_v2r8(iy1,jy3);
1462             dz13             = _fjsp_sub_v2r8(iz1,jz3);
1463             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1464             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1465             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1466             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1467             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1468             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1469             dx23             = _fjsp_sub_v2r8(ix2,jx3);
1470             dy23             = _fjsp_sub_v2r8(iy2,jy3);
1471             dz23             = _fjsp_sub_v2r8(iz2,jz3);
1472             dx31             = _fjsp_sub_v2r8(ix3,jx1);
1473             dy31             = _fjsp_sub_v2r8(iy3,jy1);
1474             dz31             = _fjsp_sub_v2r8(iz3,jz1);
1475             dx32             = _fjsp_sub_v2r8(ix3,jx2);
1476             dy32             = _fjsp_sub_v2r8(iy3,jy2);
1477             dz32             = _fjsp_sub_v2r8(iz3,jz2);
1478             dx33             = _fjsp_sub_v2r8(ix3,jx3);
1479             dy33             = _fjsp_sub_v2r8(iy3,jy3);
1480             dz33             = _fjsp_sub_v2r8(iz3,jz3);
1481
1482             /* Calculate squared distance and things based on it */
1483             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1484             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1485             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1486             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1487             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1488             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1489             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1490             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1491             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1492
1493             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1494             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1495             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
1496             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1497             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1498             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
1499             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
1500             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
1501             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
1502
1503             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1504             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1505             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
1506             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1507             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1508             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
1509             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
1510             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
1511             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
1512
1513             fjx1             = _fjsp_setzero_v2r8();
1514             fjy1             = _fjsp_setzero_v2r8();
1515             fjz1             = _fjsp_setzero_v2r8();
1516             fjx2             = _fjsp_setzero_v2r8();
1517             fjy2             = _fjsp_setzero_v2r8();
1518             fjz2             = _fjsp_setzero_v2r8();
1519             fjx3             = _fjsp_setzero_v2r8();
1520             fjy3             = _fjsp_setzero_v2r8();
1521             fjz3             = _fjsp_setzero_v2r8();
1522
1523             /**************************
1524              * CALCULATE INTERACTIONS *
1525              **************************/
1526
1527             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1528             {
1529
1530             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1531
1532             /* EWALD ELECTROSTATICS */
1533
1534             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1535             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1536             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1537             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1538             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1539
1540             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1541                                          &ewtabF,&ewtabFn);
1542             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1543             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1544
1545             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1546
1547             fscal            = felec;
1548
1549             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1550
1551             /* Update vectorial force */
1552             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1553             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1554             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1555             
1556             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1557             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1558             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1559
1560             }
1561
1562             /**************************
1563              * CALCULATE INTERACTIONS *
1564              **************************/
1565
1566             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1567             {
1568
1569             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
1570
1571             /* EWALD ELECTROSTATICS */
1572
1573             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1574             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
1575             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1576             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1577             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1578
1579             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1580                                          &ewtabF,&ewtabFn);
1581             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1582             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1583
1584             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1585
1586             fscal            = felec;
1587
1588             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1589
1590             /* Update vectorial force */
1591             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1592             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1593             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1594             
1595             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1596             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1597             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1598
1599             }
1600
1601             /**************************
1602              * CALCULATE INTERACTIONS *
1603              **************************/
1604
1605             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
1606             {
1607
1608             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
1609
1610             /* EWALD ELECTROSTATICS */
1611
1612             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1613             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
1614             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1615             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1616             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1617
1618             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1619                                          &ewtabF,&ewtabFn);
1620             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1621             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
1622
1623             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
1624
1625             fscal            = felec;
1626
1627             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1628
1629             /* Update vectorial force */
1630             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
1631             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
1632             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
1633             
1634             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
1635             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
1636             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
1637
1638             }
1639
1640             /**************************
1641              * CALCULATE INTERACTIONS *
1642              **************************/
1643
1644             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1645             {
1646
1647             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
1648
1649             /* EWALD ELECTROSTATICS */
1650
1651             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1652             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
1653             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1654             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1655             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1656
1657             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1658                                          &ewtabF,&ewtabFn);
1659             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1660             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1661
1662             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1663
1664             fscal            = felec;
1665
1666             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1667
1668             /* Update vectorial force */
1669             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1670             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1671             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1672             
1673             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1674             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1675             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1676
1677             }
1678
1679             /**************************
1680              * CALCULATE INTERACTIONS *
1681              **************************/
1682
1683             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1684             {
1685
1686             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
1687
1688             /* EWALD ELECTROSTATICS */
1689
1690             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1691             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
1692             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1693             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1694             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1695
1696             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1697                                          &ewtabF,&ewtabFn);
1698             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1699             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1700
1701             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1702
1703             fscal            = felec;
1704
1705             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1706
1707             /* Update vectorial force */
1708             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1709             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1710             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1711             
1712             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1713             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1714             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1715
1716             }
1717
1718             /**************************
1719              * CALCULATE INTERACTIONS *
1720              **************************/
1721
1722             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
1723             {
1724
1725             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
1726
1727             /* EWALD ELECTROSTATICS */
1728
1729             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1730             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
1731             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1732             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1733             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1734
1735             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1736                                          &ewtabF,&ewtabFn);
1737             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1738             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
1739
1740             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
1741
1742             fscal            = felec;
1743
1744             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1745
1746             /* Update vectorial force */
1747             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
1748             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
1749             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
1750             
1751             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
1752             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
1753             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
1754
1755             }
1756
1757             /**************************
1758              * CALCULATE INTERACTIONS *
1759              **************************/
1760
1761             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
1762             {
1763
1764             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
1765
1766             /* EWALD ELECTROSTATICS */
1767
1768             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
1770             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1771             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1772             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1773
1774             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1775                                          &ewtabF,&ewtabFn);
1776             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1777             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
1778
1779             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
1780
1781             fscal            = felec;
1782
1783             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1784
1785             /* Update vectorial force */
1786             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
1787             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
1788             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
1789             
1790             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
1791             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
1792             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
1793
1794             }
1795
1796             /**************************
1797              * CALCULATE INTERACTIONS *
1798              **************************/
1799
1800             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
1801             {
1802
1803             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
1804
1805             /* EWALD ELECTROSTATICS */
1806
1807             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1808             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
1809             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1810             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1811             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1812
1813             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1814                                          &ewtabF,&ewtabFn);
1815             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1816             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
1817
1818             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
1819
1820             fscal            = felec;
1821
1822             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1823
1824             /* Update vectorial force */
1825             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
1826             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
1827             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
1828             
1829             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
1830             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
1831             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
1832
1833             }
1834
1835             /**************************
1836              * CALCULATE INTERACTIONS *
1837              **************************/
1838
1839             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
1840             {
1841
1842             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
1843
1844             /* EWALD ELECTROSTATICS */
1845
1846             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1847             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
1848             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1849             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1850             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1851
1852             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
1853                                          &ewtabF,&ewtabFn);
1854             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1855             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
1856
1857             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
1858
1859             fscal            = felec;
1860
1861             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1862
1863             /* Update vectorial force */
1864             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
1865             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
1866             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
1867             
1868             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
1869             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
1870             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
1871
1872             }
1873
1874             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1875
1876             /* Inner loop uses 378 flops */
1877         }
1878
1879         if(jidx<j_index_end)
1880         {
1881
1882             jnrA             = jjnr[jidx];
1883             j_coord_offsetA  = DIM*jnrA;
1884
1885             /* load j atom coordinates */
1886             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
1887                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1888
1889             /* Calculate displacement vector */
1890             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1891             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1892             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1893             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1894             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1895             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1896             dx13             = _fjsp_sub_v2r8(ix1,jx3);
1897             dy13             = _fjsp_sub_v2r8(iy1,jy3);
1898             dz13             = _fjsp_sub_v2r8(iz1,jz3);
1899             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1900             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1901             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1902             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1903             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1904             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1905             dx23             = _fjsp_sub_v2r8(ix2,jx3);
1906             dy23             = _fjsp_sub_v2r8(iy2,jy3);
1907             dz23             = _fjsp_sub_v2r8(iz2,jz3);
1908             dx31             = _fjsp_sub_v2r8(ix3,jx1);
1909             dy31             = _fjsp_sub_v2r8(iy3,jy1);
1910             dz31             = _fjsp_sub_v2r8(iz3,jz1);
1911             dx32             = _fjsp_sub_v2r8(ix3,jx2);
1912             dy32             = _fjsp_sub_v2r8(iy3,jy2);
1913             dz32             = _fjsp_sub_v2r8(iz3,jz2);
1914             dx33             = _fjsp_sub_v2r8(ix3,jx3);
1915             dy33             = _fjsp_sub_v2r8(iy3,jy3);
1916             dz33             = _fjsp_sub_v2r8(iz3,jz3);
1917
1918             /* Calculate squared distance and things based on it */
1919             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1920             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1921             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
1922             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1923             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1924             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
1925             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
1926             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
1927             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
1928
1929             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1930             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1931             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
1932             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1933             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1934             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
1935             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
1936             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
1937             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
1938
1939             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1940             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1941             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
1942             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1943             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1944             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
1945             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
1946             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
1947             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
1948
1949             fjx1             = _fjsp_setzero_v2r8();
1950             fjy1             = _fjsp_setzero_v2r8();
1951             fjz1             = _fjsp_setzero_v2r8();
1952             fjx2             = _fjsp_setzero_v2r8();
1953             fjy2             = _fjsp_setzero_v2r8();
1954             fjz2             = _fjsp_setzero_v2r8();
1955             fjx3             = _fjsp_setzero_v2r8();
1956             fjy3             = _fjsp_setzero_v2r8();
1957             fjz3             = _fjsp_setzero_v2r8();
1958
1959             /**************************
1960              * CALCULATE INTERACTIONS *
1961              **************************/
1962
1963             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1964             {
1965
1966             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
1967
1968             /* EWALD ELECTROSTATICS */
1969
1970             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1971             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
1972             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
1973             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1974             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1975
1976             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
1977             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
1978             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1979
1980             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1981
1982             fscal            = felec;
1983
1984             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
1985
1986             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1987
1988             /* Update vectorial force */
1989             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1990             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1991             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1992             
1993             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1994             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1995             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1996
1997             }
1998
1999             /**************************
2000              * CALCULATE INTERACTIONS *
2001              **************************/
2002
2003             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2004             {
2005
2006             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
2007
2008             /* EWALD ELECTROSTATICS */
2009
2010             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2011             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
2012             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2013             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2014             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2015
2016             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2017             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2018             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2019
2020             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2021
2022             fscal            = felec;
2023
2024             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2025
2026             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2027
2028             /* Update vectorial force */
2029             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
2030             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2031             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2032             
2033             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2034             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2035             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2036
2037             }
2038
2039             /**************************
2040              * CALCULATE INTERACTIONS *
2041              **************************/
2042
2043             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
2044             {
2045
2046             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
2047
2048             /* EWALD ELECTROSTATICS */
2049
2050             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2051             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
2052             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2053             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2054             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2055
2056             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2057             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2058             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
2059
2060             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
2061
2062             fscal            = felec;
2063
2064             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2065
2066             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2067
2068             /* Update vectorial force */
2069             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
2070             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
2071             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
2072             
2073             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
2074             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
2075             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
2076
2077             }
2078
2079             /**************************
2080              * CALCULATE INTERACTIONS *
2081              **************************/
2082
2083             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2084             {
2085
2086             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
2087
2088             /* EWALD ELECTROSTATICS */
2089
2090             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2091             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
2092             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2093             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2094             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2095
2096             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2097             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2098             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2099
2100             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2101
2102             fscal            = felec;
2103
2104             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2105
2106             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2107
2108             /* Update vectorial force */
2109             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
2110             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2111             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2112             
2113             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2114             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2115             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2116
2117             }
2118
2119             /**************************
2120              * CALCULATE INTERACTIONS *
2121              **************************/
2122
2123             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2124             {
2125
2126             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
2127
2128             /* EWALD ELECTROSTATICS */
2129
2130             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2131             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
2132             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2133             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2134             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2135
2136             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2137             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2138             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2139
2140             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2141
2142             fscal            = felec;
2143
2144             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2145
2146             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2147
2148             /* Update vectorial force */
2149             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
2150             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2151             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2152             
2153             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2154             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2155             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2156
2157             }
2158
2159             /**************************
2160              * CALCULATE INTERACTIONS *
2161              **************************/
2162
2163             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
2164             {
2165
2166             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
2167
2168             /* EWALD ELECTROSTATICS */
2169
2170             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2171             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
2172             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2173             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2174             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2175
2176             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2177             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2178             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
2179
2180             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
2181
2182             fscal            = felec;
2183
2184             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2185
2186             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2187
2188             /* Update vectorial force */
2189             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
2190             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
2191             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
2192             
2193             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
2194             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
2195             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
2196
2197             }
2198
2199             /**************************
2200              * CALCULATE INTERACTIONS *
2201              **************************/
2202
2203             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
2204             {
2205
2206             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
2207
2208             /* EWALD ELECTROSTATICS */
2209
2210             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2211             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
2212             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2213             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2214             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2215
2216             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2217             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2218             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
2219
2220             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
2221
2222             fscal            = felec;
2223
2224             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2225
2226             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2227
2228             /* Update vectorial force */
2229             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
2230             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
2231             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
2232             
2233             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
2234             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
2235             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
2236
2237             }
2238
2239             /**************************
2240              * CALCULATE INTERACTIONS *
2241              **************************/
2242
2243             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
2244             {
2245
2246             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
2247
2248             /* EWALD ELECTROSTATICS */
2249
2250             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2251             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
2252             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2253             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2254             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2255
2256             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2257             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2258             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
2259
2260             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
2261
2262             fscal            = felec;
2263
2264             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2265
2266             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2267
2268             /* Update vectorial force */
2269             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
2270             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
2271             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
2272             
2273             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
2274             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
2275             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
2276
2277             }
2278
2279             /**************************
2280              * CALCULATE INTERACTIONS *
2281              **************************/
2282
2283             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
2284             {
2285
2286             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
2287
2288             /* EWALD ELECTROSTATICS */
2289
2290             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2291             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
2292             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
2293             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2294             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2295
2296             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
2297             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
2298             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
2299
2300             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
2301
2302             fscal            = felec;
2303
2304             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
2305
2306             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2307
2308             /* Update vectorial force */
2309             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
2310             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
2311             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
2312             
2313             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
2314             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
2315             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
2316
2317             }
2318
2319             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2320
2321             /* Inner loop uses 378 flops */
2322         }
2323
2324         /* End of innermost loop */
2325
2326         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2327                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2328
2329         /* Increment number of inner iterations */
2330         inneriter                  += j_index_end - j_index_start;
2331
2332         /* Outer loop uses 18 flops */
2333     }
2334
2335     /* Increment number of outer iterations */
2336     outeriter        += nri;
2337
2338     /* Update outer/inner flops */
2339
2340     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);
2341 }