4690f19c1a1932c1497d26ead47a8023629c15df
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "nrnb.h"
48
49 #include "kernelutil_sparc64_hpc_ace_double.h"
50
51 /*
52  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
53  * Electrostatics interaction: Ewald
54  * VdW interaction:            None
55  * Geometry:                   Water4-Particle
56  * Calculate force/pot:        PotentialAndForce
57  */
58 void
59 nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
60                     (t_nblist                    * gmx_restrict       nlist,
61                      rvec                        * gmx_restrict          xx,
62                      rvec                        * gmx_restrict          ff,
63                      t_forcerec                  * gmx_restrict          fr,
64                      t_mdatoms                   * gmx_restrict     mdatoms,
65                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66                      t_nrnb                      * gmx_restrict        nrnb)
67 {
68     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69      * just 0 for non-waters.
70      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
71      * jnr indices corresponding to data put in the four positions in the SIMD register.
72      */
73     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
74     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75     int              jnrA,jnrB;
76     int              j_coord_offsetA,j_coord_offsetB;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81     int              vdwioffset1;
82     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwioffset3;
86     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87     int              vdwjidx0A,vdwjidx0B;
88     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
90     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
92     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
93     real             *charge;
94     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
95     real             *ewtab;
96     _fjsp_v2r8       itab_tmp;
97     _fjsp_v2r8       dummy_mask,cutoff_mask;
98     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
99     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
100     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
101
102     x                = xx[0];
103     f                = ff[0];
104
105     nri              = nlist->nri;
106     iinr             = nlist->iinr;
107     jindex           = nlist->jindex;
108     jjnr             = nlist->jjnr;
109     shiftidx         = nlist->shift;
110     gid              = nlist->gid;
111     shiftvec         = fr->shift_vec[0];
112     fshift           = fr->fshift[0];
113     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
114     charge           = mdatoms->chargeA;
115
116     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
117     ewtab            = fr->ic->tabq_coul_FDV0;
118     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
119     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
120
121     /* Setup water-specific parameters */
122     inr              = nlist->iinr[0];
123     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
124     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
125     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
126
127     /* Avoid stupid compiler warnings */
128     jnrA = jnrB = 0;
129     j_coord_offsetA = 0;
130     j_coord_offsetB = 0;
131
132     outeriter        = 0;
133     inneriter        = 0;
134
135     /* Start outer loop over neighborlists */
136     for(iidx=0; iidx<nri; iidx++)
137     {
138         /* Load shift vector for this list */
139         i_shift_offset   = DIM*shiftidx[iidx];
140
141         /* Load limits for loop over neighbors */
142         j_index_start    = jindex[iidx];
143         j_index_end      = jindex[iidx+1];
144
145         /* Get outer coordinate index */
146         inr              = iinr[iidx];
147         i_coord_offset   = DIM*inr;
148
149         /* Load i particle coords and add shift vector */
150         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
151                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
152
153         fix1             = _fjsp_setzero_v2r8();
154         fiy1             = _fjsp_setzero_v2r8();
155         fiz1             = _fjsp_setzero_v2r8();
156         fix2             = _fjsp_setzero_v2r8();
157         fiy2             = _fjsp_setzero_v2r8();
158         fiz2             = _fjsp_setzero_v2r8();
159         fix3             = _fjsp_setzero_v2r8();
160         fiy3             = _fjsp_setzero_v2r8();
161         fiz3             = _fjsp_setzero_v2r8();
162
163         /* Reset potential sums */
164         velecsum         = _fjsp_setzero_v2r8();
165
166         /* Start inner kernel loop */
167         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
168         {
169
170             /* Get j neighbor index, and coordinate index */
171             jnrA             = jjnr[jidx];
172             jnrB             = jjnr[jidx+1];
173             j_coord_offsetA  = DIM*jnrA;
174             j_coord_offsetB  = DIM*jnrB;
175
176             /* load j atom coordinates */
177             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
178                                               &jx0,&jy0,&jz0);
179
180             /* Calculate displacement vector */
181             dx10             = _fjsp_sub_v2r8(ix1,jx0);
182             dy10             = _fjsp_sub_v2r8(iy1,jy0);
183             dz10             = _fjsp_sub_v2r8(iz1,jz0);
184             dx20             = _fjsp_sub_v2r8(ix2,jx0);
185             dy20             = _fjsp_sub_v2r8(iy2,jy0);
186             dz20             = _fjsp_sub_v2r8(iz2,jz0);
187             dx30             = _fjsp_sub_v2r8(ix3,jx0);
188             dy30             = _fjsp_sub_v2r8(iy3,jy0);
189             dz30             = _fjsp_sub_v2r8(iz3,jz0);
190
191             /* Calculate squared distance and things based on it */
192             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
193             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
194             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
195
196             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
197             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
198             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
199
200             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
201             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
202             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
203
204             /* Load parameters for j particles */
205             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
206
207             fjx0             = _fjsp_setzero_v2r8();
208             fjy0             = _fjsp_setzero_v2r8();
209             fjz0             = _fjsp_setzero_v2r8();
210
211             /**************************
212              * CALCULATE INTERACTIONS *
213              **************************/
214
215             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
216
217             /* Compute parameters for interactions between i and j atoms */
218             qq10             = _fjsp_mul_v2r8(iq1,jq0);
219
220             /* EWALD ELECTROSTATICS */
221
222             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
223             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
224             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
225             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
226             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
227
228             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
229             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
230             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
231             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
232             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
233             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
234             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
235             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
236             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
237             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
238
239             /* Update potential sum for this i atom from the interaction with this j atom. */
240             velecsum         = _fjsp_add_v2r8(velecsum,velec);
241
242             fscal            = felec;
243
244             /* Update vectorial force */
245             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
246             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
247             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
248             
249             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
250             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
251             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
252
253             /**************************
254              * CALCULATE INTERACTIONS *
255              **************************/
256
257             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
258
259             /* Compute parameters for interactions between i and j atoms */
260             qq20             = _fjsp_mul_v2r8(iq2,jq0);
261
262             /* EWALD ELECTROSTATICS */
263
264             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
265             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
266             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
267             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
268             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
269
270             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
271             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
272             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
273             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
274             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
275             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
276             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
277             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
278             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
279             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
280
281             /* Update potential sum for this i atom from the interaction with this j atom. */
282             velecsum         = _fjsp_add_v2r8(velecsum,velec);
283
284             fscal            = felec;
285
286             /* Update vectorial force */
287             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
288             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
289             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
290             
291             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
292             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
293             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
294
295             /**************************
296              * CALCULATE INTERACTIONS *
297              **************************/
298
299             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
300
301             /* Compute parameters for interactions between i and j atoms */
302             qq30             = _fjsp_mul_v2r8(iq3,jq0);
303
304             /* EWALD ELECTROSTATICS */
305
306             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
307             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
308             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
309             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
310             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
311
312             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
313             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
314             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
315             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
316             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
317             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
318             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
319             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
320             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
321             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
322
323             /* Update potential sum for this i atom from the interaction with this j atom. */
324             velecsum         = _fjsp_add_v2r8(velecsum,velec);
325
326             fscal            = felec;
327
328             /* Update vectorial force */
329             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
330             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
331             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
332             
333             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
334             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
335             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
336
337             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
338
339             /* Inner loop uses 135 flops */
340         }
341
342         if(jidx<j_index_end)
343         {
344
345             jnrA             = jjnr[jidx];
346             j_coord_offsetA  = DIM*jnrA;
347
348             /* load j atom coordinates */
349             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
350                                               &jx0,&jy0,&jz0);
351
352             /* Calculate displacement vector */
353             dx10             = _fjsp_sub_v2r8(ix1,jx0);
354             dy10             = _fjsp_sub_v2r8(iy1,jy0);
355             dz10             = _fjsp_sub_v2r8(iz1,jz0);
356             dx20             = _fjsp_sub_v2r8(ix2,jx0);
357             dy20             = _fjsp_sub_v2r8(iy2,jy0);
358             dz20             = _fjsp_sub_v2r8(iz2,jz0);
359             dx30             = _fjsp_sub_v2r8(ix3,jx0);
360             dy30             = _fjsp_sub_v2r8(iy3,jy0);
361             dz30             = _fjsp_sub_v2r8(iz3,jz0);
362
363             /* Calculate squared distance and things based on it */
364             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
365             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
366             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
367
368             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
369             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
370             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
371
372             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
373             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
374             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
375
376             /* Load parameters for j particles */
377             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
378
379             fjx0             = _fjsp_setzero_v2r8();
380             fjy0             = _fjsp_setzero_v2r8();
381             fjz0             = _fjsp_setzero_v2r8();
382
383             /**************************
384              * CALCULATE INTERACTIONS *
385              **************************/
386
387             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
388
389             /* Compute parameters for interactions between i and j atoms */
390             qq10             = _fjsp_mul_v2r8(iq1,jq0);
391
392             /* EWALD ELECTROSTATICS */
393
394             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
396             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
397             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
398             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
399
400             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
401             ewtabD           = _fjsp_setzero_v2r8();
402             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
403             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
404             ewtabFn          = _fjsp_setzero_v2r8();
405             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
406             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
407             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
408             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
409             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
410
411             /* Update potential sum for this i atom from the interaction with this j atom. */
412             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
413             velecsum         = _fjsp_add_v2r8(velecsum,velec);
414
415             fscal            = felec;
416
417             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
418
419             /* Update vectorial force */
420             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
421             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
422             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
423             
424             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
425             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
426             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
433
434             /* Compute parameters for interactions between i and j atoms */
435             qq20             = _fjsp_mul_v2r8(iq2,jq0);
436
437             /* EWALD ELECTROSTATICS */
438
439             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
440             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
441             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
442             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
443             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
444
445             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
446             ewtabD           = _fjsp_setzero_v2r8();
447             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
448             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
449             ewtabFn          = _fjsp_setzero_v2r8();
450             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
451             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
452             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
453             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
454             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
455
456             /* Update potential sum for this i atom from the interaction with this j atom. */
457             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
458             velecsum         = _fjsp_add_v2r8(velecsum,velec);
459
460             fscal            = felec;
461
462             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
463
464             /* Update vectorial force */
465             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
466             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
467             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
468             
469             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
470             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
471             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
478
479             /* Compute parameters for interactions between i and j atoms */
480             qq30             = _fjsp_mul_v2r8(iq3,jq0);
481
482             /* EWALD ELECTROSTATICS */
483
484             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
485             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
486             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
487             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
488             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
489
490             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
491             ewtabD           = _fjsp_setzero_v2r8();
492             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
493             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
494             ewtabFn          = _fjsp_setzero_v2r8();
495             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
496             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
497             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
498             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
499             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
500
501             /* Update potential sum for this i atom from the interaction with this j atom. */
502             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
503             velecsum         = _fjsp_add_v2r8(velecsum,velec);
504
505             fscal            = felec;
506
507             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
508
509             /* Update vectorial force */
510             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
511             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
512             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
513             
514             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
515             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
516             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
517
518             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
519
520             /* Inner loop uses 135 flops */
521         }
522
523         /* End of innermost loop */
524
525         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
526                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
527
528         ggid                        = gid[iidx];
529         /* Update potential energies */
530         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
531
532         /* Increment number of inner iterations */
533         inneriter                  += j_index_end - j_index_start;
534
535         /* Outer loop uses 19 flops */
536     }
537
538     /* Increment number of outer iterations */
539     outeriter        += nri;
540
541     /* Update outer/inner flops */
542
543     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*135);
544 }
545 /*
546  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
547  * Electrostatics interaction: Ewald
548  * VdW interaction:            None
549  * Geometry:                   Water4-Particle
550  * Calculate force/pot:        Force
551  */
552 void
553 nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
554                     (t_nblist                    * gmx_restrict       nlist,
555                      rvec                        * gmx_restrict          xx,
556                      rvec                        * gmx_restrict          ff,
557                      t_forcerec                  * gmx_restrict          fr,
558                      t_mdatoms                   * gmx_restrict     mdatoms,
559                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
560                      t_nrnb                      * gmx_restrict        nrnb)
561 {
562     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
563      * just 0 for non-waters.
564      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
565      * jnr indices corresponding to data put in the four positions in the SIMD register.
566      */
567     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
568     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
569     int              jnrA,jnrB;
570     int              j_coord_offsetA,j_coord_offsetB;
571     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
572     real             rcutoff_scalar;
573     real             *shiftvec,*fshift,*x,*f;
574     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
575     int              vdwioffset1;
576     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
577     int              vdwioffset2;
578     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
579     int              vdwioffset3;
580     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
581     int              vdwjidx0A,vdwjidx0B;
582     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
583     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
584     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
585     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
586     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
587     real             *charge;
588     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
589     real             *ewtab;
590     _fjsp_v2r8       itab_tmp;
591     _fjsp_v2r8       dummy_mask,cutoff_mask;
592     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
593     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
594     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
595
596     x                = xx[0];
597     f                = ff[0];
598
599     nri              = nlist->nri;
600     iinr             = nlist->iinr;
601     jindex           = nlist->jindex;
602     jjnr             = nlist->jjnr;
603     shiftidx         = nlist->shift;
604     gid              = nlist->gid;
605     shiftvec         = fr->shift_vec[0];
606     fshift           = fr->fshift[0];
607     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
608     charge           = mdatoms->chargeA;
609
610     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
611     ewtab            = fr->ic->tabq_coul_F;
612     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
613     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
614
615     /* Setup water-specific parameters */
616     inr              = nlist->iinr[0];
617     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
618     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
619     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
620
621     /* Avoid stupid compiler warnings */
622     jnrA = jnrB = 0;
623     j_coord_offsetA = 0;
624     j_coord_offsetB = 0;
625
626     outeriter        = 0;
627     inneriter        = 0;
628
629     /* Start outer loop over neighborlists */
630     for(iidx=0; iidx<nri; iidx++)
631     {
632         /* Load shift vector for this list */
633         i_shift_offset   = DIM*shiftidx[iidx];
634
635         /* Load limits for loop over neighbors */
636         j_index_start    = jindex[iidx];
637         j_index_end      = jindex[iidx+1];
638
639         /* Get outer coordinate index */
640         inr              = iinr[iidx];
641         i_coord_offset   = DIM*inr;
642
643         /* Load i particle coords and add shift vector */
644         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
645                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
646
647         fix1             = _fjsp_setzero_v2r8();
648         fiy1             = _fjsp_setzero_v2r8();
649         fiz1             = _fjsp_setzero_v2r8();
650         fix2             = _fjsp_setzero_v2r8();
651         fiy2             = _fjsp_setzero_v2r8();
652         fiz2             = _fjsp_setzero_v2r8();
653         fix3             = _fjsp_setzero_v2r8();
654         fiy3             = _fjsp_setzero_v2r8();
655         fiz3             = _fjsp_setzero_v2r8();
656
657         /* Start inner kernel loop */
658         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
659         {
660
661             /* Get j neighbor index, and coordinate index */
662             jnrA             = jjnr[jidx];
663             jnrB             = jjnr[jidx+1];
664             j_coord_offsetA  = DIM*jnrA;
665             j_coord_offsetB  = DIM*jnrB;
666
667             /* load j atom coordinates */
668             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
669                                               &jx0,&jy0,&jz0);
670
671             /* Calculate displacement vector */
672             dx10             = _fjsp_sub_v2r8(ix1,jx0);
673             dy10             = _fjsp_sub_v2r8(iy1,jy0);
674             dz10             = _fjsp_sub_v2r8(iz1,jz0);
675             dx20             = _fjsp_sub_v2r8(ix2,jx0);
676             dy20             = _fjsp_sub_v2r8(iy2,jy0);
677             dz20             = _fjsp_sub_v2r8(iz2,jz0);
678             dx30             = _fjsp_sub_v2r8(ix3,jx0);
679             dy30             = _fjsp_sub_v2r8(iy3,jy0);
680             dz30             = _fjsp_sub_v2r8(iz3,jz0);
681
682             /* Calculate squared distance and things based on it */
683             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
684             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
685             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
686
687             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
688             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
689             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
690
691             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
692             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
693             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
694
695             /* Load parameters for j particles */
696             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
697
698             fjx0             = _fjsp_setzero_v2r8();
699             fjy0             = _fjsp_setzero_v2r8();
700             fjz0             = _fjsp_setzero_v2r8();
701
702             /**************************
703              * CALCULATE INTERACTIONS *
704              **************************/
705
706             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
707
708             /* Compute parameters for interactions between i and j atoms */
709             qq10             = _fjsp_mul_v2r8(iq1,jq0);
710
711             /* EWALD ELECTROSTATICS */
712
713             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
714             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
715             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
716             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
717             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
718
719             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
720                                          &ewtabF,&ewtabFn);
721             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
722             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
723
724             fscal            = felec;
725
726             /* Update vectorial force */
727             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
728             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
729             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
730             
731             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
732             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
733             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
734
735             /**************************
736              * CALCULATE INTERACTIONS *
737              **************************/
738
739             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
740
741             /* Compute parameters for interactions between i and j atoms */
742             qq20             = _fjsp_mul_v2r8(iq2,jq0);
743
744             /* EWALD ELECTROSTATICS */
745
746             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
747             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
748             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
749             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
750             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
751
752             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
753                                          &ewtabF,&ewtabFn);
754             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
755             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
756
757             fscal            = felec;
758
759             /* Update vectorial force */
760             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
761             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
762             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
763             
764             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
765             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
766             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
767
768             /**************************
769              * CALCULATE INTERACTIONS *
770              **************************/
771
772             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
773
774             /* Compute parameters for interactions between i and j atoms */
775             qq30             = _fjsp_mul_v2r8(iq3,jq0);
776
777             /* EWALD ELECTROSTATICS */
778
779             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
780             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
781             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
782             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
783             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
784
785             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
786                                          &ewtabF,&ewtabFn);
787             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
788             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
789
790             fscal            = felec;
791
792             /* Update vectorial force */
793             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
794             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
795             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
796             
797             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
798             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
799             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
800
801             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
802
803             /* Inner loop uses 120 flops */
804         }
805
806         if(jidx<j_index_end)
807         {
808
809             jnrA             = jjnr[jidx];
810             j_coord_offsetA  = DIM*jnrA;
811
812             /* load j atom coordinates */
813             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
814                                               &jx0,&jy0,&jz0);
815
816             /* Calculate displacement vector */
817             dx10             = _fjsp_sub_v2r8(ix1,jx0);
818             dy10             = _fjsp_sub_v2r8(iy1,jy0);
819             dz10             = _fjsp_sub_v2r8(iz1,jz0);
820             dx20             = _fjsp_sub_v2r8(ix2,jx0);
821             dy20             = _fjsp_sub_v2r8(iy2,jy0);
822             dz20             = _fjsp_sub_v2r8(iz2,jz0);
823             dx30             = _fjsp_sub_v2r8(ix3,jx0);
824             dy30             = _fjsp_sub_v2r8(iy3,jy0);
825             dz30             = _fjsp_sub_v2r8(iz3,jz0);
826
827             /* Calculate squared distance and things based on it */
828             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
829             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
830             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
831
832             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
833             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
834             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
835
836             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
837             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
838             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
839
840             /* Load parameters for j particles */
841             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
842
843             fjx0             = _fjsp_setzero_v2r8();
844             fjy0             = _fjsp_setzero_v2r8();
845             fjz0             = _fjsp_setzero_v2r8();
846
847             /**************************
848              * CALCULATE INTERACTIONS *
849              **************************/
850
851             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
852
853             /* Compute parameters for interactions between i and j atoms */
854             qq10             = _fjsp_mul_v2r8(iq1,jq0);
855
856             /* EWALD ELECTROSTATICS */
857
858             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
859             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
860             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
861             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
862             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
863
864             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
865             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
866             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
867
868             fscal            = felec;
869
870             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
871
872             /* Update vectorial force */
873             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
874             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
875             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
876             
877             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
878             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
879             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
880
881             /**************************
882              * CALCULATE INTERACTIONS *
883              **************************/
884
885             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
886
887             /* Compute parameters for interactions between i and j atoms */
888             qq20             = _fjsp_mul_v2r8(iq2,jq0);
889
890             /* EWALD ELECTROSTATICS */
891
892             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
894             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
895             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
896             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
897
898             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
899             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
900             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
901
902             fscal            = felec;
903
904             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
905
906             /* Update vectorial force */
907             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
908             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
909             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
910             
911             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
912             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
913             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
914
915             /**************************
916              * CALCULATE INTERACTIONS *
917              **************************/
918
919             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
920
921             /* Compute parameters for interactions between i and j atoms */
922             qq30             = _fjsp_mul_v2r8(iq3,jq0);
923
924             /* EWALD ELECTROSTATICS */
925
926             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
927             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
928             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
929             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
930             _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
931
932             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
933             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
934             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
935
936             fscal            = felec;
937
938             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
939
940             /* Update vectorial force */
941             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
942             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
943             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
944             
945             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
946             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
947             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
948
949             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
950
951             /* Inner loop uses 120 flops */
952         }
953
954         /* End of innermost loop */
955
956         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
957                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
958
959         /* Increment number of inner iterations */
960         inneriter                  += j_index_end - j_index_start;
961
962         /* Outer loop uses 18 flops */
963     }
964
965     /* Increment number of outer iterations */
966     outeriter        += nri;
967
968     /* Update outer/inner flops */
969
970     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
971 }