Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "kernelutil_sparc64_hpc_ace_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            None
53  * Geometry:                   Water3-Particle
54  * Calculate force/pot:        PotentialAndForce
    *
    * For every entry of the neighbor list, three i sites (suffixes 0,1,2) interact
    * with each listed j atom through tabulated electrostatics only. The j atoms are
    * processed two per iteration (SIMD elements A and B); when the j count is odd,
    * a separate single-atom pass after the loop handles the last one.
    *
    * Arguments, as used in this function:
    *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
    *   xx          - coordinates, accessed through xx[0] as a flat array of reals.
    *   ff          - forces, accessed through ff[0]; passed to the j-force decrement
    *                 and i-force update helpers.
    *   fr          - supplies shift_vec, fshift (passed to the i-force update helper)
    *                 and epsfac.
    *   mdatoms     - supplies chargeA.
    *   kernel_data - supplies table_elec (data, scale) and energygrp_elec.
    *                 NOTE(review): declared gmx_unused although it is used below.
    *   nrnb        - flop accounting, updated through inc_nrnb().
55  */
56 void
57 nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      t_forcerec                  * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
69      * jnr indices corresponding to data put in the two positions in the SIMD register.
        *
        * NOTE(review): several locals declared below (rcutoff_scalar, tx/ty/tz, rcutoff,
        * rcutoff2, jidxall, vdwioffset0-2, vdwjidx0A/B, isai0-2, isaj0, rinvsq*, c6_*, c12_*,
        * crf, krf, krf2, Heps, dummy_mask, cutoff_mask, one, two, gbconv, ewconv) are not
        * referenced again in this function; presumably leftovers of the kernel generator.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwjidx0A,vdwjidx0B;
86     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
88     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
89     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
90     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
93     real             *vftab;
94     _fjsp_v2r8       itab_tmp;
95     _fjsp_v2r8       dummy_mask,cutoff_mask;
96     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
97     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
98     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
99
100     x                = xx[0];
101     f                = ff[0];
102
103     nri              = nlist->nri;
104     iinr             = nlist->iinr;
105     jindex           = nlist->jindex;
106     jjnr             = nlist->jjnr;
107     shiftidx         = nlist->shift;
108     gid              = nlist->gid;
109     shiftvec         = fr->shift_vec[0];
110     fshift           = fr->fshift[0];
111     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
112     charge           = mdatoms->chargeA;
113
114     vftab            = kernel_data->table_elec->data;
115     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
116
117     /* Setup water-specific parameters.
         * The three i-site charges (multiplied by facel) are taken once, from the first
         * list entry iinr[0], and reused for every i entry of the outer loop below.
         * NOTE(review): this presumes all i molecules in the list carry the same charges,
         * and that iinr[0] is readable even when nri is 0 - confirm against the caller.
         */
118     inr              = nlist->iinr[0];
119     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
120     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
121     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
122
123     /* Avoid stupid compiler warnings */
124     jnrA = jnrB = 0;
125     j_coord_offsetA = 0;
126     j_coord_offsetB = 0;
127
128     outeriter        = 0;
129     inneriter        = 0;
130
131     /* Start outer loop over neighborlists */
132     for(iidx=0; iidx<nri; iidx++)
133     {
134         /* Load shift vector for this list */
135         i_shift_offset   = DIM*shiftidx[iidx];
136
137         /* Load limits for loop over neighbors */
138         j_index_start    = jindex[iidx];
139         j_index_end      = jindex[iidx+1];
140
141         /* Get outer coordinate index */
142         inr              = iinr[iidx];
143         i_coord_offset   = DIM*inr;
144
145         /* Load i particle coords and add shift vector */
146         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
147                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
148
149         fix0             = _fjsp_setzero_v2r8();
150         fiy0             = _fjsp_setzero_v2r8();
151         fiz0             = _fjsp_setzero_v2r8();
152         fix1             = _fjsp_setzero_v2r8();
153         fiy1             = _fjsp_setzero_v2r8();
154         fiz1             = _fjsp_setzero_v2r8();
155         fix2             = _fjsp_setzero_v2r8();
156         fiy2             = _fjsp_setzero_v2r8();
157         fiz2             = _fjsp_setzero_v2r8();
158
159         /* Reset potential sums */
160         velecsum         = _fjsp_setzero_v2r8();
161
162         /* Start inner kernel loop.
             * Two j atoms (A and B) are handled per iteration. The bound j_index_end-1
             * leaves a final unpaired j atom, if any, to the block after this loop.
             */
163         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
164         {
165
166             /* Get j neighbor index, and coordinate index */
167             jnrA             = jjnr[jidx];
168             jnrB             = jjnr[jidx+1];
169             j_coord_offsetA  = DIM*jnrA;
170             j_coord_offsetB  = DIM*jnrB;
171
172             /* load j atom coordinates */
173             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
174                                               &jx0,&jy0,&jz0);
175
176             /* Calculate displacement vector */
177             dx00             = _fjsp_sub_v2r8(ix0,jx0);
178             dy00             = _fjsp_sub_v2r8(iy0,jy0);
179             dz00             = _fjsp_sub_v2r8(iz0,jz0);
180             dx10             = _fjsp_sub_v2r8(ix1,jx0);
181             dy10             = _fjsp_sub_v2r8(iy1,jy0);
182             dz10             = _fjsp_sub_v2r8(iz1,jz0);
183             dx20             = _fjsp_sub_v2r8(ix2,jx0);
184             dy20             = _fjsp_sub_v2r8(iy2,jy0);
185             dz20             = _fjsp_sub_v2r8(iz2,jz0);
186
187             /* Calculate squared distance and things based on it */
188             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
189             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
190             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
191
192             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
193             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
194             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
195
196             /* Load parameters for j particles */
197             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
198
199             fjx0             = _fjsp_setzero_v2r8();
200             fjy0             = _fjsp_setzero_v2r8();
201             fjz0             = _fjsp_setzero_v2r8();
202
203             /**************************
204              * CALCULATE INTERACTIONS *
                 * i site 0 with j atom   *
205              **************************/
206
207             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
208
209             /* Compute parameters for interactions between i and j atoms */
210             qq00             = _fjsp_mul_v2r8(iq0,jq0);
211
212             /* Calculate table index by multiplying r with table scale and truncate to integer */
213             rt               = _fjsp_mul_v2r8(r00,vftabscale);
214             itab_tmp         = _fjsp_dtox_v2r8(rt);
215             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
216             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
217             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
218
219             vfconv.i[0]     *= 4;
220             vfconv.i[1]     *= 4;
221
222             /* CUBIC SPLINE TABLE ELECTROSTATICS
                 * The integer index was multiplied by 4 above and the table is read at
                 * offsets +0 and +2 from it; the values are named Y,F and G,H after the
                 * transposes. With eps = vfeps, and assuming _fjsp_madd_v2r8(a,b,c)
                 * evaluates a*b+c (TODO confirm), the statements below compute:
                 *   Fp    = F + eps*(G + eps*H)
                 *   VV    = Y + eps*Fp
                 *   velec = qq*VV
                 *   FF    = Fp + eps*(G + 2*eps*H)
                 *   felec = -(qq*FF)*(vftabscale*rinv)
                 * The same sequence is repeated for i sites 1 and 2 further down.
                 */
223             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
224             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
225             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
226             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
227             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
228             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
229             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
230             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
231             velec            = _fjsp_mul_v2r8(qq00,VV);
232             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
233             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
234
235             /* Update potential sum for this i atom from the interaction with this j atom. */
236             velecsum         = _fjsp_add_v2r8(velecsum,velec);
237
238             fscal            = felec;
239
240             /* Update vectorial force.
                 * The same dx*fscal products are accumulated into the i-site sums (fi*)
                 * and into the j sums (fj*); the fj* sums are handed to the decrement
                 * helper at the end of this iteration.
                 */
241             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
242             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
243             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
244             
245             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
246             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
247             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
248
249             /**************************
250              * CALCULATE INTERACTIONS *
                 * i site 1 with j atom   *
251              **************************/
252
253             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
254
255             /* Compute parameters for interactions between i and j atoms */
256             qq10             = _fjsp_mul_v2r8(iq1,jq0);
257
258             /* Calculate table index by multiplying r with table scale and truncate to integer */
259             rt               = _fjsp_mul_v2r8(r10,vftabscale);
260             itab_tmp         = _fjsp_dtox_v2r8(rt);
261             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
262             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
263             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
264
265             vfconv.i[0]     *= 4;
266             vfconv.i[1]     *= 4;
267
268             /* CUBIC SPLINE TABLE ELECTROSTATICS */
269             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
270             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
271             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
272             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
273             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
274             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
275             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
276             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
277             velec            = _fjsp_mul_v2r8(qq10,VV);
278             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
279             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
280
281             /* Update potential sum for this i atom from the interaction with this j atom. */
282             velecsum         = _fjsp_add_v2r8(velecsum,velec);
283
284             fscal            = felec;
285
286             /* Update vectorial force */
287             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
288             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
289             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
290             
291             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
292             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
293             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
294
295             /**************************
296              * CALCULATE INTERACTIONS *
                 * i site 2 with j atom   *
297              **************************/
298
299             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
300
301             /* Compute parameters for interactions between i and j atoms */
302             qq20             = _fjsp_mul_v2r8(iq2,jq0);
303
304             /* Calculate table index by multiplying r with table scale and truncate to integer */
305             rt               = _fjsp_mul_v2r8(r20,vftabscale);
306             itab_tmp         = _fjsp_dtox_v2r8(rt);
307             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
308             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
309             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
310
311             vfconv.i[0]     *= 4;
312             vfconv.i[1]     *= 4;
313
314             /* CUBIC SPLINE TABLE ELECTROSTATICS */
315             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
316             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
317             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
318             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
319             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
320             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
321             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
322             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
323             velec            = _fjsp_mul_v2r8(qq20,VV);
324             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
325             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
326
327             /* Update potential sum for this i atom from the interaction with this j atom. */
328             velecsum         = _fjsp_add_v2r8(velecsum,velec);
329
330             fscal            = felec;
331
332             /* Update vectorial force */
333             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
334             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
335             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
336             
337             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
338             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
339             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
340
341             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
342
343             /* Inner loop uses 141 flops */
344         }
345
346         if(jidx<j_index_end)
347         {
348             /* Remainder pass: one unpaired j atom is left. Only jnrA is used; the
                 * interactions below mirror the paired loop with the second-atom inputs
                 * replaced by zeros.
                 */
349             jnrA             = jjnr[jidx];
350             j_coord_offsetA  = DIM*jnrA;
351
352             /* load j atom coordinates */
353             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
354                                               &jx0,&jy0,&jz0);
355
356             /* Calculate displacement vector */
357             dx00             = _fjsp_sub_v2r8(ix0,jx0);
358             dy00             = _fjsp_sub_v2r8(iy0,jy0);
359             dz00             = _fjsp_sub_v2r8(iz0,jz0);
360             dx10             = _fjsp_sub_v2r8(ix1,jx0);
361             dy10             = _fjsp_sub_v2r8(iy1,jy0);
362             dz10             = _fjsp_sub_v2r8(iz1,jz0);
363             dx20             = _fjsp_sub_v2r8(ix2,jx0);
364             dy20             = _fjsp_sub_v2r8(iy2,jy0);
365             dz20             = _fjsp_sub_v2r8(iz2,jz0);
366
367             /* Calculate squared distance and things based on it */
368             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
369             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
370             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
371
372             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
373             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
374             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
375
376             /* Load parameters for j particles */
377             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
378
379             fjx0             = _fjsp_setzero_v2r8();
380             fjy0             = _fjsp_setzero_v2r8();
381             fjz0             = _fjsp_setzero_v2r8();
382
383             /**************************
384              * CALCULATE INTERACTIONS *
                 * i site 0 with j atom   *
385              **************************/
386
387             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
388
389             /* Compute parameters for interactions between i and j atoms */
390             qq00             = _fjsp_mul_v2r8(iq0,jq0);
391
392             /* Calculate table index by multiplying r with table scale and truncate to integer */
393             rt               = _fjsp_mul_v2r8(r00,vftabscale);
394             itab_tmp         = _fjsp_dtox_v2r8(rt);
395             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
396             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
397             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
398
399             vfconv.i[0]     *= 4;
400             vfconv.i[1]     *= 4;
401
402             /* CUBIC SPLINE TABLE ELECTROSTATICS
                 * Only vfconv.i[0] is used for table loads in this pass; F and H are set
                 * to zero where the paired loop loaded the second atom's table entries.
                 */
403             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
404             F                = _fjsp_setzero_v2r8();
405             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
406             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
407             H                = _fjsp_setzero_v2r8();
408             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
409             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
410             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
411             velec            = _fjsp_mul_v2r8(qq00,VV);
412             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
413             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
414
415             /* Update potential sum for this i atom from the interaction with this j atom.
                 * NOTE(review): the unpacklo with a zero vector presumably keeps only the
                 * element belonging to atom A and clears the other one - confirm against
                 * the _fjsp_unpacklo_v2r8 definition. The same is done to fscal below.
                 */
416             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
417             velecsum         = _fjsp_add_v2r8(velecsum,velec);
418
419             fscal            = felec;
420
421             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
422
423             /* Update vectorial force */
424             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
425             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
426             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
427             
428             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
429             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
430             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
431
432             /**************************
433              * CALCULATE INTERACTIONS *
                 * i site 1 with j atom   *
434              **************************/
435
436             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
437
438             /* Compute parameters for interactions between i and j atoms */
439             qq10             = _fjsp_mul_v2r8(iq1,jq0);
440
441             /* Calculate table index by multiplying r with table scale and truncate to integer */
442             rt               = _fjsp_mul_v2r8(r10,vftabscale);
443             itab_tmp         = _fjsp_dtox_v2r8(rt);
444             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
445             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
446             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
447
448             vfconv.i[0]     *= 4;
449             vfconv.i[1]     *= 4;
450
451             /* CUBIC SPLINE TABLE ELECTROSTATICS */
452             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
453             F                = _fjsp_setzero_v2r8();
454             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
455             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
456             H                = _fjsp_setzero_v2r8();
457             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
458             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
459             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
460             velec            = _fjsp_mul_v2r8(qq10,VV);
461             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
462             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
463
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
466             velecsum         = _fjsp_add_v2r8(velecsum,velec);
467
468             fscal            = felec;
469
470             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
471
472             /* Update vectorial force */
473             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
474             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
475             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
476             
477             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
478             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
479             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
480
481             /**************************
482              * CALCULATE INTERACTIONS *
                 * i site 2 with j atom   *
483              **************************/
484
485             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
486
487             /* Compute parameters for interactions between i and j atoms */
488             qq20             = _fjsp_mul_v2r8(iq2,jq0);
489
490             /* Calculate table index by multiplying r with table scale and truncate to integer */
491             rt               = _fjsp_mul_v2r8(r20,vftabscale);
492             itab_tmp         = _fjsp_dtox_v2r8(rt);
493             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
494             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
495             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
496
497             vfconv.i[0]     *= 4;
498             vfconv.i[1]     *= 4;
499
500             /* CUBIC SPLINE TABLE ELECTROSTATICS */
501             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
502             F                = _fjsp_setzero_v2r8();
503             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
504             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
505             H                = _fjsp_setzero_v2r8();
506             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
507             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
508             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
509             velec            = _fjsp_mul_v2r8(qq20,VV);
510             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
511             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
515             velecsum         = _fjsp_add_v2r8(velecsum,velec);
516
517             fscal            = felec;
518
519             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
520
521             /* Update vectorial force */
522             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
523             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
524             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
525             
526             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
527             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
528             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
529
530             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
531
532             /* Inner loop uses 141 flops */
533         }
534
535         /* End of innermost loop */
536
537         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
538                                               f+i_coord_offset,fshift+i_shift_offset);
539
540         ggid                        = gid[iidx];
541         /* Update potential energies */
542         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
543
544         /* Increment number of inner iterations.
             * Counts every j entry of this list, including one handled by the remainder pass.
             */
545         inneriter                  += j_index_end - j_index_start;
546
547         /* Outer loop uses 19 flops */
548     }
549
550     /* Increment number of outer iterations */
551     outeriter        += nri;
552
553     /* Update outer/inner flops */
554
555     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141);
556 }
557 /*
558  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
559  * Electrostatics interaction: CubicSplineTable
560  * VdW interaction:            None
561  * Geometry:                   Water3-Particle
562  * Calculate force/pot:        Force
563  */
564 void
565 nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
566                     (t_nblist                    * gmx_restrict       nlist,
567                      rvec                        * gmx_restrict          xx,
568                      rvec                        * gmx_restrict          ff,
569                      t_forcerec                  * gmx_restrict          fr,
570                      t_mdatoms                   * gmx_restrict     mdatoms,
571                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
572                      t_nrnb                      * gmx_restrict        nrnb)
573 {
574     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
575      * just 0 for non-waters.
576      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
577      * jnr indices corresponding to data put in the two positions in the SIMD register.
578      */
579     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
580     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
581     int              jnrA,jnrB;
582     int              j_coord_offsetA,j_coord_offsetB;
583     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
584     real             rcutoff_scalar;
585     real             *shiftvec,*fshift,*x,*f;
586     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
587     int              vdwioffset0;
588     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
589     int              vdwioffset1;
590     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
591     int              vdwioffset2;
592     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
593     int              vdwjidx0A,vdwjidx0B;
594     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
595     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
596     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
597     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
598     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
599     real             *charge;
600     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
601     real             *vftab;
602     _fjsp_v2r8       itab_tmp;
603     _fjsp_v2r8       dummy_mask,cutoff_mask;
604     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
605     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
606     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
607
608     x                = xx[0];
609     f                = ff[0];
610
611     nri              = nlist->nri;
612     iinr             = nlist->iinr;
613     jindex           = nlist->jindex;
614     jjnr             = nlist->jjnr;
615     shiftidx         = nlist->shift;
616     gid              = nlist->gid;
617     shiftvec         = fr->shift_vec[0];
618     fshift           = fr->fshift[0];
619     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
620     charge           = mdatoms->chargeA;
621
622     vftab            = kernel_data->table_elec->data;
623     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
624
625     /* Setup water-specific parameters */
626     inr              = nlist->iinr[0];
627     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
628     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
629     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
630
631     /* Avoid stupid compiler warnings */
632     jnrA = jnrB = 0;
633     j_coord_offsetA = 0;
634     j_coord_offsetB = 0;
635
636     outeriter        = 0;
637     inneriter        = 0;
638
639     /* Start outer loop over neighborlists */
640     for(iidx=0; iidx<nri; iidx++)
641     {
642         /* Load shift vector for this list */
643         i_shift_offset   = DIM*shiftidx[iidx];
644
645         /* Load limits for loop over neighbors */
646         j_index_start    = jindex[iidx];
647         j_index_end      = jindex[iidx+1];
648
649         /* Get outer coordinate index */
650         inr              = iinr[iidx];
651         i_coord_offset   = DIM*inr;
652
653         /* Load i particle coords and add shift vector */
654         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
655                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
656
657         fix0             = _fjsp_setzero_v2r8();
658         fiy0             = _fjsp_setzero_v2r8();
659         fiz0             = _fjsp_setzero_v2r8();
660         fix1             = _fjsp_setzero_v2r8();
661         fiy1             = _fjsp_setzero_v2r8();
662         fiz1             = _fjsp_setzero_v2r8();
663         fix2             = _fjsp_setzero_v2r8();
664         fiy2             = _fjsp_setzero_v2r8();
665         fiz2             = _fjsp_setzero_v2r8();
666
667         /* Start inner kernel loop */
668         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
669         {
670
671             /* Get j neighbor index, and coordinate index */
672             jnrA             = jjnr[jidx];
673             jnrB             = jjnr[jidx+1];
674             j_coord_offsetA  = DIM*jnrA;
675             j_coord_offsetB  = DIM*jnrB;
676
677             /* load j atom coordinates */
678             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
679                                               &jx0,&jy0,&jz0);
680
681             /* Calculate displacement vector */
682             dx00             = _fjsp_sub_v2r8(ix0,jx0);
683             dy00             = _fjsp_sub_v2r8(iy0,jy0);
684             dz00             = _fjsp_sub_v2r8(iz0,jz0);
685             dx10             = _fjsp_sub_v2r8(ix1,jx0);
686             dy10             = _fjsp_sub_v2r8(iy1,jy0);
687             dz10             = _fjsp_sub_v2r8(iz1,jz0);
688             dx20             = _fjsp_sub_v2r8(ix2,jx0);
689             dy20             = _fjsp_sub_v2r8(iy2,jy0);
690             dz20             = _fjsp_sub_v2r8(iz2,jz0);
691
692             /* Calculate squared distance and things based on it */
693             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
694             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
695             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
696
697             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
698             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
699             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
700
701             /* Load parameters for j particles */
702             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
703
704             fjx0             = _fjsp_setzero_v2r8();
705             fjy0             = _fjsp_setzero_v2r8();
706             fjz0             = _fjsp_setzero_v2r8();
707
708             /**************************
709              * CALCULATE INTERACTIONS *
710              **************************/
711
712             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
713
714             /* Compute parameters for interactions between i and j atoms */
715             qq00             = _fjsp_mul_v2r8(iq0,jq0);
716
717             /* Calculate table index by multiplying r with table scale and truncate to integer */
718             rt               = _fjsp_mul_v2r8(r00,vftabscale);
719             itab_tmp         = _fjsp_dtox_v2r8(rt);
720             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
721             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
722             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
723
724             vfconv.i[0]     *= 4;
725             vfconv.i[1]     *= 4;
726
727             /* CUBIC SPLINE TABLE ELECTROSTATICS */
728             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
729             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
730             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
731             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
732             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
733             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
734             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
735             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
736             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
737
738             fscal            = felec;
739
740             /* Update vectorial force */
741             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
742             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
743             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
744             
745             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
746             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
747             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
748
749             /**************************
750              * CALCULATE INTERACTIONS *
751              **************************/
752
753             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
754
755             /* Compute parameters for interactions between i and j atoms */
756             qq10             = _fjsp_mul_v2r8(iq1,jq0);
757
758             /* Calculate table index by multiplying r with table scale and truncate to integer */
759             rt               = _fjsp_mul_v2r8(r10,vftabscale);
760             itab_tmp         = _fjsp_dtox_v2r8(rt);
761             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
762             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
763             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
764
765             vfconv.i[0]     *= 4;
766             vfconv.i[1]     *= 4;
767
768             /* CUBIC SPLINE TABLE ELECTROSTATICS */
769             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
770             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
771             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
772             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
773             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
774             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
775             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
776             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
777             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
778
779             fscal            = felec;
780
781             /* Update vectorial force */
782             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
783             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
784             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
785             
786             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
787             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
788             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
789
790             /**************************
791              * CALCULATE INTERACTIONS *
792              **************************/
793
794             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
795
796             /* Compute parameters for interactions between i and j atoms */
797             qq20             = _fjsp_mul_v2r8(iq2,jq0);
798
799             /* Calculate table index by multiplying r with table scale and truncate to integer */
800             rt               = _fjsp_mul_v2r8(r20,vftabscale);
801             itab_tmp         = _fjsp_dtox_v2r8(rt);
802             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
803             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
804             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
805
806             vfconv.i[0]     *= 4;
807             vfconv.i[1]     *= 4;
808
809             /* CUBIC SPLINE TABLE ELECTROSTATICS */
810             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
811             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
812             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
813             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
814             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
815             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
816             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
817             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
818             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
819
820             fscal            = felec;
821
822             /* Update vectorial force */
823             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
824             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
825             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
826             
827             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
828             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
829             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
830
831             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
832
833             /* Inner loop uses 129 flops */
834         }
835
836         if(jidx<j_index_end)
837         {
838
839             jnrA             = jjnr[jidx];
840             j_coord_offsetA  = DIM*jnrA;
841
842             /* load j atom coordinates */
843             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
844                                               &jx0,&jy0,&jz0);
845
846             /* Calculate displacement vector */
847             dx00             = _fjsp_sub_v2r8(ix0,jx0);
848             dy00             = _fjsp_sub_v2r8(iy0,jy0);
849             dz00             = _fjsp_sub_v2r8(iz0,jz0);
850             dx10             = _fjsp_sub_v2r8(ix1,jx0);
851             dy10             = _fjsp_sub_v2r8(iy1,jy0);
852             dz10             = _fjsp_sub_v2r8(iz1,jz0);
853             dx20             = _fjsp_sub_v2r8(ix2,jx0);
854             dy20             = _fjsp_sub_v2r8(iy2,jy0);
855             dz20             = _fjsp_sub_v2r8(iz2,jz0);
856
857             /* Calculate squared distance and things based on it */
858             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
859             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
860             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
861
862             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
863             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
864             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
865
866             /* Load parameters for j particles */
867             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
868
869             fjx0             = _fjsp_setzero_v2r8();
870             fjy0             = _fjsp_setzero_v2r8();
871             fjz0             = _fjsp_setzero_v2r8();
872
873             /**************************
874              * CALCULATE INTERACTIONS *
875              **************************/
876
877             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
878
879             /* Compute parameters for interactions between i and j atoms */
880             qq00             = _fjsp_mul_v2r8(iq0,jq0);
881
882             /* Calculate table index by multiplying r with table scale and truncate to integer */
883             rt               = _fjsp_mul_v2r8(r00,vftabscale);
884             itab_tmp         = _fjsp_dtox_v2r8(rt);
885             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
886             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
887             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
888
889             vfconv.i[0]     *= 4;
890             vfconv.i[1]     *= 4;
891
892             /* CUBIC SPLINE TABLE ELECTROSTATICS */
893             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
894             F                = _fjsp_setzero_v2r8();
895             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
896             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
897             H                = _fjsp_setzero_v2r8();
898             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
899             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
900             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
901             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
902
903             fscal            = felec;
904
905             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
906
907             /* Update vectorial force */
908             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
909             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
910             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
911             
912             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
913             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
914             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
915
916             /**************************
917              * CALCULATE INTERACTIONS *
918              **************************/
919
920             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
921
922             /* Compute parameters for interactions between i and j atoms */
923             qq10             = _fjsp_mul_v2r8(iq1,jq0);
924
925             /* Calculate table index by multiplying r with table scale and truncate to integer */
926             rt               = _fjsp_mul_v2r8(r10,vftabscale);
927             itab_tmp         = _fjsp_dtox_v2r8(rt);
928             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
929             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
930             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
931
932             vfconv.i[0]     *= 4;
933             vfconv.i[1]     *= 4;
934
935             /* CUBIC SPLINE TABLE ELECTROSTATICS */
936             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
937             F                = _fjsp_setzero_v2r8();
938             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
939             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
940             H                = _fjsp_setzero_v2r8();
941             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
942             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
943             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
944             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
945
946             fscal            = felec;
947
948             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
949
950             /* Update vectorial force */
951             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
952             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
953             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
954             
955             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
956             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
957             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
958
959             /**************************
960              * CALCULATE INTERACTIONS *
961              **************************/
962
963             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
964
965             /* Compute parameters for interactions between i and j atoms */
966             qq20             = _fjsp_mul_v2r8(iq2,jq0);
967
968             /* Calculate table index by multiplying r with table scale and truncate to integer */
969             rt               = _fjsp_mul_v2r8(r20,vftabscale);
970             itab_tmp         = _fjsp_dtox_v2r8(rt);
971             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
972             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
973             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
974
975             vfconv.i[0]     *= 4;
976             vfconv.i[1]     *= 4;
977
978             /* CUBIC SPLINE TABLE ELECTROSTATICS */
979             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
980             F                = _fjsp_setzero_v2r8();
981             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
982             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
983             H                = _fjsp_setzero_v2r8();
984             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
985             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
986             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
987             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
988
989             fscal            = felec;
990
991             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
992
993             /* Update vectorial force */
994             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
995             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
996             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
997             
998             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
999             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1000             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1001
1002             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1003
1004             /* Inner loop uses 129 flops */
1005         }
1006
1007         /* End of innermost loop */
1008
1009         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1010                                               f+i_coord_offset,fshift+i_shift_offset);
1011
1012         /* Increment number of inner iterations */
1013         inneriter                  += j_index_end - j_index_start;
1014
1015         /* Outer loop uses 18 flops */
1016     }
1017
1018     /* Increment number of outer iterations */
1019     outeriter        += nri;
1020
1021     /* Update outer/inner flops */
1022
1023     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);
1024 }