571ae9e5a53f4fbc12d504f992bee3d0bea95b0a
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sparc64_hpc_ace_double / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "kernelutil_sparc64_hpc_ace_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
51  * Electrostatics interaction: Coulomb
52  * VdW interaction:            CubicSplineTable
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      t_forcerec                  * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
69      * jnr indices corresponding to data put in the two positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwjidx0A,vdwjidx0B;
86     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87     int              vdwjidx1A,vdwjidx1B;
88     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89     int              vdwjidx2A,vdwjidx2B;
90     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     int              nvdwtype;
103     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104     int              *vdwtype;
105     real             *vdwparam;
106     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
107     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
108     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
109     real             *vftab;
110     _fjsp_v2r8       itab_tmp;
111     _fjsp_v2r8       dummy_mask,cutoff_mask;
112     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
113     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
114     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
115
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
128     charge           = mdatoms->chargeA;
129     nvdwtype         = fr->ntype;
130     vdwparam         = fr->nbfp;
131     vdwtype          = mdatoms->typeA;
132
133     vftab            = kernel_data->table_vdw->data;
134     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
139     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
140     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
141     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
142
143     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
144     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
145     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
146     vdwjidx0A        = 2*vdwtype[inr+0];
147     qq00             = _fjsp_mul_v2r8(iq0,jq0);
148     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
149     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
150     qq01             = _fjsp_mul_v2r8(iq0,jq1);
151     qq02             = _fjsp_mul_v2r8(iq0,jq2);
152     qq10             = _fjsp_mul_v2r8(iq1,jq0);
153     qq11             = _fjsp_mul_v2r8(iq1,jq1);
154     qq12             = _fjsp_mul_v2r8(iq1,jq2);
155     qq20             = _fjsp_mul_v2r8(iq2,jq0);
156     qq21             = _fjsp_mul_v2r8(iq2,jq1);
157     qq22             = _fjsp_mul_v2r8(iq2,jq2);
158
159     /* Avoid stupid compiler warnings */
160     jnrA = jnrB = 0;
161     j_coord_offsetA = 0;
162     j_coord_offsetB = 0;
163
164     outeriter        = 0;
165     inneriter        = 0;
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184
185         fix0             = _fjsp_setzero_v2r8();
186         fiy0             = _fjsp_setzero_v2r8();
187         fiz0             = _fjsp_setzero_v2r8();
188         fix1             = _fjsp_setzero_v2r8();
189         fiy1             = _fjsp_setzero_v2r8();
190         fiz1             = _fjsp_setzero_v2r8();
191         fix2             = _fjsp_setzero_v2r8();
192         fiy2             = _fjsp_setzero_v2r8();
193         fiz2             = _fjsp_setzero_v2r8();
194
195         /* Reset potential sums */
196         velecsum         = _fjsp_setzero_v2r8();
197         vvdwsum          = _fjsp_setzero_v2r8();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208
209             /* load j atom coordinates */
210             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
211                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
212
213             /* Calculate displacement vector */
214             dx00             = _fjsp_sub_v2r8(ix0,jx0);
215             dy00             = _fjsp_sub_v2r8(iy0,jy0);
216             dz00             = _fjsp_sub_v2r8(iz0,jz0);
217             dx01             = _fjsp_sub_v2r8(ix0,jx1);
218             dy01             = _fjsp_sub_v2r8(iy0,jy1);
219             dz01             = _fjsp_sub_v2r8(iz0,jz1);
220             dx02             = _fjsp_sub_v2r8(ix0,jx2);
221             dy02             = _fjsp_sub_v2r8(iy0,jy2);
222             dz02             = _fjsp_sub_v2r8(iz0,jz2);
223             dx10             = _fjsp_sub_v2r8(ix1,jx0);
224             dy10             = _fjsp_sub_v2r8(iy1,jy0);
225             dz10             = _fjsp_sub_v2r8(iz1,jz0);
226             dx11             = _fjsp_sub_v2r8(ix1,jx1);
227             dy11             = _fjsp_sub_v2r8(iy1,jy1);
228             dz11             = _fjsp_sub_v2r8(iz1,jz1);
229             dx12             = _fjsp_sub_v2r8(ix1,jx2);
230             dy12             = _fjsp_sub_v2r8(iy1,jy2);
231             dz12             = _fjsp_sub_v2r8(iz1,jz2);
232             dx20             = _fjsp_sub_v2r8(ix2,jx0);
233             dy20             = _fjsp_sub_v2r8(iy2,jy0);
234             dz20             = _fjsp_sub_v2r8(iz2,jz0);
235             dx21             = _fjsp_sub_v2r8(ix2,jx1);
236             dy21             = _fjsp_sub_v2r8(iy2,jy1);
237             dz21             = _fjsp_sub_v2r8(iz2,jz1);
238             dx22             = _fjsp_sub_v2r8(ix2,jx2);
239             dy22             = _fjsp_sub_v2r8(iy2,jy2);
240             dz22             = _fjsp_sub_v2r8(iz2,jz2);
241
242             /* Calculate squared distance and things based on it */
243             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
244             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
245             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
246             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
247             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
248             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
249             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
250             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
251             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
252
253             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
254             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
255             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
256             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
257             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
258             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
259             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
260             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
261             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
262
263             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
264             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
265             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
266             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
267             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
268             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
269             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
270             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
271             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
272
273             fjx0             = _fjsp_setzero_v2r8();
274             fjy0             = _fjsp_setzero_v2r8();
275             fjz0             = _fjsp_setzero_v2r8();
276             fjx1             = _fjsp_setzero_v2r8();
277             fjy1             = _fjsp_setzero_v2r8();
278             fjz1             = _fjsp_setzero_v2r8();
279             fjx2             = _fjsp_setzero_v2r8();
280             fjy2             = _fjsp_setzero_v2r8();
281             fjz2             = _fjsp_setzero_v2r8();
282
283             /**************************
284              * CALCULATE INTERACTIONS *
285              **************************/
286
287             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
288
289             /* Calculate table index by multiplying r with table scale and truncate to integer */
290             rt               = _fjsp_mul_v2r8(r00,vftabscale);
291             itab_tmp         = _fjsp_dtox_v2r8(rt);
292             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
293             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
294             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
295
296             vfconv.i[0]     *= 8;
297             vfconv.i[1]     *= 8;
298
299             /* COULOMB ELECTROSTATICS */
300             velec            = _fjsp_mul_v2r8(qq00,rinv00);
301             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
302
303             /* CUBIC SPLINE TABLE DISPERSION */
304             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
305             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
306             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
307             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
308             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
309             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
310             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
311             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
312             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
313             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
314             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
315
316             /* CUBIC SPLINE TABLE REPULSION */
317             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
318             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
319             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
320             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
321             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
322             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
323             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
324             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
325             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
326             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
327             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
328             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
329             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
330
331             /* Update potential sum for this i atom from the interaction with this j atom. */
332             velecsum         = _fjsp_add_v2r8(velecsum,velec);
333             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
334
335             fscal            = _fjsp_add_v2r8(felec,fvdw);
336
337             /* Update vectorial force */
338             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
339             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
340             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
341             
342             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
343             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
344             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
345
346             /**************************
347              * CALCULATE INTERACTIONS *
348              **************************/
349
350             /* COULOMB ELECTROSTATICS */
351             velec            = _fjsp_mul_v2r8(qq01,rinv01);
352             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
353
354             /* Update potential sum for this i atom from the interaction with this j atom. */
355             velecsum         = _fjsp_add_v2r8(velecsum,velec);
356
357             fscal            = felec;
358
359             /* Update vectorial force */
360             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
361             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
362             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
363             
364             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
365             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
366             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
367
368             /**************************
369              * CALCULATE INTERACTIONS *
370              **************************/
371
372             /* COULOMB ELECTROSTATICS */
373             velec            = _fjsp_mul_v2r8(qq02,rinv02);
374             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
375
376             /* Update potential sum for this i atom from the interaction with this j atom. */
377             velecsum         = _fjsp_add_v2r8(velecsum,velec);
378
379             fscal            = felec;
380
381             /* Update vectorial force */
382             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
383             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
384             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
385             
386             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
387             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
388             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
389
390             /**************************
391              * CALCULATE INTERACTIONS *
392              **************************/
393
394             /* COULOMB ELECTROSTATICS */
395             velec            = _fjsp_mul_v2r8(qq10,rinv10);
396             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
397
398             /* Update potential sum for this i atom from the interaction with this j atom. */
399             velecsum         = _fjsp_add_v2r8(velecsum,velec);
400
401             fscal            = felec;
402
403             /* Update vectorial force */
404             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
405             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
406             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
407             
408             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
409             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
410             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
411
412             /**************************
413              * CALCULATE INTERACTIONS *
414              **************************/
415
416             /* COULOMB ELECTROSTATICS */
417             velec            = _fjsp_mul_v2r8(qq11,rinv11);
418             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
419
420             /* Update potential sum for this i atom from the interaction with this j atom. */
421             velecsum         = _fjsp_add_v2r8(velecsum,velec);
422
423             fscal            = felec;
424
425             /* Update vectorial force */
426             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
427             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
428             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
429             
430             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
431             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
432             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
433
434             /**************************
435              * CALCULATE INTERACTIONS *
436              **************************/
437
438             /* COULOMB ELECTROSTATICS */
439             velec            = _fjsp_mul_v2r8(qq12,rinv12);
440             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
441
442             /* Update potential sum for this i atom from the interaction with this j atom. */
443             velecsum         = _fjsp_add_v2r8(velecsum,velec);
444
445             fscal            = felec;
446
447             /* Update vectorial force */
448             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
449             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
450             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
451             
452             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
453             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
454             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
455
456             /**************************
457              * CALCULATE INTERACTIONS *
458              **************************/
459
460             /* COULOMB ELECTROSTATICS */
461             velec            = _fjsp_mul_v2r8(qq20,rinv20);
462             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
463
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velecsum         = _fjsp_add_v2r8(velecsum,velec);
466
467             fscal            = felec;
468
469             /* Update vectorial force */
470             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
471             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
472             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
473             
474             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
475             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
476             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
477
478             /**************************
479              * CALCULATE INTERACTIONS *
480              **************************/
481
482             /* COULOMB ELECTROSTATICS */
483             velec            = _fjsp_mul_v2r8(qq21,rinv21);
484             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
485
486             /* Update potential sum for this i atom from the interaction with this j atom. */
487             velecsum         = _fjsp_add_v2r8(velecsum,velec);
488
489             fscal            = felec;
490
491             /* Update vectorial force */
492             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
493             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
494             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
495             
496             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
497             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
498             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
499
500             /**************************
501              * CALCULATE INTERACTIONS *
502              **************************/
503
504             /* COULOMB ELECTROSTATICS */
505             velec            = _fjsp_mul_v2r8(qq22,rinv22);
506             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
507
508             /* Update potential sum for this i atom from the interaction with this j atom. */
509             velecsum         = _fjsp_add_v2r8(velecsum,velec);
510
511             fscal            = felec;
512
513             /* Update vectorial force */
514             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
515             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
516             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
517             
518             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
519             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
520             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
521
522             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
523
524             /* Inner loop uses 314 flops */
525         }
526
527         if(jidx<j_index_end)
528         {
529
530             jnrA             = jjnr[jidx];
531             j_coord_offsetA  = DIM*jnrA;
532
533             /* load j atom coordinates */
534             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
535                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
536
537             /* Calculate displacement vector */
538             dx00             = _fjsp_sub_v2r8(ix0,jx0);
539             dy00             = _fjsp_sub_v2r8(iy0,jy0);
540             dz00             = _fjsp_sub_v2r8(iz0,jz0);
541             dx01             = _fjsp_sub_v2r8(ix0,jx1);
542             dy01             = _fjsp_sub_v2r8(iy0,jy1);
543             dz01             = _fjsp_sub_v2r8(iz0,jz1);
544             dx02             = _fjsp_sub_v2r8(ix0,jx2);
545             dy02             = _fjsp_sub_v2r8(iy0,jy2);
546             dz02             = _fjsp_sub_v2r8(iz0,jz2);
547             dx10             = _fjsp_sub_v2r8(ix1,jx0);
548             dy10             = _fjsp_sub_v2r8(iy1,jy0);
549             dz10             = _fjsp_sub_v2r8(iz1,jz0);
550             dx11             = _fjsp_sub_v2r8(ix1,jx1);
551             dy11             = _fjsp_sub_v2r8(iy1,jy1);
552             dz11             = _fjsp_sub_v2r8(iz1,jz1);
553             dx12             = _fjsp_sub_v2r8(ix1,jx2);
554             dy12             = _fjsp_sub_v2r8(iy1,jy2);
555             dz12             = _fjsp_sub_v2r8(iz1,jz2);
556             dx20             = _fjsp_sub_v2r8(ix2,jx0);
557             dy20             = _fjsp_sub_v2r8(iy2,jy0);
558             dz20             = _fjsp_sub_v2r8(iz2,jz0);
559             dx21             = _fjsp_sub_v2r8(ix2,jx1);
560             dy21             = _fjsp_sub_v2r8(iy2,jy1);
561             dz21             = _fjsp_sub_v2r8(iz2,jz1);
562             dx22             = _fjsp_sub_v2r8(ix2,jx2);
563             dy22             = _fjsp_sub_v2r8(iy2,jy2);
564             dz22             = _fjsp_sub_v2r8(iz2,jz2);
565
566             /* Calculate squared distance and things based on it */
567             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
568             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
569             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
570             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
571             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
572             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
573             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
574             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
575             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
576
577             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
578             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
579             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
580             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
581             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
582             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
583             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
584             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
585             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
586
587             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
588             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
589             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
590             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
591             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
592             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
593             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
594             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
595             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
596
597             fjx0             = _fjsp_setzero_v2r8();
598             fjy0             = _fjsp_setzero_v2r8();
599             fjz0             = _fjsp_setzero_v2r8();
600             fjx1             = _fjsp_setzero_v2r8();
601             fjy1             = _fjsp_setzero_v2r8();
602             fjz1             = _fjsp_setzero_v2r8();
603             fjx2             = _fjsp_setzero_v2r8();
604             fjy2             = _fjsp_setzero_v2r8();
605             fjz2             = _fjsp_setzero_v2r8();
606
607             /**************************
608              * CALCULATE INTERACTIONS *
609              **************************/
610
611             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
612
613             /* Calculate table index by multiplying r with table scale and truncate to integer */
614             rt               = _fjsp_mul_v2r8(r00,vftabscale);
615             itab_tmp         = _fjsp_dtox_v2r8(rt);
616             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
617             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
618             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
619
620             vfconv.i[0]     *= 8;
621             vfconv.i[1]     *= 8;
622
623             /* COULOMB ELECTROSTATICS */
624             velec            = _fjsp_mul_v2r8(qq00,rinv00);
625             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
626
627             /* CUBIC SPLINE TABLE DISPERSION */
628             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
629             F                = _fjsp_setzero_v2r8();
630             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
631             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
632             H                = _fjsp_setzero_v2r8();
633             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
634             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
635             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
636             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
637             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
638             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
639
640             /* CUBIC SPLINE TABLE REPULSION */
641             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
642             F                = _fjsp_setzero_v2r8();
643             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
644             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
645             H                = _fjsp_setzero_v2r8();
646             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
647             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
648             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
649             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
650             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
651             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
652             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
653             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
654
655             /* Update potential sum for this i atom from the interaction with this j atom. */
656             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
657             velecsum         = _fjsp_add_v2r8(velecsum,velec);
658             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
659             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
660
661             fscal            = _fjsp_add_v2r8(felec,fvdw);
662
663             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
664
665             /* Update vectorial force */
666             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
667             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
668             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
669             
670             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
671             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
672             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
673
674             /**************************
675              * CALCULATE INTERACTIONS *
676              **************************/
677
678             /* COULOMB ELECTROSTATICS */
679             velec            = _fjsp_mul_v2r8(qq01,rinv01);
680             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
681
682             /* Update potential sum for this i atom from the interaction with this j atom. */
683             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
684             velecsum         = _fjsp_add_v2r8(velecsum,velec);
685
686             fscal            = felec;
687
688             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
689
690             /* Update vectorial force */
691             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
692             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
693             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
694             
695             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
696             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
697             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
698
699             /**************************
700              * CALCULATE INTERACTIONS *
701              **************************/
702
703             /* COULOMB ELECTROSTATICS */
704             velec            = _fjsp_mul_v2r8(qq02,rinv02);
705             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
706
707             /* Update potential sum for this i atom from the interaction with this j atom. */
708             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
709             velecsum         = _fjsp_add_v2r8(velecsum,velec);
710
711             fscal            = felec;
712
713             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
714
715             /* Update vectorial force */
716             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
717             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
718             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
719             
720             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
721             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
722             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
723
724             /**************************
725              * CALCULATE INTERACTIONS *
726              **************************/
727
728             /* COULOMB ELECTROSTATICS */
729             velec            = _fjsp_mul_v2r8(qq10,rinv10);
730             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
731
732             /* Update potential sum for this i atom from the interaction with this j atom. */
733             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
734             velecsum         = _fjsp_add_v2r8(velecsum,velec);
735
736             fscal            = felec;
737
738             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
739
740             /* Update vectorial force */
741             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
742             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
743             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
744             
745             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
746             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
747             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
748
749             /**************************
750              * CALCULATE INTERACTIONS *
751              **************************/
752
753             /* COULOMB ELECTROSTATICS */
754             velec            = _fjsp_mul_v2r8(qq11,rinv11);
755             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
756
757             /* Update potential sum for this i atom from the interaction with this j atom. */
758             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
759             velecsum         = _fjsp_add_v2r8(velecsum,velec);
760
761             fscal            = felec;
762
763             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
764
765             /* Update vectorial force */
766             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
767             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
768             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
769             
770             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
771             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
772             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
773
774             /**************************
775              * CALCULATE INTERACTIONS *
776              **************************/
777
778             /* COULOMB ELECTROSTATICS */
779             velec            = _fjsp_mul_v2r8(qq12,rinv12);
780             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
781
782             /* Update potential sum for this i atom from the interaction with this j atom. */
783             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
784             velecsum         = _fjsp_add_v2r8(velecsum,velec);
785
786             fscal            = felec;
787
788             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
789
790             /* Update vectorial force */
791             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
792             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
793             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
794             
795             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
796             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
797             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
798
799             /**************************
800              * CALCULATE INTERACTIONS *
801              **************************/
802
803             /* COULOMB ELECTROSTATICS */
804             velec            = _fjsp_mul_v2r8(qq20,rinv20);
805             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
806
807             /* Update potential sum for this i atom from the interaction with this j atom. */
808             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
809             velecsum         = _fjsp_add_v2r8(velecsum,velec);
810
811             fscal            = felec;
812
813             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
814
815             /* Update vectorial force */
816             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
817             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
818             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
819             
820             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
821             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
822             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
823
824             /**************************
825              * CALCULATE INTERACTIONS *
826              **************************/
827
828             /* COULOMB ELECTROSTATICS */
829             velec            = _fjsp_mul_v2r8(qq21,rinv21);
830             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
831
832             /* Update potential sum for this i atom from the interaction with this j atom. */
833             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
834             velecsum         = _fjsp_add_v2r8(velecsum,velec);
835
836             fscal            = felec;
837
838             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
839
840             /* Update vectorial force */
841             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
842             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
843             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
844             
845             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
846             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
847             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
848
849             /**************************
850              * CALCULATE INTERACTIONS *
851              **************************/
852
853             /* COULOMB ELECTROSTATICS */
854             velec            = _fjsp_mul_v2r8(qq22,rinv22);
855             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
856
857             /* Update potential sum for this i atom from the interaction with this j atom. */
858             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
859             velecsum         = _fjsp_add_v2r8(velecsum,velec);
860
861             fscal            = felec;
862
863             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
864
865             /* Update vectorial force */
866             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
867             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
868             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
869             
870             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
871             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
872             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
873
874             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
875
876             /* Inner loop uses 314 flops */
877         }
878
879         /* End of innermost loop */
880
881         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
882                                               f+i_coord_offset,fshift+i_shift_offset);
883
884         ggid                        = gid[iidx];
885         /* Update potential energies */
886         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
887         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
888
889         /* Increment number of inner iterations */
890         inneriter                  += j_index_end - j_index_start;
891
892         /* Outer loop uses 20 flops */
893     }
894
895     /* Increment number of outer iterations */
896     outeriter        += nri;
897
898     /* Update outer/inner flops */
899
900     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*314);
901 }
902 /*
903  * GROMACS nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
904  * Electrostatics interaction: Coulomb
905  * VdW interaction:            CubicSplineTable
906  * Geometry:                   Water3-Water3
907  * Calculate force/pot:        Force
908  */
909 void
910 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
911                     (t_nblist                    * gmx_restrict       nlist,
912                      rvec                        * gmx_restrict          xx,
913                      rvec                        * gmx_restrict          ff,
914                      t_forcerec                  * gmx_restrict          fr,
915                      t_mdatoms                   * gmx_restrict     mdatoms,
916                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
917                      t_nrnb                      * gmx_restrict        nrnb)
918 {
919     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
920      * just 0 for non-waters.
921      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
922      * jnr indices corresponding to data put in the two positions in the SIMD register.
923      */
924     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
925     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
926     int              jnrA,jnrB;
927     int              j_coord_offsetA,j_coord_offsetB;
928     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
929     real             rcutoff_scalar;
930     real             *shiftvec,*fshift,*x,*f;
931     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
932     int              vdwioffset0;
933     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
934     int              vdwioffset1;
935     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
936     int              vdwioffset2;
937     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
938     int              vdwjidx0A,vdwjidx0B;
939     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
940     int              vdwjidx1A,vdwjidx1B;
941     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
942     int              vdwjidx2A,vdwjidx2B;
943     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
944     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
945     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
946     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
947     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
948     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
949     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
950     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
951     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
952     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
953     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
954     real             *charge;
955     int              nvdwtype;
956     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
957     int              *vdwtype;
958     real             *vdwparam;
959     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
960     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
961     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
962     real             *vftab;
963     _fjsp_v2r8       itab_tmp;
964     _fjsp_v2r8       dummy_mask,cutoff_mask;
965     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
966     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
967     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
968
969     x                = xx[0];
970     f                = ff[0];
971
972     nri              = nlist->nri;
973     iinr             = nlist->iinr;
974     jindex           = nlist->jindex;
975     jjnr             = nlist->jjnr;
976     shiftidx         = nlist->shift;
977     gid              = nlist->gid;
978     shiftvec         = fr->shift_vec[0];
979     fshift           = fr->fshift[0];
980     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
981     charge           = mdatoms->chargeA;
982     nvdwtype         = fr->ntype;
983     vdwparam         = fr->nbfp;
984     vdwtype          = mdatoms->typeA;
985
986     vftab            = kernel_data->table_vdw->data;
987     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
988
989     /* Setup water-specific parameters */
990     inr              = nlist->iinr[0];
991     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
992     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
993     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
994     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
995
996     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
997     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
998     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
999     vdwjidx0A        = 2*vdwtype[inr+0];
1000     qq00             = _fjsp_mul_v2r8(iq0,jq0);
1001     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1002     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1003     qq01             = _fjsp_mul_v2r8(iq0,jq1);
1004     qq02             = _fjsp_mul_v2r8(iq0,jq2);
1005     qq10             = _fjsp_mul_v2r8(iq1,jq0);
1006     qq11             = _fjsp_mul_v2r8(iq1,jq1);
1007     qq12             = _fjsp_mul_v2r8(iq1,jq2);
1008     qq20             = _fjsp_mul_v2r8(iq2,jq0);
1009     qq21             = _fjsp_mul_v2r8(iq2,jq1);
1010     qq22             = _fjsp_mul_v2r8(iq2,jq2);
1011
1012     /* Avoid stupid compiler warnings */
1013     jnrA = jnrB = 0;
1014     j_coord_offsetA = 0;
1015     j_coord_offsetB = 0;
1016
1017     outeriter        = 0;
1018     inneriter        = 0;
1019
1020     /* Start outer loop over neighborlists */
1021     for(iidx=0; iidx<nri; iidx++)
1022     {
1023         /* Load shift vector for this list */
1024         i_shift_offset   = DIM*shiftidx[iidx];
1025
1026         /* Load limits for loop over neighbors */
1027         j_index_start    = jindex[iidx];
1028         j_index_end      = jindex[iidx+1];
1029
1030         /* Get outer coordinate index */
1031         inr              = iinr[iidx];
1032         i_coord_offset   = DIM*inr;
1033
1034         /* Load i particle coords and add shift vector */
1035         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1036                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1037
1038         fix0             = _fjsp_setzero_v2r8();
1039         fiy0             = _fjsp_setzero_v2r8();
1040         fiz0             = _fjsp_setzero_v2r8();
1041         fix1             = _fjsp_setzero_v2r8();
1042         fiy1             = _fjsp_setzero_v2r8();
1043         fiz1             = _fjsp_setzero_v2r8();
1044         fix2             = _fjsp_setzero_v2r8();
1045         fiy2             = _fjsp_setzero_v2r8();
1046         fiz2             = _fjsp_setzero_v2r8();
1047
1048         /* Start inner kernel loop */
1049         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1050         {
1051
1052             /* Get j neighbor index, and coordinate index */
1053             jnrA             = jjnr[jidx];
1054             jnrB             = jjnr[jidx+1];
1055             j_coord_offsetA  = DIM*jnrA;
1056             j_coord_offsetB  = DIM*jnrB;
1057
1058             /* load j atom coordinates */
1059             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1060                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1061
1062             /* Calculate displacement vector */
1063             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1064             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1065             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1066             dx01             = _fjsp_sub_v2r8(ix0,jx1);
1067             dy01             = _fjsp_sub_v2r8(iy0,jy1);
1068             dz01             = _fjsp_sub_v2r8(iz0,jz1);
1069             dx02             = _fjsp_sub_v2r8(ix0,jx2);
1070             dy02             = _fjsp_sub_v2r8(iy0,jy2);
1071             dz02             = _fjsp_sub_v2r8(iz0,jz2);
1072             dx10             = _fjsp_sub_v2r8(ix1,jx0);
1073             dy10             = _fjsp_sub_v2r8(iy1,jy0);
1074             dz10             = _fjsp_sub_v2r8(iz1,jz0);
1075             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1076             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1077             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1078             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1079             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1080             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1081             dx20             = _fjsp_sub_v2r8(ix2,jx0);
1082             dy20             = _fjsp_sub_v2r8(iy2,jy0);
1083             dz20             = _fjsp_sub_v2r8(iz2,jz0);
1084             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1085             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1086             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1087             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1088             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1089             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1090
1091             /* Calculate squared distance and things based on it */
1092             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1093             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1094             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1095             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1096             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1097             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1098             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1099             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1100             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1101
1102             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1103             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
1104             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
1105             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
1106             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1107             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1108             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
1109             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1110             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1111
1112             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1113             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
1114             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
1115             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
1116             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1117             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1118             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
1119             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1120             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1121
1122             fjx0             = _fjsp_setzero_v2r8();
1123             fjy0             = _fjsp_setzero_v2r8();
1124             fjz0             = _fjsp_setzero_v2r8();
1125             fjx1             = _fjsp_setzero_v2r8();
1126             fjy1             = _fjsp_setzero_v2r8();
1127             fjz1             = _fjsp_setzero_v2r8();
1128             fjx2             = _fjsp_setzero_v2r8();
1129             fjy2             = _fjsp_setzero_v2r8();
1130             fjz2             = _fjsp_setzero_v2r8();
1131
1132             /**************************
1133              * CALCULATE INTERACTIONS *
1134              **************************/
1135
1136             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1137
1138             /* Calculate table index by multiplying r with table scale and truncate to integer */
1139             rt               = _fjsp_mul_v2r8(r00,vftabscale);
1140             itab_tmp         = _fjsp_dtox_v2r8(rt);
1141             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
1142             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
1143             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
1144
1145             vfconv.i[0]     *= 8;
1146             vfconv.i[1]     *= 8;
1147
1148             /* COULOMB ELECTROSTATICS */
1149             velec            = _fjsp_mul_v2r8(qq00,rinv00);
1150             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
1151
1152             /* CUBIC SPLINE TABLE DISPERSION */
1153             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
1154             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
1155             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
1156             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
1157             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
1158             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
1159             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
1160             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
1161             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
1162
1163             /* CUBIC SPLINE TABLE REPULSION */
1164             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
1165             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
1166             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
1167             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
1168             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
1169             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
1170             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
1171             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
1172             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
1173             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
1174
1175             fscal            = _fjsp_add_v2r8(felec,fvdw);
1176
1177             /* Update vectorial force */
1178             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1179             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1180             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1181             
1182             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1183             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1184             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1185
1186             /**************************
1187              * CALCULATE INTERACTIONS *
1188              **************************/
1189
1190             /* COULOMB ELECTROSTATICS */
1191             velec            = _fjsp_mul_v2r8(qq01,rinv01);
1192             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
1193
1194             fscal            = felec;
1195
1196             /* Update vectorial force */
1197             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
1198             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1199             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1200             
1201             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1202             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1203             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1204
1205             /**************************
1206              * CALCULATE INTERACTIONS *
1207              **************************/
1208
1209             /* COULOMB ELECTROSTATICS */
1210             velec            = _fjsp_mul_v2r8(qq02,rinv02);
1211             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
1212
1213             fscal            = felec;
1214
1215             /* Update vectorial force */
1216             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1217             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1218             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1219             
1220             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1221             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1222             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             /* COULOMB ELECTROSTATICS */
1229             velec            = _fjsp_mul_v2r8(qq10,rinv10);
1230             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
1231
1232             fscal            = felec;
1233
1234             /* Update vectorial force */
1235             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1236             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1237             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1238             
1239             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1240             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1241             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1242
1243             /**************************
1244              * CALCULATE INTERACTIONS *
1245              **************************/
1246
1247             /* COULOMB ELECTROSTATICS */
1248             velec            = _fjsp_mul_v2r8(qq11,rinv11);
1249             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
1250
1251             fscal            = felec;
1252
1253             /* Update vectorial force */
1254             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1255             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1256             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1257             
1258             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1259             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1260             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1261
1262             /**************************
1263              * CALCULATE INTERACTIONS *
1264              **************************/
1265
1266             /* COULOMB ELECTROSTATICS */
1267             velec            = _fjsp_mul_v2r8(qq12,rinv12);
1268             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
1269
1270             fscal            = felec;
1271
1272             /* Update vectorial force */
1273             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1274             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1275             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1276             
1277             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1278             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1279             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1280
1281             /**************************
1282              * CALCULATE INTERACTIONS *
1283              **************************/
1284
1285             /* COULOMB ELECTROSTATICS */
1286             velec            = _fjsp_mul_v2r8(qq20,rinv20);
1287             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
1288
1289             fscal            = felec;
1290
1291             /* Update vectorial force */
1292             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
1293             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1294             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1295             
1296             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1297             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1298             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1299
1300             /**************************
1301              * CALCULATE INTERACTIONS *
1302              **************************/
1303
1304             /* COULOMB ELECTROSTATICS */
1305             velec            = _fjsp_mul_v2r8(qq21,rinv21);
1306             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
1307
1308             fscal            = felec;
1309
1310             /* Update vectorial force */
1311             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1312             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1313             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1314             
1315             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1316             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1317             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1318
1319             /**************************
1320              * CALCULATE INTERACTIONS *
1321              **************************/
1322
1323             /* COULOMB ELECTROSTATICS */
1324             velec            = _fjsp_mul_v2r8(qq22,rinv22);
1325             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
1326
1327             fscal            = felec;
1328
1329             /* Update vectorial force */
1330             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1331             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1332             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1333             
1334             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1335             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1336             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1337
1338             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1339
1340             /* Inner loop uses 297 flops */
1341         }
1342
1343         if(jidx<j_index_end)
1344         {
1345
1346             jnrA             = jjnr[jidx];
1347             j_coord_offsetA  = DIM*jnrA;
1348
1349             /* load j atom coordinates */
1350             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
1351                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1352
1353             /* Calculate displacement vector */
1354             dx00             = _fjsp_sub_v2r8(ix0,jx0);
1355             dy00             = _fjsp_sub_v2r8(iy0,jy0);
1356             dz00             = _fjsp_sub_v2r8(iz0,jz0);
1357             dx01             = _fjsp_sub_v2r8(ix0,jx1);
1358             dy01             = _fjsp_sub_v2r8(iy0,jy1);
1359             dz01             = _fjsp_sub_v2r8(iz0,jz1);
1360             dx02             = _fjsp_sub_v2r8(ix0,jx2);
1361             dy02             = _fjsp_sub_v2r8(iy0,jy2);
1362             dz02             = _fjsp_sub_v2r8(iz0,jz2);
1363             dx10             = _fjsp_sub_v2r8(ix1,jx0);
1364             dy10             = _fjsp_sub_v2r8(iy1,jy0);
1365             dz10             = _fjsp_sub_v2r8(iz1,jz0);
1366             dx11             = _fjsp_sub_v2r8(ix1,jx1);
1367             dy11             = _fjsp_sub_v2r8(iy1,jy1);
1368             dz11             = _fjsp_sub_v2r8(iz1,jz1);
1369             dx12             = _fjsp_sub_v2r8(ix1,jx2);
1370             dy12             = _fjsp_sub_v2r8(iy1,jy2);
1371             dz12             = _fjsp_sub_v2r8(iz1,jz2);
1372             dx20             = _fjsp_sub_v2r8(ix2,jx0);
1373             dy20             = _fjsp_sub_v2r8(iy2,jy0);
1374             dz20             = _fjsp_sub_v2r8(iz2,jz0);
1375             dx21             = _fjsp_sub_v2r8(ix2,jx1);
1376             dy21             = _fjsp_sub_v2r8(iy2,jy1);
1377             dz21             = _fjsp_sub_v2r8(iz2,jz1);
1378             dx22             = _fjsp_sub_v2r8(ix2,jx2);
1379             dy22             = _fjsp_sub_v2r8(iy2,jy2);
1380             dz22             = _fjsp_sub_v2r8(iz2,jz2);
1381
1382             /* Calculate squared distance and things based on it */
1383             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1384             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1385             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1386             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1387             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1388             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1389             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1390             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1391             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1392
1393             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
1394             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
1395             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
1396             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
1397             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
1398             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
1399             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
1400             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
1401             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
1402
1403             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
1404             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
1405             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
1406             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
1407             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
1408             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
1409             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
1410             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
1411             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
1412
1413             fjx0             = _fjsp_setzero_v2r8();
1414             fjy0             = _fjsp_setzero_v2r8();
1415             fjz0             = _fjsp_setzero_v2r8();
1416             fjx1             = _fjsp_setzero_v2r8();
1417             fjy1             = _fjsp_setzero_v2r8();
1418             fjz1             = _fjsp_setzero_v2r8();
1419             fjx2             = _fjsp_setzero_v2r8();
1420             fjy2             = _fjsp_setzero_v2r8();
1421             fjz2             = _fjsp_setzero_v2r8();
1422
1423             /**************************
1424              * CALCULATE INTERACTIONS *
1425              **************************/
1426
1427             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
1428
1429             /* Calculate table index by multiplying r with table scale and truncate to integer */
1430             rt               = _fjsp_mul_v2r8(r00,vftabscale);
1431             itab_tmp         = _fjsp_dtox_v2r8(rt);
1432             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
1433             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
1434             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
1435
1436             vfconv.i[0]     *= 8;
1437             vfconv.i[1]     *= 8;
1438
1439             /* COULOMB ELECTROSTATICS */
1440             velec            = _fjsp_mul_v2r8(qq00,rinv00);
1441             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
1442
1443             /* CUBIC SPLINE TABLE DISPERSION */
1444             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
1445             F                = _fjsp_setzero_v2r8();
1446             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
1447             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
1448             H                = _fjsp_setzero_v2r8();
1449             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
1450             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
1451             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
1452             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
1453
1454             /* CUBIC SPLINE TABLE REPULSION */
1455             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
1456             F                = _fjsp_setzero_v2r8();
1457             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
1458             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
1459             H                = _fjsp_setzero_v2r8();
1460             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
1461             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
1462             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
1463             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
1464             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
1465
1466             fscal            = _fjsp_add_v2r8(felec,fvdw);
1467
1468             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1469
1470             /* Update vectorial force */
1471             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
1472             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1473             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1474             
1475             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1476             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1477             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1478
1479             /**************************
1480              * CALCULATE INTERACTIONS *
1481              **************************/
1482
1483             /* COULOMB ELECTROSTATICS */
1484             velec            = _fjsp_mul_v2r8(qq01,rinv01);
1485             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
1486
1487             fscal            = felec;
1488
1489             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1490
1491             /* Update vectorial force */
1492             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
1493             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1494             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1495             
1496             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1497             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1498             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1499
1500             /**************************
1501              * CALCULATE INTERACTIONS *
1502              **************************/
1503
1504             /* COULOMB ELECTROSTATICS */
1505             velec            = _fjsp_mul_v2r8(qq02,rinv02);
1506             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
1507
1508             fscal            = felec;
1509
1510             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1511
1512             /* Update vectorial force */
1513             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
1514             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1515             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1516             
1517             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1518             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1519             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1520
1521             /**************************
1522              * CALCULATE INTERACTIONS *
1523              **************************/
1524
1525             /* COULOMB ELECTROSTATICS */
1526             velec            = _fjsp_mul_v2r8(qq10,rinv10);
1527             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
1528
1529             fscal            = felec;
1530
1531             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1532
1533             /* Update vectorial force */
1534             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
1535             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1536             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1537             
1538             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1539             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1540             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1541
1542             /**************************
1543              * CALCULATE INTERACTIONS *
1544              **************************/
1545
1546             /* COULOMB ELECTROSTATICS */
1547             velec            = _fjsp_mul_v2r8(qq11,rinv11);
1548             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
1549
1550             fscal            = felec;
1551
1552             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1553
1554             /* Update vectorial force */
1555             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
1556             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1557             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1558             
1559             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1560             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1561             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1562
1563             /**************************
1564              * CALCULATE INTERACTIONS *
1565              **************************/
1566
1567             /* COULOMB ELECTROSTATICS */
1568             velec            = _fjsp_mul_v2r8(qq12,rinv12);
1569             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
1570
1571             fscal            = felec;
1572
1573             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1574
1575             /* Update vectorial force */
1576             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
1577             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1578             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1579             
1580             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1581             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1582             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1583
1584             /**************************
1585              * CALCULATE INTERACTIONS *
1586              **************************/
1587
1588             /* COULOMB ELECTROSTATICS */
1589             velec            = _fjsp_mul_v2r8(qq20,rinv20);
1590             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
1591
1592             fscal            = felec;
1593
1594             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1595
1596             /* Update vectorial force */
1597             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
1598             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1599             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1600             
1601             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1602             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1603             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1604
1605             /**************************
1606              * CALCULATE INTERACTIONS *
1607              **************************/
1608
1609             /* COULOMB ELECTROSTATICS */
1610             velec            = _fjsp_mul_v2r8(qq21,rinv21);
1611             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
1612
1613             fscal            = felec;
1614
1615             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1616
1617             /* Update vectorial force */
1618             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
1619             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1620             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1621             
1622             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1623             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1624             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             /* COULOMB ELECTROSTATICS */
1631             velec            = _fjsp_mul_v2r8(qq22,rinv22);
1632             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
1633
1634             fscal            = felec;
1635
1636             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1637
1638             /* Update vectorial force */
1639             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
1640             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1641             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1642             
1643             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1644             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1645             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1646
1647             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1648
1649             /* Inner loop uses 297 flops */
1650         }
1651
1652         /* End of innermost loop */
1653
1654         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1655                                               f+i_coord_offset,fshift+i_shift_offset);
1656
1657         /* Increment number of inner iterations */
1658         inneriter                  += j_index_end - j_index_start;
1659
1660         /* Outer loop uses 18 flops */
1661     }
1662
1663     /* Increment number of outer iterations */
1664     outeriter        += nri;
1665
1666     /* Update outer/inner flops */
1667
1668     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
1669 }