056b173546d1136c66623be84871dff649d098ac
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecRF_VdwCSTab_GeomW3P1_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_double
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water3-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
75     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
76     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
77     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
78     real             *charge;
79     int              nvdwtype;
80     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
81     int              *vdwtype;
82     real             *vdwparam;
83     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
84     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
85     __m128i          vfitab;
86     __m128i          ifour       = _mm_set1_epi32(4);
87     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
88     real             *vftab;
89     __m128d          dummy_mask,cutoff_mask;
90     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
91     __m128d          one     = _mm_set1_pd(1.0);
92     __m128d          two     = _mm_set1_pd(2.0);
93     x                = xx[0];
94     f                = ff[0];
95
96     nri              = nlist->nri;
97     iinr             = nlist->iinr;
98     jindex           = nlist->jindex;
99     jjnr             = nlist->jjnr;
100     shiftidx         = nlist->shift;
101     gid              = nlist->gid;
102     shiftvec         = fr->shift_vec[0];
103     fshift           = fr->fshift[0];
104     facel            = _mm_set1_pd(fr->epsfac);
105     charge           = mdatoms->chargeA;
106     krf              = _mm_set1_pd(fr->ic->k_rf);
107     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
108     crf              = _mm_set1_pd(fr->ic->c_rf);
109     nvdwtype         = fr->ntype;
110     vdwparam         = fr->nbfp;
111     vdwtype          = mdatoms->typeA;
112
113     vftab            = kernel_data->table_vdw->data;
114     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
115
116     /* Setup water-specific parameters */
117     inr              = nlist->iinr[0];
118     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
119     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
120     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
121     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
122
123     /* Avoid stupid compiler warnings */
124     jnrA = jnrB = 0;
125     j_coord_offsetA = 0;
126     j_coord_offsetB = 0;
127
128     outeriter        = 0;
129     inneriter        = 0;
130
131     /* Start outer loop over neighborlists */
132     for(iidx=0; iidx<nri; iidx++)
133     {
134         /* Load shift vector for this list */
135         i_shift_offset   = DIM*shiftidx[iidx];
136
137         /* Load limits for loop over neighbors */
138         j_index_start    = jindex[iidx];
139         j_index_end      = jindex[iidx+1];
140
141         /* Get outer coordinate index */
142         inr              = iinr[iidx];
143         i_coord_offset   = DIM*inr;
144
145         /* Load i particle coords and add shift vector */
146         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
147                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
148
149         fix0             = _mm_setzero_pd();
150         fiy0             = _mm_setzero_pd();
151         fiz0             = _mm_setzero_pd();
152         fix1             = _mm_setzero_pd();
153         fiy1             = _mm_setzero_pd();
154         fiz1             = _mm_setzero_pd();
155         fix2             = _mm_setzero_pd();
156         fiy2             = _mm_setzero_pd();
157         fiz2             = _mm_setzero_pd();
158
159         /* Reset potential sums */
160         velecsum         = _mm_setzero_pd();
161         vvdwsum          = _mm_setzero_pd();
162
163         /* Start inner kernel loop */
164         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
165         {
166
167             /* Get j neighbor index, and coordinate index */
168             jnrA             = jjnr[jidx];
169             jnrB             = jjnr[jidx+1];
170             j_coord_offsetA  = DIM*jnrA;
171             j_coord_offsetB  = DIM*jnrB;
172
173             /* load j atom coordinates */
174             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
175                                               &jx0,&jy0,&jz0);
176
177             /* Calculate displacement vector */
178             dx00             = _mm_sub_pd(ix0,jx0);
179             dy00             = _mm_sub_pd(iy0,jy0);
180             dz00             = _mm_sub_pd(iz0,jz0);
181             dx10             = _mm_sub_pd(ix1,jx0);
182             dy10             = _mm_sub_pd(iy1,jy0);
183             dz10             = _mm_sub_pd(iz1,jz0);
184             dx20             = _mm_sub_pd(ix2,jx0);
185             dy20             = _mm_sub_pd(iy2,jy0);
186             dz20             = _mm_sub_pd(iz2,jz0);
187
188             /* Calculate squared distance and things based on it */
189             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
190             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
191             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
192
193             rinv00           = gmx_mm_invsqrt_pd(rsq00);
194             rinv10           = gmx_mm_invsqrt_pd(rsq10);
195             rinv20           = gmx_mm_invsqrt_pd(rsq20);
196
197             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
198             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
199             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
200
201             /* Load parameters for j particles */
202             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
203             vdwjidx0A        = 2*vdwtype[jnrA+0];
204             vdwjidx0B        = 2*vdwtype[jnrB+0];
205
206             fjx0             = _mm_setzero_pd();
207             fjy0             = _mm_setzero_pd();
208             fjz0             = _mm_setzero_pd();
209
210             /**************************
211              * CALCULATE INTERACTIONS *
212              **************************/
213
214             r00              = _mm_mul_pd(rsq00,rinv00);
215
216             /* Compute parameters for interactions between i and j atoms */
217             qq00             = _mm_mul_pd(iq0,jq0);
218             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
219                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
220
221             /* Calculate table index by multiplying r with table scale and truncate to integer */
222             rt               = _mm_mul_pd(r00,vftabscale);
223             vfitab           = _mm_cvttpd_epi32(rt);
224 #ifdef __XOP__
225             vfeps            = _mm_frcz_pd(rt);
226 #else
227             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
228 #endif
229             twovfeps         = _mm_add_pd(vfeps,vfeps);
230             vfitab           = _mm_slli_epi32(vfitab,3);
231
232             /* REACTION-FIELD ELECTROSTATICS */
233             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_macc_pd(krf,rsq00,rinv00),crf));
234             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
235
236             /* CUBIC SPLINE TABLE DISPERSION */
237             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
238             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
239             GMX_MM_TRANSPOSE2_PD(Y,F);
240             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
241             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
242             GMX_MM_TRANSPOSE2_PD(G,H);
243             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
244             VV               = _mm_macc_pd(vfeps,Fp,Y);
245             vvdw6            = _mm_mul_pd(c6_00,VV);
246             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
247             fvdw6            = _mm_mul_pd(c6_00,FF);
248
249             /* CUBIC SPLINE TABLE REPULSION */
250             vfitab           = _mm_add_epi32(vfitab,ifour);
251             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
252             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
253             GMX_MM_TRANSPOSE2_PD(Y,F);
254             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
255             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
256             GMX_MM_TRANSPOSE2_PD(G,H);
257             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
258             VV               = _mm_macc_pd(vfeps,Fp,Y);
259             vvdw12           = _mm_mul_pd(c12_00,VV);
260             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
261             fvdw12           = _mm_mul_pd(c12_00,FF);
262             vvdw             = _mm_add_pd(vvdw12,vvdw6);
263             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
264
265             /* Update potential sum for this i atom from the interaction with this j atom. */
266             velecsum         = _mm_add_pd(velecsum,velec);
267             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
268
269             fscal            = _mm_add_pd(felec,fvdw);
270
271             /* Update vectorial force */
272             fix0             = _mm_macc_pd(dx00,fscal,fix0);
273             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
274             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
275             
276             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
277             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
278             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
279
280             /**************************
281              * CALCULATE INTERACTIONS *
282              **************************/
283
284             /* Compute parameters for interactions between i and j atoms */
285             qq10             = _mm_mul_pd(iq1,jq0);
286
287             /* REACTION-FIELD ELECTROSTATICS */
288             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
289             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
290
291             /* Update potential sum for this i atom from the interaction with this j atom. */
292             velecsum         = _mm_add_pd(velecsum,velec);
293
294             fscal            = felec;
295
296             /* Update vectorial force */
297             fix1             = _mm_macc_pd(dx10,fscal,fix1);
298             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
299             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
300             
301             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
302             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
303             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
304
305             /**************************
306              * CALCULATE INTERACTIONS *
307              **************************/
308
309             /* Compute parameters for interactions between i and j atoms */
310             qq20             = _mm_mul_pd(iq2,jq0);
311
312             /* REACTION-FIELD ELECTROSTATICS */
313             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
314             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
315
316             /* Update potential sum for this i atom from the interaction with this j atom. */
317             velecsum         = _mm_add_pd(velecsum,velec);
318
319             fscal            = felec;
320
321             /* Update vectorial force */
322             fix2             = _mm_macc_pd(dx20,fscal,fix2);
323             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
324             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
325             
326             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
327             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
328             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
329
330             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
331
332             /* Inner loop uses 143 flops */
333         }
334
335         if(jidx<j_index_end)
336         {
337
338             jnrA             = jjnr[jidx];
339             j_coord_offsetA  = DIM*jnrA;
340
341             /* load j atom coordinates */
342             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
343                                               &jx0,&jy0,&jz0);
344
345             /* Calculate displacement vector */
346             dx00             = _mm_sub_pd(ix0,jx0);
347             dy00             = _mm_sub_pd(iy0,jy0);
348             dz00             = _mm_sub_pd(iz0,jz0);
349             dx10             = _mm_sub_pd(ix1,jx0);
350             dy10             = _mm_sub_pd(iy1,jy0);
351             dz10             = _mm_sub_pd(iz1,jz0);
352             dx20             = _mm_sub_pd(ix2,jx0);
353             dy20             = _mm_sub_pd(iy2,jy0);
354             dz20             = _mm_sub_pd(iz2,jz0);
355
356             /* Calculate squared distance and things based on it */
357             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
358             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
359             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
360
361             rinv00           = gmx_mm_invsqrt_pd(rsq00);
362             rinv10           = gmx_mm_invsqrt_pd(rsq10);
363             rinv20           = gmx_mm_invsqrt_pd(rsq20);
364
365             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
366             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
367             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
368
369             /* Load parameters for j particles */
370             jq0              = _mm_load_sd(charge+jnrA+0);
371             vdwjidx0A        = 2*vdwtype[jnrA+0];
372
373             fjx0             = _mm_setzero_pd();
374             fjy0             = _mm_setzero_pd();
375             fjz0             = _mm_setzero_pd();
376
377             /**************************
378              * CALCULATE INTERACTIONS *
379              **************************/
380
381             r00              = _mm_mul_pd(rsq00,rinv00);
382
383             /* Compute parameters for interactions between i and j atoms */
384             qq00             = _mm_mul_pd(iq0,jq0);
385             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
386
387             /* Calculate table index by multiplying r with table scale and truncate to integer */
388             rt               = _mm_mul_pd(r00,vftabscale);
389             vfitab           = _mm_cvttpd_epi32(rt);
390 #ifdef __XOP__
391             vfeps            = _mm_frcz_pd(rt);
392 #else
393             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
394 #endif
395             twovfeps         = _mm_add_pd(vfeps,vfeps);
396             vfitab           = _mm_slli_epi32(vfitab,3);
397
398             /* REACTION-FIELD ELECTROSTATICS */
399             velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_macc_pd(krf,rsq00,rinv00),crf));
400             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
401
402             /* CUBIC SPLINE TABLE DISPERSION */
403             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
404             F                = _mm_setzero_pd();
405             GMX_MM_TRANSPOSE2_PD(Y,F);
406             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
407             H                = _mm_setzero_pd();
408             GMX_MM_TRANSPOSE2_PD(G,H);
409             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
410             VV               = _mm_macc_pd(vfeps,Fp,Y);
411             vvdw6            = _mm_mul_pd(c6_00,VV);
412             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
413             fvdw6            = _mm_mul_pd(c6_00,FF);
414
415             /* CUBIC SPLINE TABLE REPULSION */
416             vfitab           = _mm_add_epi32(vfitab,ifour);
417             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
418             F                = _mm_setzero_pd();
419             GMX_MM_TRANSPOSE2_PD(Y,F);
420             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
421             H                = _mm_setzero_pd();
422             GMX_MM_TRANSPOSE2_PD(G,H);
423             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
424             VV               = _mm_macc_pd(vfeps,Fp,Y);
425             vvdw12           = _mm_mul_pd(c12_00,VV);
426             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
427             fvdw12           = _mm_mul_pd(c12_00,FF);
428             vvdw             = _mm_add_pd(vvdw12,vvdw6);
429             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
430
431             /* Update potential sum for this i atom from the interaction with this j atom. */
432             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
433             velecsum         = _mm_add_pd(velecsum,velec);
434             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
435             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
436
437             fscal            = _mm_add_pd(felec,fvdw);
438
439             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
440
441             /* Update vectorial force */
442             fix0             = _mm_macc_pd(dx00,fscal,fix0);
443             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
444             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
445             
446             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
447             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
448             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             /* Compute parameters for interactions between i and j atoms */
455             qq10             = _mm_mul_pd(iq1,jq0);
456
457             /* REACTION-FIELD ELECTROSTATICS */
458             velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
459             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
463             velecsum         = _mm_add_pd(velecsum,velec);
464
465             fscal            = felec;
466
467             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
468
469             /* Update vectorial force */
470             fix1             = _mm_macc_pd(dx10,fscal,fix1);
471             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
472             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
473             
474             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
475             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
476             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
477
478             /**************************
479              * CALCULATE INTERACTIONS *
480              **************************/
481
482             /* Compute parameters for interactions between i and j atoms */
483             qq20             = _mm_mul_pd(iq2,jq0);
484
485             /* REACTION-FIELD ELECTROSTATICS */
486             velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
487             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
491             velecsum         = _mm_add_pd(velecsum,velec);
492
493             fscal            = felec;
494
495             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
496
497             /* Update vectorial force */
498             fix2             = _mm_macc_pd(dx20,fscal,fix2);
499             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
500             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
501             
502             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
503             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
504             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
505
506             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
507
508             /* Inner loop uses 143 flops */
509         }
510
511         /* End of innermost loop */
512
513         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
514                                               f+i_coord_offset,fshift+i_shift_offset);
515
516         ggid                        = gid[iidx];
517         /* Update potential energies */
518         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
519         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
520
521         /* Increment number of inner iterations */
522         inneriter                  += j_index_end - j_index_start;
523
524         /* Outer loop uses 20 flops */
525     }
526
527     /* Increment number of outer iterations */
528     outeriter        += nri;
529
530     /* Update outer/inner flops */
531
532     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*143);
533 }
534 /*
535  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_double
536  * Electrostatics interaction: ReactionField
537  * VdW interaction:            CubicSplineTable
538  * Geometry:                   Water3-Particle
539  * Calculate force/pot:        Force
540  */
541 void
542 nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_avx_128_fma_double
543                     (t_nblist * gmx_restrict                nlist,
544                      rvec * gmx_restrict                    xx,
545                      rvec * gmx_restrict                    ff,
546                      t_forcerec * gmx_restrict              fr,
547                      t_mdatoms * gmx_restrict               mdatoms,
548                      nb_kernel_data_t * gmx_restrict        kernel_data,
549                      t_nrnb * gmx_restrict                  nrnb)
550 {
551     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
552      * just 0 for non-waters.
553      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
554      * jnr indices corresponding to data put in the four positions in the SIMD register.
555      */
556     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
557     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
558     int              jnrA,jnrB;
559     int              j_coord_offsetA,j_coord_offsetB;
560     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
561     real             rcutoff_scalar;
562     real             *shiftvec,*fshift,*x,*f;
563     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
564     int              vdwioffset0;
565     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
566     int              vdwioffset1;
567     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
568     int              vdwioffset2;
569     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
570     int              vdwjidx0A,vdwjidx0B;
571     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
572     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
573     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
574     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
575     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
576     real             *charge;
577     int              nvdwtype;
578     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
579     int              *vdwtype;
580     real             *vdwparam;
581     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
582     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
583     __m128i          vfitab;
584     __m128i          ifour       = _mm_set1_epi32(4);
585     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
586     real             *vftab;
587     __m128d          dummy_mask,cutoff_mask;
588     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
589     __m128d          one     = _mm_set1_pd(1.0);
590     __m128d          two     = _mm_set1_pd(2.0);
591     x                = xx[0];
592     f                = ff[0];
593
594     nri              = nlist->nri;
595     iinr             = nlist->iinr;
596     jindex           = nlist->jindex;
597     jjnr             = nlist->jjnr;
598     shiftidx         = nlist->shift;
599     gid              = nlist->gid;
600     shiftvec         = fr->shift_vec[0];
601     fshift           = fr->fshift[0];
602     facel            = _mm_set1_pd(fr->epsfac);
603     charge           = mdatoms->chargeA;
604     krf              = _mm_set1_pd(fr->ic->k_rf);
605     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
606     crf              = _mm_set1_pd(fr->ic->c_rf);
607     nvdwtype         = fr->ntype;
608     vdwparam         = fr->nbfp;
609     vdwtype          = mdatoms->typeA;
610
611     vftab            = kernel_data->table_vdw->data;
612     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
613
614     /* Setup water-specific parameters */
615     inr              = nlist->iinr[0];
616     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
617     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
618     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
619     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
620
621     /* Avoid stupid compiler warnings */
622     jnrA = jnrB = 0;
623     j_coord_offsetA = 0;
624     j_coord_offsetB = 0;
625
626     outeriter        = 0;
627     inneriter        = 0;
628
629     /* Start outer loop over neighborlists */
630     for(iidx=0; iidx<nri; iidx++)
631     {
632         /* Load shift vector for this list */
633         i_shift_offset   = DIM*shiftidx[iidx];
634
635         /* Load limits for loop over neighbors */
636         j_index_start    = jindex[iidx];
637         j_index_end      = jindex[iidx+1];
638
639         /* Get outer coordinate index */
640         inr              = iinr[iidx];
641         i_coord_offset   = DIM*inr;
642
643         /* Load i particle coords and add shift vector */
644         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
645                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
646
647         fix0             = _mm_setzero_pd();
648         fiy0             = _mm_setzero_pd();
649         fiz0             = _mm_setzero_pd();
650         fix1             = _mm_setzero_pd();
651         fiy1             = _mm_setzero_pd();
652         fiz1             = _mm_setzero_pd();
653         fix2             = _mm_setzero_pd();
654         fiy2             = _mm_setzero_pd();
655         fiz2             = _mm_setzero_pd();
656
657         /* Start inner kernel loop */
658         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
659         {
660
661             /* Get j neighbor index, and coordinate index */
662             jnrA             = jjnr[jidx];
663             jnrB             = jjnr[jidx+1];
664             j_coord_offsetA  = DIM*jnrA;
665             j_coord_offsetB  = DIM*jnrB;
666
667             /* load j atom coordinates */
668             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
669                                               &jx0,&jy0,&jz0);
670
671             /* Calculate displacement vector */
672             dx00             = _mm_sub_pd(ix0,jx0);
673             dy00             = _mm_sub_pd(iy0,jy0);
674             dz00             = _mm_sub_pd(iz0,jz0);
675             dx10             = _mm_sub_pd(ix1,jx0);
676             dy10             = _mm_sub_pd(iy1,jy0);
677             dz10             = _mm_sub_pd(iz1,jz0);
678             dx20             = _mm_sub_pd(ix2,jx0);
679             dy20             = _mm_sub_pd(iy2,jy0);
680             dz20             = _mm_sub_pd(iz2,jz0);
681
682             /* Calculate squared distance and things based on it */
683             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
684             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
685             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
686
687             rinv00           = gmx_mm_invsqrt_pd(rsq00);
688             rinv10           = gmx_mm_invsqrt_pd(rsq10);
689             rinv20           = gmx_mm_invsqrt_pd(rsq20);
690
691             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
692             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
693             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
694
695             /* Load parameters for j particles */
696             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
697             vdwjidx0A        = 2*vdwtype[jnrA+0];
698             vdwjidx0B        = 2*vdwtype[jnrB+0];
699
700             fjx0             = _mm_setzero_pd();
701             fjy0             = _mm_setzero_pd();
702             fjz0             = _mm_setzero_pd();
703
704             /**************************
705              * CALCULATE INTERACTIONS *
706              **************************/
707
708             r00              = _mm_mul_pd(rsq00,rinv00);
709
710             /* Compute parameters for interactions between i and j atoms */
711             qq00             = _mm_mul_pd(iq0,jq0);
712             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
713                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
714
715             /* Calculate table index by multiplying r with table scale and truncate to integer */
716             rt               = _mm_mul_pd(r00,vftabscale);
717             vfitab           = _mm_cvttpd_epi32(rt);
718 #ifdef __XOP__
719             vfeps            = _mm_frcz_pd(rt);
720 #else
721             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
722 #endif
723             twovfeps         = _mm_add_pd(vfeps,vfeps);
724             vfitab           = _mm_slli_epi32(vfitab,3);
725
726             /* REACTION-FIELD ELECTROSTATICS */
727             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
728
729             /* CUBIC SPLINE TABLE DISPERSION */
730             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
731             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
732             GMX_MM_TRANSPOSE2_PD(Y,F);
733             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
734             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
735             GMX_MM_TRANSPOSE2_PD(G,H);
736             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
737             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
738             fvdw6            = _mm_mul_pd(c6_00,FF);
739
740             /* CUBIC SPLINE TABLE REPULSION */
741             vfitab           = _mm_add_epi32(vfitab,ifour);
742             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
743             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
744             GMX_MM_TRANSPOSE2_PD(Y,F);
745             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
746             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
747             GMX_MM_TRANSPOSE2_PD(G,H);
748             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
749             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
750             fvdw12           = _mm_mul_pd(c12_00,FF);
751             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
752
753             fscal            = _mm_add_pd(felec,fvdw);
754
755             /* Update vectorial force */
756             fix0             = _mm_macc_pd(dx00,fscal,fix0);
757             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
758             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
759             
760             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
761             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
762             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
763
764             /**************************
765              * CALCULATE INTERACTIONS *
766              **************************/
767
768             /* Compute parameters for interactions between i and j atoms */
769             qq10             = _mm_mul_pd(iq1,jq0);
770
771             /* REACTION-FIELD ELECTROSTATICS */
772             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
773
774             fscal            = felec;
775
776             /* Update vectorial force */
777             fix1             = _mm_macc_pd(dx10,fscal,fix1);
778             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
779             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
780             
781             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
782             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
783             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
784
785             /**************************
786              * CALCULATE INTERACTIONS *
787              **************************/
788
789             /* Compute parameters for interactions between i and j atoms */
790             qq20             = _mm_mul_pd(iq2,jq0);
791
792             /* REACTION-FIELD ELECTROSTATICS */
793             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
794
795             fscal            = felec;
796
797             /* Update vectorial force */
798             fix2             = _mm_macc_pd(dx20,fscal,fix2);
799             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
800             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
801             
802             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
803             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
804             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
805
806             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
807
808             /* Inner loop uses 120 flops */
809         }
810
811         if(jidx<j_index_end)
812         {
813
814             jnrA             = jjnr[jidx];
815             j_coord_offsetA  = DIM*jnrA;
816
817             /* load j atom coordinates */
818             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
819                                               &jx0,&jy0,&jz0);
820
821             /* Calculate displacement vector */
822             dx00             = _mm_sub_pd(ix0,jx0);
823             dy00             = _mm_sub_pd(iy0,jy0);
824             dz00             = _mm_sub_pd(iz0,jz0);
825             dx10             = _mm_sub_pd(ix1,jx0);
826             dy10             = _mm_sub_pd(iy1,jy0);
827             dz10             = _mm_sub_pd(iz1,jz0);
828             dx20             = _mm_sub_pd(ix2,jx0);
829             dy20             = _mm_sub_pd(iy2,jy0);
830             dz20             = _mm_sub_pd(iz2,jz0);
831
832             /* Calculate squared distance and things based on it */
833             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
834             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
835             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
836
837             rinv00           = gmx_mm_invsqrt_pd(rsq00);
838             rinv10           = gmx_mm_invsqrt_pd(rsq10);
839             rinv20           = gmx_mm_invsqrt_pd(rsq20);
840
841             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
842             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
843             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
844
845             /* Load parameters for j particles */
846             jq0              = _mm_load_sd(charge+jnrA+0);
847             vdwjidx0A        = 2*vdwtype[jnrA+0];
848
849             fjx0             = _mm_setzero_pd();
850             fjy0             = _mm_setzero_pd();
851             fjz0             = _mm_setzero_pd();
852
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             r00              = _mm_mul_pd(rsq00,rinv00);
858
859             /* Compute parameters for interactions between i and j atoms */
860             qq00             = _mm_mul_pd(iq0,jq0);
861             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
862
863             /* Calculate table index by multiplying r with table scale and truncate to integer */
864             rt               = _mm_mul_pd(r00,vftabscale);
865             vfitab           = _mm_cvttpd_epi32(rt);
866 #ifdef __XOP__
867             vfeps            = _mm_frcz_pd(rt);
868 #else
869             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
870 #endif
871             twovfeps         = _mm_add_pd(vfeps,vfeps);
872             vfitab           = _mm_slli_epi32(vfitab,3);
873
874             /* REACTION-FIELD ELECTROSTATICS */
875             felec            = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
876
877             /* CUBIC SPLINE TABLE DISPERSION */
878             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
879             F                = _mm_setzero_pd();
880             GMX_MM_TRANSPOSE2_PD(Y,F);
881             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
882             H                = _mm_setzero_pd();
883             GMX_MM_TRANSPOSE2_PD(G,H);
884             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
885             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
886             fvdw6            = _mm_mul_pd(c6_00,FF);
887
888             /* CUBIC SPLINE TABLE REPULSION */
889             vfitab           = _mm_add_epi32(vfitab,ifour);
890             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
891             F                = _mm_setzero_pd();
892             GMX_MM_TRANSPOSE2_PD(Y,F);
893             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
894             H                = _mm_setzero_pd();
895             GMX_MM_TRANSPOSE2_PD(G,H);
896             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
897             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
898             fvdw12           = _mm_mul_pd(c12_00,FF);
899             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
900
901             fscal            = _mm_add_pd(felec,fvdw);
902
903             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
904
905             /* Update vectorial force */
906             fix0             = _mm_macc_pd(dx00,fscal,fix0);
907             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
908             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
909             
910             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
911             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
912             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
913
914             /**************************
915              * CALCULATE INTERACTIONS *
916              **************************/
917
918             /* Compute parameters for interactions between i and j atoms */
919             qq10             = _mm_mul_pd(iq1,jq0);
920
921             /* REACTION-FIELD ELECTROSTATICS */
922             felec            = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
923
924             fscal            = felec;
925
926             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
927
928             /* Update vectorial force */
929             fix1             = _mm_macc_pd(dx10,fscal,fix1);
930             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
931             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
932             
933             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
934             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
935             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
936
937             /**************************
938              * CALCULATE INTERACTIONS *
939              **************************/
940
941             /* Compute parameters for interactions between i and j atoms */
942             qq20             = _mm_mul_pd(iq2,jq0);
943
944             /* REACTION-FIELD ELECTROSTATICS */
945             felec            = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
946
947             fscal            = felec;
948
949             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
950
951             /* Update vectorial force */
952             fix2             = _mm_macc_pd(dx20,fscal,fix2);
953             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
954             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
955             
956             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
957             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
958             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
959
960             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
961
962             /* Inner loop uses 120 flops */
963         }
964
965         /* End of innermost loop */
966
967         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
968                                               f+i_coord_offset,fshift+i_shift_offset);
969
970         /* Increment number of inner iterations */
971         inneriter                  += j_index_end - j_index_start;
972
973         /* Outer loop uses 18 flops */
974     }
975
976     /* Increment number of outer iterations */
977     outeriter        += nri;
978
979     /* Update outer/inner flops */
980
981     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*120);
982 }