Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecRF_VdwNone_GeomW4W4_avx_128_fma_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_double
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            None
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_128_fma_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset1;
80     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
81     int              vdwioffset2;
82     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
83     int              vdwioffset3;
84     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
85     int              vdwjidx1A,vdwjidx1B;
86     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
87     int              vdwjidx2A,vdwjidx2B;
88     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
89     int              vdwjidx3A,vdwjidx3B;
90     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
91     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
92     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
93     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
94     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
95     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
96     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
97     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
98     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
99     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
100     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     __m128d          dummy_mask,cutoff_mask;
103     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
104     __m128d          one     = _mm_set1_pd(1.0);
105     __m128d          two     = _mm_set1_pd(2.0);
106     x                = xx[0];
107     f                = ff[0];
108
109     nri              = nlist->nri;
110     iinr             = nlist->iinr;
111     jindex           = nlist->jindex;
112     jjnr             = nlist->jjnr;
113     shiftidx         = nlist->shift;
114     gid              = nlist->gid;
115     shiftvec         = fr->shift_vec[0];
116     fshift           = fr->fshift[0];
117     facel            = _mm_set1_pd(fr->ic->epsfac);
118     charge           = mdatoms->chargeA;
119     krf              = _mm_set1_pd(fr->ic->k_rf);
120     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
121     crf              = _mm_set1_pd(fr->ic->c_rf);
122
123     /* Setup water-specific parameters */
124     inr              = nlist->iinr[0];
125     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
126     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
127     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
128
129     jq1              = _mm_set1_pd(charge[inr+1]);
130     jq2              = _mm_set1_pd(charge[inr+2]);
131     jq3              = _mm_set1_pd(charge[inr+3]);
132     qq11             = _mm_mul_pd(iq1,jq1);
133     qq12             = _mm_mul_pd(iq1,jq2);
134     qq13             = _mm_mul_pd(iq1,jq3);
135     qq21             = _mm_mul_pd(iq2,jq1);
136     qq22             = _mm_mul_pd(iq2,jq2);
137     qq23             = _mm_mul_pd(iq2,jq3);
138     qq31             = _mm_mul_pd(iq3,jq1);
139     qq32             = _mm_mul_pd(iq3,jq2);
140     qq33             = _mm_mul_pd(iq3,jq3);
141
142     /* Avoid stupid compiler warnings */
143     jnrA = jnrB = 0;
144     j_coord_offsetA = 0;
145     j_coord_offsetB = 0;
146
147     outeriter        = 0;
148     inneriter        = 0;
149
150     /* Start outer loop over neighborlists */
151     for(iidx=0; iidx<nri; iidx++)
152     {
153         /* Load shift vector for this list */
154         i_shift_offset   = DIM*shiftidx[iidx];
155
156         /* Load limits for loop over neighbors */
157         j_index_start    = jindex[iidx];
158         j_index_end      = jindex[iidx+1];
159
160         /* Get outer coordinate index */
161         inr              = iinr[iidx];
162         i_coord_offset   = DIM*inr;
163
164         /* Load i particle coords and add shift vector */
165         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
166                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
167
168         fix1             = _mm_setzero_pd();
169         fiy1             = _mm_setzero_pd();
170         fiz1             = _mm_setzero_pd();
171         fix2             = _mm_setzero_pd();
172         fiy2             = _mm_setzero_pd();
173         fiz2             = _mm_setzero_pd();
174         fix3             = _mm_setzero_pd();
175         fiy3             = _mm_setzero_pd();
176         fiz3             = _mm_setzero_pd();
177
178         /* Reset potential sums */
179         velecsum         = _mm_setzero_pd();
180
181         /* Start inner kernel loop */
182         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
183         {
184
185             /* Get j neighbor index, and coordinate index */
186             jnrA             = jjnr[jidx];
187             jnrB             = jjnr[jidx+1];
188             j_coord_offsetA  = DIM*jnrA;
189             j_coord_offsetB  = DIM*jnrB;
190
191             /* load j atom coordinates */
192             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
193                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
194
195             /* Calculate displacement vector */
196             dx11             = _mm_sub_pd(ix1,jx1);
197             dy11             = _mm_sub_pd(iy1,jy1);
198             dz11             = _mm_sub_pd(iz1,jz1);
199             dx12             = _mm_sub_pd(ix1,jx2);
200             dy12             = _mm_sub_pd(iy1,jy2);
201             dz12             = _mm_sub_pd(iz1,jz2);
202             dx13             = _mm_sub_pd(ix1,jx3);
203             dy13             = _mm_sub_pd(iy1,jy3);
204             dz13             = _mm_sub_pd(iz1,jz3);
205             dx21             = _mm_sub_pd(ix2,jx1);
206             dy21             = _mm_sub_pd(iy2,jy1);
207             dz21             = _mm_sub_pd(iz2,jz1);
208             dx22             = _mm_sub_pd(ix2,jx2);
209             dy22             = _mm_sub_pd(iy2,jy2);
210             dz22             = _mm_sub_pd(iz2,jz2);
211             dx23             = _mm_sub_pd(ix2,jx3);
212             dy23             = _mm_sub_pd(iy2,jy3);
213             dz23             = _mm_sub_pd(iz2,jz3);
214             dx31             = _mm_sub_pd(ix3,jx1);
215             dy31             = _mm_sub_pd(iy3,jy1);
216             dz31             = _mm_sub_pd(iz3,jz1);
217             dx32             = _mm_sub_pd(ix3,jx2);
218             dy32             = _mm_sub_pd(iy3,jy2);
219             dz32             = _mm_sub_pd(iz3,jz2);
220             dx33             = _mm_sub_pd(ix3,jx3);
221             dy33             = _mm_sub_pd(iy3,jy3);
222             dz33             = _mm_sub_pd(iz3,jz3);
223
224             /* Calculate squared distance and things based on it */
225             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
226             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
227             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
228             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
229             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
230             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
231             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
232             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
233             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
234
235             rinv11           = avx128fma_invsqrt_d(rsq11);
236             rinv12           = avx128fma_invsqrt_d(rsq12);
237             rinv13           = avx128fma_invsqrt_d(rsq13);
238             rinv21           = avx128fma_invsqrt_d(rsq21);
239             rinv22           = avx128fma_invsqrt_d(rsq22);
240             rinv23           = avx128fma_invsqrt_d(rsq23);
241             rinv31           = avx128fma_invsqrt_d(rsq31);
242             rinv32           = avx128fma_invsqrt_d(rsq32);
243             rinv33           = avx128fma_invsqrt_d(rsq33);
244
245             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
246             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
247             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
248             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
249             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
250             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
251             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
252             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
253             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
254
255             fjx1             = _mm_setzero_pd();
256             fjy1             = _mm_setzero_pd();
257             fjz1             = _mm_setzero_pd();
258             fjx2             = _mm_setzero_pd();
259             fjy2             = _mm_setzero_pd();
260             fjz2             = _mm_setzero_pd();
261             fjx3             = _mm_setzero_pd();
262             fjy3             = _mm_setzero_pd();
263             fjz3             = _mm_setzero_pd();
264
265             /**************************
266              * CALCULATE INTERACTIONS *
267              **************************/
268
269             /* REACTION-FIELD ELECTROSTATICS */
270             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
271             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
272
273             /* Update potential sum for this i atom from the interaction with this j atom. */
274             velecsum         = _mm_add_pd(velecsum,velec);
275
276             fscal            = felec;
277
278             /* Update vectorial force */
279             fix1             = _mm_macc_pd(dx11,fscal,fix1);
280             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
281             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
282             
283             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
284             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
285             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
286
287             /**************************
288              * CALCULATE INTERACTIONS *
289              **************************/
290
291             /* REACTION-FIELD ELECTROSTATICS */
292             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
293             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
294
295             /* Update potential sum for this i atom from the interaction with this j atom. */
296             velecsum         = _mm_add_pd(velecsum,velec);
297
298             fscal            = felec;
299
300             /* Update vectorial force */
301             fix1             = _mm_macc_pd(dx12,fscal,fix1);
302             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
303             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
304             
305             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
306             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
307             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
308
309             /**************************
310              * CALCULATE INTERACTIONS *
311              **************************/
312
313             /* REACTION-FIELD ELECTROSTATICS */
314             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_macc_pd(krf,rsq13,rinv13),crf));
315             felec            = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
316
317             /* Update potential sum for this i atom from the interaction with this j atom. */
318             velecsum         = _mm_add_pd(velecsum,velec);
319
320             fscal            = felec;
321
322             /* Update vectorial force */
323             fix1             = _mm_macc_pd(dx13,fscal,fix1);
324             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
325             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
326             
327             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
328             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
329             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
330
331             /**************************
332              * CALCULATE INTERACTIONS *
333              **************************/
334
335             /* REACTION-FIELD ELECTROSTATICS */
336             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
337             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
338
339             /* Update potential sum for this i atom from the interaction with this j atom. */
340             velecsum         = _mm_add_pd(velecsum,velec);
341
342             fscal            = felec;
343
344             /* Update vectorial force */
345             fix2             = _mm_macc_pd(dx21,fscal,fix2);
346             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
347             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
348             
349             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
350             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
351             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
352
353             /**************************
354              * CALCULATE INTERACTIONS *
355              **************************/
356
357             /* REACTION-FIELD ELECTROSTATICS */
358             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
359             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             velecsum         = _mm_add_pd(velecsum,velec);
363
364             fscal            = felec;
365
366             /* Update vectorial force */
367             fix2             = _mm_macc_pd(dx22,fscal,fix2);
368             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
369             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
370             
371             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
372             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
373             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
374
375             /**************************
376              * CALCULATE INTERACTIONS *
377              **************************/
378
379             /* REACTION-FIELD ELECTROSTATICS */
380             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_macc_pd(krf,rsq23,rinv23),crf));
381             felec            = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
382
383             /* Update potential sum for this i atom from the interaction with this j atom. */
384             velecsum         = _mm_add_pd(velecsum,velec);
385
386             fscal            = felec;
387
388             /* Update vectorial force */
389             fix2             = _mm_macc_pd(dx23,fscal,fix2);
390             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
391             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
392             
393             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
394             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
395             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
396
397             /**************************
398              * CALCULATE INTERACTIONS *
399              **************************/
400
401             /* REACTION-FIELD ELECTROSTATICS */
402             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_macc_pd(krf,rsq31,rinv31),crf));
403             felec            = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
404
405             /* Update potential sum for this i atom from the interaction with this j atom. */
406             velecsum         = _mm_add_pd(velecsum,velec);
407
408             fscal            = felec;
409
410             /* Update vectorial force */
411             fix3             = _mm_macc_pd(dx31,fscal,fix3);
412             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
413             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
414             
415             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
416             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
417             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
418
419             /**************************
420              * CALCULATE INTERACTIONS *
421              **************************/
422
423             /* REACTION-FIELD ELECTROSTATICS */
424             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_macc_pd(krf,rsq32,rinv32),crf));
425             felec            = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
426
427             /* Update potential sum for this i atom from the interaction with this j atom. */
428             velecsum         = _mm_add_pd(velecsum,velec);
429
430             fscal            = felec;
431
432             /* Update vectorial force */
433             fix3             = _mm_macc_pd(dx32,fscal,fix3);
434             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
435             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
436             
437             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
438             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
439             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
440
441             /**************************
442              * CALCULATE INTERACTIONS *
443              **************************/
444
445             /* REACTION-FIELD ELECTROSTATICS */
446             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_macc_pd(krf,rsq33,rinv33),crf));
447             felec            = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
448
449             /* Update potential sum for this i atom from the interaction with this j atom. */
450             velecsum         = _mm_add_pd(velecsum,velec);
451
452             fscal            = felec;
453
454             /* Update vectorial force */
455             fix3             = _mm_macc_pd(dx33,fscal,fix3);
456             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
457             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
458             
459             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
460             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
461             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
462
463             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
464
465             /* Inner loop uses 315 flops */
466         }
467
468         if(jidx<j_index_end)
469         {
470
471             jnrA             = jjnr[jidx];
472             j_coord_offsetA  = DIM*jnrA;
473
474             /* load j atom coordinates */
475             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
476                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
477
478             /* Calculate displacement vector */
479             dx11             = _mm_sub_pd(ix1,jx1);
480             dy11             = _mm_sub_pd(iy1,jy1);
481             dz11             = _mm_sub_pd(iz1,jz1);
482             dx12             = _mm_sub_pd(ix1,jx2);
483             dy12             = _mm_sub_pd(iy1,jy2);
484             dz12             = _mm_sub_pd(iz1,jz2);
485             dx13             = _mm_sub_pd(ix1,jx3);
486             dy13             = _mm_sub_pd(iy1,jy3);
487             dz13             = _mm_sub_pd(iz1,jz3);
488             dx21             = _mm_sub_pd(ix2,jx1);
489             dy21             = _mm_sub_pd(iy2,jy1);
490             dz21             = _mm_sub_pd(iz2,jz1);
491             dx22             = _mm_sub_pd(ix2,jx2);
492             dy22             = _mm_sub_pd(iy2,jy2);
493             dz22             = _mm_sub_pd(iz2,jz2);
494             dx23             = _mm_sub_pd(ix2,jx3);
495             dy23             = _mm_sub_pd(iy2,jy3);
496             dz23             = _mm_sub_pd(iz2,jz3);
497             dx31             = _mm_sub_pd(ix3,jx1);
498             dy31             = _mm_sub_pd(iy3,jy1);
499             dz31             = _mm_sub_pd(iz3,jz1);
500             dx32             = _mm_sub_pd(ix3,jx2);
501             dy32             = _mm_sub_pd(iy3,jy2);
502             dz32             = _mm_sub_pd(iz3,jz2);
503             dx33             = _mm_sub_pd(ix3,jx3);
504             dy33             = _mm_sub_pd(iy3,jy3);
505             dz33             = _mm_sub_pd(iz3,jz3);
506
507             /* Calculate squared distance and things based on it */
508             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
509             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
510             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
511             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
512             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
513             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
514             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
515             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
516             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
517
518             rinv11           = avx128fma_invsqrt_d(rsq11);
519             rinv12           = avx128fma_invsqrt_d(rsq12);
520             rinv13           = avx128fma_invsqrt_d(rsq13);
521             rinv21           = avx128fma_invsqrt_d(rsq21);
522             rinv22           = avx128fma_invsqrt_d(rsq22);
523             rinv23           = avx128fma_invsqrt_d(rsq23);
524             rinv31           = avx128fma_invsqrt_d(rsq31);
525             rinv32           = avx128fma_invsqrt_d(rsq32);
526             rinv33           = avx128fma_invsqrt_d(rsq33);
527
528             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
529             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
530             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
531             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
532             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
533             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
534             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
535             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
536             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
537
538             fjx1             = _mm_setzero_pd();
539             fjy1             = _mm_setzero_pd();
540             fjz1             = _mm_setzero_pd();
541             fjx2             = _mm_setzero_pd();
542             fjy2             = _mm_setzero_pd();
543             fjz2             = _mm_setzero_pd();
544             fjx3             = _mm_setzero_pd();
545             fjy3             = _mm_setzero_pd();
546             fjz3             = _mm_setzero_pd();
547
548             /**************************
549              * CALCULATE INTERACTIONS *
550              **************************/
551
552             /* REACTION-FIELD ELECTROSTATICS */
553             velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
554             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
555
556             /* Update potential sum for this i atom from the interaction with this j atom. */
557             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
558             velecsum         = _mm_add_pd(velecsum,velec);
559
560             fscal            = felec;
561
562             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
563
564             /* Update vectorial force */
565             fix1             = _mm_macc_pd(dx11,fscal,fix1);
566             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
567             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
568             
569             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
570             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
571             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
572
573             /**************************
574              * CALCULATE INTERACTIONS *
575              **************************/
576
577             /* REACTION-FIELD ELECTROSTATICS */
578             velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
579             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
580
581             /* Update potential sum for this i atom from the interaction with this j atom. */
582             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
583             velecsum         = _mm_add_pd(velecsum,velec);
584
585             fscal            = felec;
586
587             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
588
589             /* Update vectorial force */
590             fix1             = _mm_macc_pd(dx12,fscal,fix1);
591             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
592             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
593             
594             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
595             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
596             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
597
598             /**************************
599              * CALCULATE INTERACTIONS *
600              **************************/
601
602             /* REACTION-FIELD ELECTROSTATICS */
603             velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_macc_pd(krf,rsq13,rinv13),crf));
604             felec            = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
605
606             /* Update potential sum for this i atom from the interaction with this j atom. */
607             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
608             velecsum         = _mm_add_pd(velecsum,velec);
609
610             fscal            = felec;
611
612             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
613
614             /* Update vectorial force */
615             fix1             = _mm_macc_pd(dx13,fscal,fix1);
616             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
617             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
618             
619             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
620             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
621             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
622
623             /**************************
624              * CALCULATE INTERACTIONS *
625              **************************/
626
627             /* REACTION-FIELD ELECTROSTATICS */
628             velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
629             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
630
631             /* Update potential sum for this i atom from the interaction with this j atom. */
632             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
633             velecsum         = _mm_add_pd(velecsum,velec);
634
635             fscal            = felec;
636
637             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
638
639             /* Update vectorial force */
640             fix2             = _mm_macc_pd(dx21,fscal,fix2);
641             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
642             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
643             
644             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
645             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
646             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
647
648             /**************************
649              * CALCULATE INTERACTIONS *
650              **************************/
651
652             /* REACTION-FIELD ELECTROSTATICS */
653             velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
654             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
655
656             /* Update potential sum for this i atom from the interaction with this j atom. */
657             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
658             velecsum         = _mm_add_pd(velecsum,velec);
659
660             fscal            = felec;
661
662             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
663
664             /* Update vectorial force */
665             fix2             = _mm_macc_pd(dx22,fscal,fix2);
666             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
667             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
668             
669             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
670             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
671             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
672
673             /**************************
674              * CALCULATE INTERACTIONS *
675              **************************/
676
677             /* REACTION-FIELD ELECTROSTATICS */
678             velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_macc_pd(krf,rsq23,rinv23),crf));
679             felec            = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
680
681             /* Update potential sum for this i atom from the interaction with this j atom. */
682             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
683             velecsum         = _mm_add_pd(velecsum,velec);
684
685             fscal            = felec;
686
687             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
688
689             /* Update vectorial force */
690             fix2             = _mm_macc_pd(dx23,fscal,fix2);
691             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
692             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
693             
694             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
695             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
696             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
697
698             /**************************
699              * CALCULATE INTERACTIONS *
700              **************************/
701
702             /* REACTION-FIELD ELECTROSTATICS */
703             velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_macc_pd(krf,rsq31,rinv31),crf));
704             felec            = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
705
706             /* Update potential sum for this i atom from the interaction with this j atom. */
707             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
708             velecsum         = _mm_add_pd(velecsum,velec);
709
710             fscal            = felec;
711
712             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
713
714             /* Update vectorial force */
715             fix3             = _mm_macc_pd(dx31,fscal,fix3);
716             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
717             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
718             
719             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
720             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
721             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
722
723             /**************************
724              * CALCULATE INTERACTIONS *
725              **************************/
726
727             /* REACTION-FIELD ELECTROSTATICS */
728             velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_macc_pd(krf,rsq32,rinv32),crf));
729             felec            = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
730
731             /* Update potential sum for this i atom from the interaction with this j atom. */
732             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
733             velecsum         = _mm_add_pd(velecsum,velec);
734
735             fscal            = felec;
736
737             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
738
739             /* Update vectorial force */
740             fix3             = _mm_macc_pd(dx32,fscal,fix3);
741             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
742             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
743             
744             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
745             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
746             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
747
748             /**************************
749              * CALCULATE INTERACTIONS *
750              **************************/
751
752             /* REACTION-FIELD ELECTROSTATICS */
753             velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_macc_pd(krf,rsq33,rinv33),crf));
754             felec            = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
755
756             /* Update potential sum for this i atom from the interaction with this j atom. */
757             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
758             velecsum         = _mm_add_pd(velecsum,velec);
759
760             fscal            = felec;
761
762             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
763
764             /* Update vectorial force */
765             fix3             = _mm_macc_pd(dx33,fscal,fix3);
766             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
767             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
768             
769             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
770             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
771             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
772
773             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
774
775             /* Inner loop uses 315 flops */
776         }
777
778         /* End of innermost loop */
779
780         gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
781                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
782
783         ggid                        = gid[iidx];
784         /* Update potential energies */
785         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
786
787         /* Increment number of inner iterations */
788         inneriter                  += j_index_end - j_index_start;
789
790         /* Outer loop uses 19 flops */
791     }
792
793     /* Increment number of outer iterations */
794     outeriter        += nri;
795
796     /* Update outer/inner flops */
797
798     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*315);
799 }
800 /*
801  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_double
802  * Electrostatics interaction: ReactionField
803  * VdW interaction:            None
804  * Geometry:                   Water4-Water4
805  * Calculate force/pot:        Force
806  */
807 void
808 nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_128_fma_double
809                     (t_nblist                    * gmx_restrict       nlist,
810                      rvec                        * gmx_restrict          xx,
811                      rvec                        * gmx_restrict          ff,
812                      struct t_forcerec           * gmx_restrict          fr,
813                      t_mdatoms                   * gmx_restrict     mdatoms,
814                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
815                      t_nrnb                      * gmx_restrict        nrnb)
816 {
817     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
818      * just 0 for non-waters.
819      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
820      * jnr indices corresponding to data put in the four positions in the SIMD register.
821      */
822     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
823     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
824     int              jnrA,jnrB;
825     int              j_coord_offsetA,j_coord_offsetB;
826     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
827     real             rcutoff_scalar;
828     real             *shiftvec,*fshift,*x,*f;
829     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
830     int              vdwioffset1;
831     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
832     int              vdwioffset2;
833     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
834     int              vdwioffset3;
835     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
836     int              vdwjidx1A,vdwjidx1B;
837     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
838     int              vdwjidx2A,vdwjidx2B;
839     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
840     int              vdwjidx3A,vdwjidx3B;
841     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
842     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
843     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
844     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
845     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
846     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
847     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
848     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
849     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
850     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
851     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
852     real             *charge;
853     __m128d          dummy_mask,cutoff_mask;
854     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
855     __m128d          one     = _mm_set1_pd(1.0);
856     __m128d          two     = _mm_set1_pd(2.0);
857     x                = xx[0];
858     f                = ff[0];
859
860     nri              = nlist->nri;
861     iinr             = nlist->iinr;
862     jindex           = nlist->jindex;
863     jjnr             = nlist->jjnr;
864     shiftidx         = nlist->shift;
865     gid              = nlist->gid;
866     shiftvec         = fr->shift_vec[0];
867     fshift           = fr->fshift[0];
868     facel            = _mm_set1_pd(fr->ic->epsfac);
869     charge           = mdatoms->chargeA;
870     krf              = _mm_set1_pd(fr->ic->k_rf);
871     krf2             = _mm_set1_pd(fr->ic->k_rf*2.0);
872     crf              = _mm_set1_pd(fr->ic->c_rf);
873
874     /* Setup water-specific parameters */
875     inr              = nlist->iinr[0];
876     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
877     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
878     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
879
880     jq1              = _mm_set1_pd(charge[inr+1]);
881     jq2              = _mm_set1_pd(charge[inr+2]);
882     jq3              = _mm_set1_pd(charge[inr+3]);
883     qq11             = _mm_mul_pd(iq1,jq1);
884     qq12             = _mm_mul_pd(iq1,jq2);
885     qq13             = _mm_mul_pd(iq1,jq3);
886     qq21             = _mm_mul_pd(iq2,jq1);
887     qq22             = _mm_mul_pd(iq2,jq2);
888     qq23             = _mm_mul_pd(iq2,jq3);
889     qq31             = _mm_mul_pd(iq3,jq1);
890     qq32             = _mm_mul_pd(iq3,jq2);
891     qq33             = _mm_mul_pd(iq3,jq3);
892
893     /* Avoid stupid compiler warnings */
894     jnrA = jnrB = 0;
895     j_coord_offsetA = 0;
896     j_coord_offsetB = 0;
897
898     outeriter        = 0;
899     inneriter        = 0;
900
901     /* Start outer loop over neighborlists */
902     for(iidx=0; iidx<nri; iidx++)
903     {
904         /* Load shift vector for this list */
905         i_shift_offset   = DIM*shiftidx[iidx];
906
907         /* Load limits for loop over neighbors */
908         j_index_start    = jindex[iidx];
909         j_index_end      = jindex[iidx+1];
910
911         /* Get outer coordinate index */
912         inr              = iinr[iidx];
913         i_coord_offset   = DIM*inr;
914
915         /* Load i particle coords and add shift vector */
916         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
917                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
918
919         fix1             = _mm_setzero_pd();
920         fiy1             = _mm_setzero_pd();
921         fiz1             = _mm_setzero_pd();
922         fix2             = _mm_setzero_pd();
923         fiy2             = _mm_setzero_pd();
924         fiz2             = _mm_setzero_pd();
925         fix3             = _mm_setzero_pd();
926         fiy3             = _mm_setzero_pd();
927         fiz3             = _mm_setzero_pd();
928
929         /* Start inner kernel loop */
930         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
931         {
932
933             /* Get j neighbor index, and coordinate index */
934             jnrA             = jjnr[jidx];
935             jnrB             = jjnr[jidx+1];
936             j_coord_offsetA  = DIM*jnrA;
937             j_coord_offsetB  = DIM*jnrB;
938
939             /* load j atom coordinates */
940             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
941                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
942
943             /* Calculate displacement vector */
944             dx11             = _mm_sub_pd(ix1,jx1);
945             dy11             = _mm_sub_pd(iy1,jy1);
946             dz11             = _mm_sub_pd(iz1,jz1);
947             dx12             = _mm_sub_pd(ix1,jx2);
948             dy12             = _mm_sub_pd(iy1,jy2);
949             dz12             = _mm_sub_pd(iz1,jz2);
950             dx13             = _mm_sub_pd(ix1,jx3);
951             dy13             = _mm_sub_pd(iy1,jy3);
952             dz13             = _mm_sub_pd(iz1,jz3);
953             dx21             = _mm_sub_pd(ix2,jx1);
954             dy21             = _mm_sub_pd(iy2,jy1);
955             dz21             = _mm_sub_pd(iz2,jz1);
956             dx22             = _mm_sub_pd(ix2,jx2);
957             dy22             = _mm_sub_pd(iy2,jy2);
958             dz22             = _mm_sub_pd(iz2,jz2);
959             dx23             = _mm_sub_pd(ix2,jx3);
960             dy23             = _mm_sub_pd(iy2,jy3);
961             dz23             = _mm_sub_pd(iz2,jz3);
962             dx31             = _mm_sub_pd(ix3,jx1);
963             dy31             = _mm_sub_pd(iy3,jy1);
964             dz31             = _mm_sub_pd(iz3,jz1);
965             dx32             = _mm_sub_pd(ix3,jx2);
966             dy32             = _mm_sub_pd(iy3,jy2);
967             dz32             = _mm_sub_pd(iz3,jz2);
968             dx33             = _mm_sub_pd(ix3,jx3);
969             dy33             = _mm_sub_pd(iy3,jy3);
970             dz33             = _mm_sub_pd(iz3,jz3);
971
972             /* Calculate squared distance and things based on it */
973             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
974             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
975             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
976             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
977             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
978             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
979             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
980             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
981             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
982
983             rinv11           = avx128fma_invsqrt_d(rsq11);
984             rinv12           = avx128fma_invsqrt_d(rsq12);
985             rinv13           = avx128fma_invsqrt_d(rsq13);
986             rinv21           = avx128fma_invsqrt_d(rsq21);
987             rinv22           = avx128fma_invsqrt_d(rsq22);
988             rinv23           = avx128fma_invsqrt_d(rsq23);
989             rinv31           = avx128fma_invsqrt_d(rsq31);
990             rinv32           = avx128fma_invsqrt_d(rsq32);
991             rinv33           = avx128fma_invsqrt_d(rsq33);
992
993             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
994             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
995             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
996             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
997             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
998             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
999             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1000             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1001             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1002
1003             fjx1             = _mm_setzero_pd();
1004             fjy1             = _mm_setzero_pd();
1005             fjz1             = _mm_setzero_pd();
1006             fjx2             = _mm_setzero_pd();
1007             fjy2             = _mm_setzero_pd();
1008             fjz2             = _mm_setzero_pd();
1009             fjx3             = _mm_setzero_pd();
1010             fjy3             = _mm_setzero_pd();
1011             fjz3             = _mm_setzero_pd();
1012
1013             /**************************
1014              * CALCULATE INTERACTIONS *
1015              **************************/
1016
1017             /* REACTION-FIELD ELECTROSTATICS */
1018             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1019
1020             fscal            = felec;
1021
1022             /* Update vectorial force */
1023             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1024             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1025             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1026             
1027             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1028             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1029             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1030
1031             /**************************
1032              * CALCULATE INTERACTIONS *
1033              **************************/
1034
1035             /* REACTION-FIELD ELECTROSTATICS */
1036             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1037
1038             fscal            = felec;
1039
1040             /* Update vectorial force */
1041             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1042             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1043             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1044             
1045             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1046             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1047             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             /* REACTION-FIELD ELECTROSTATICS */
1054             felec            = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
1055
1056             fscal            = felec;
1057
1058             /* Update vectorial force */
1059             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1060             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1061             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1062             
1063             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1064             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1065             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1066
1067             /**************************
1068              * CALCULATE INTERACTIONS *
1069              **************************/
1070
1071             /* REACTION-FIELD ELECTROSTATICS */
1072             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1073
1074             fscal            = felec;
1075
1076             /* Update vectorial force */
1077             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1078             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1079             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1080             
1081             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1082             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1083             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1084
1085             /**************************
1086              * CALCULATE INTERACTIONS *
1087              **************************/
1088
1089             /* REACTION-FIELD ELECTROSTATICS */
1090             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1091
1092             fscal            = felec;
1093
1094             /* Update vectorial force */
1095             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1096             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1097             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1098             
1099             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1100             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1101             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1102
1103             /**************************
1104              * CALCULATE INTERACTIONS *
1105              **************************/
1106
1107             /* REACTION-FIELD ELECTROSTATICS */
1108             felec            = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
1109
1110             fscal            = felec;
1111
1112             /* Update vectorial force */
1113             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1114             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1115             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1116             
1117             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1118             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1119             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1120
1121             /**************************
1122              * CALCULATE INTERACTIONS *
1123              **************************/
1124
1125             /* REACTION-FIELD ELECTROSTATICS */
1126             felec            = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
1127
1128             fscal            = felec;
1129
1130             /* Update vectorial force */
1131             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1132             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1133             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1134             
1135             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1136             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1137             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1138
1139             /**************************
1140              * CALCULATE INTERACTIONS *
1141              **************************/
1142
1143             /* REACTION-FIELD ELECTROSTATICS */
1144             felec            = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
1145
1146             fscal            = felec;
1147
1148             /* Update vectorial force */
1149             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1150             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1151             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1152             
1153             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1154             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1155             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1156
1157             /**************************
1158              * CALCULATE INTERACTIONS *
1159              **************************/
1160
1161             /* REACTION-FIELD ELECTROSTATICS */
1162             felec            = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
1163
1164             fscal            = felec;
1165
1166             /* Update vectorial force */
1167             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1168             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1169             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1170             
1171             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1172             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1173             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1174
1175             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1176
1177             /* Inner loop uses 270 flops */
1178         }
1179
1180         if(jidx<j_index_end)
1181         {
1182
1183             jnrA             = jjnr[jidx];
1184             j_coord_offsetA  = DIM*jnrA;
1185
1186             /* load j atom coordinates */
1187             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
1188                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1189
1190             /* Calculate displacement vector */
1191             dx11             = _mm_sub_pd(ix1,jx1);
1192             dy11             = _mm_sub_pd(iy1,jy1);
1193             dz11             = _mm_sub_pd(iz1,jz1);
1194             dx12             = _mm_sub_pd(ix1,jx2);
1195             dy12             = _mm_sub_pd(iy1,jy2);
1196             dz12             = _mm_sub_pd(iz1,jz2);
1197             dx13             = _mm_sub_pd(ix1,jx3);
1198             dy13             = _mm_sub_pd(iy1,jy3);
1199             dz13             = _mm_sub_pd(iz1,jz3);
1200             dx21             = _mm_sub_pd(ix2,jx1);
1201             dy21             = _mm_sub_pd(iy2,jy1);
1202             dz21             = _mm_sub_pd(iz2,jz1);
1203             dx22             = _mm_sub_pd(ix2,jx2);
1204             dy22             = _mm_sub_pd(iy2,jy2);
1205             dz22             = _mm_sub_pd(iz2,jz2);
1206             dx23             = _mm_sub_pd(ix2,jx3);
1207             dy23             = _mm_sub_pd(iy2,jy3);
1208             dz23             = _mm_sub_pd(iz2,jz3);
1209             dx31             = _mm_sub_pd(ix3,jx1);
1210             dy31             = _mm_sub_pd(iy3,jy1);
1211             dz31             = _mm_sub_pd(iz3,jz1);
1212             dx32             = _mm_sub_pd(ix3,jx2);
1213             dy32             = _mm_sub_pd(iy3,jy2);
1214             dz32             = _mm_sub_pd(iz3,jz2);
1215             dx33             = _mm_sub_pd(ix3,jx3);
1216             dy33             = _mm_sub_pd(iy3,jy3);
1217             dz33             = _mm_sub_pd(iz3,jz3);
1218
1219             /* Calculate squared distance and things based on it */
1220             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1221             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1222             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1223             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1224             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1225             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1226             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1227             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1228             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1229
1230             rinv11           = avx128fma_invsqrt_d(rsq11);
1231             rinv12           = avx128fma_invsqrt_d(rsq12);
1232             rinv13           = avx128fma_invsqrt_d(rsq13);
1233             rinv21           = avx128fma_invsqrt_d(rsq21);
1234             rinv22           = avx128fma_invsqrt_d(rsq22);
1235             rinv23           = avx128fma_invsqrt_d(rsq23);
1236             rinv31           = avx128fma_invsqrt_d(rsq31);
1237             rinv32           = avx128fma_invsqrt_d(rsq32);
1238             rinv33           = avx128fma_invsqrt_d(rsq33);
1239
1240             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1241             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1242             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1243             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1244             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1245             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1246             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1247             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1248             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1249
1250             fjx1             = _mm_setzero_pd();
1251             fjy1             = _mm_setzero_pd();
1252             fjz1             = _mm_setzero_pd();
1253             fjx2             = _mm_setzero_pd();
1254             fjy2             = _mm_setzero_pd();
1255             fjz2             = _mm_setzero_pd();
1256             fjx3             = _mm_setzero_pd();
1257             fjy3             = _mm_setzero_pd();
1258             fjz3             = _mm_setzero_pd();
1259
1260             /**************************
1261              * CALCULATE INTERACTIONS *
1262              **************************/
1263
1264             /* REACTION-FIELD ELECTROSTATICS */
1265             felec            = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1266
1267             fscal            = felec;
1268
1269             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1270
1271             /* Update vectorial force */
1272             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1273             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1274             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1275             
1276             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1277             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1278             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1279
1280             /**************************
1281              * CALCULATE INTERACTIONS *
1282              **************************/
1283
1284             /* REACTION-FIELD ELECTROSTATICS */
1285             felec            = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1286
1287             fscal            = felec;
1288
1289             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1290
1291             /* Update vectorial force */
1292             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1293             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1294             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1295             
1296             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1297             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1298             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1299
1300             /**************************
1301              * CALCULATE INTERACTIONS *
1302              **************************/
1303
1304             /* REACTION-FIELD ELECTROSTATICS */
1305             felec            = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
1306
1307             fscal            = felec;
1308
1309             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1310
1311             /* Update vectorial force */
1312             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1313             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1314             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1315             
1316             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1317             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1318             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1319
1320             /**************************
1321              * CALCULATE INTERACTIONS *
1322              **************************/
1323
1324             /* REACTION-FIELD ELECTROSTATICS */
1325             felec            = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1326
1327             fscal            = felec;
1328
1329             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1330
1331             /* Update vectorial force */
1332             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1333             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1334             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1335             
1336             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1337             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1338             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1339
1340             /**************************
1341              * CALCULATE INTERACTIONS *
1342              **************************/
1343
1344             /* REACTION-FIELD ELECTROSTATICS */
1345             felec            = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1346
1347             fscal            = felec;
1348
1349             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1350
1351             /* Update vectorial force */
1352             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1353             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1354             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1355             
1356             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1357             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1358             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1359
1360             /**************************
1361              * CALCULATE INTERACTIONS *
1362              **************************/
1363
1364             /* REACTION-FIELD ELECTROSTATICS */
1365             felec            = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
1366
1367             fscal            = felec;
1368
1369             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1370
1371             /* Update vectorial force */
1372             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1373             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1374             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1375             
1376             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1377             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1378             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1379
1380             /**************************
1381              * CALCULATE INTERACTIONS *
1382              **************************/
1383
1384             /* REACTION-FIELD ELECTROSTATICS */
1385             felec            = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
1386
1387             fscal            = felec;
1388
1389             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1390
1391             /* Update vectorial force */
1392             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1393             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1394             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1395             
1396             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1397             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1398             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1399
1400             /**************************
1401              * CALCULATE INTERACTIONS *
1402              **************************/
1403
1404             /* REACTION-FIELD ELECTROSTATICS */
1405             felec            = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
1406
1407             fscal            = felec;
1408
1409             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1410
1411             /* Update vectorial force */
1412             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1413             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1414             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1415             
1416             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1417             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1418             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1419
1420             /**************************
1421              * CALCULATE INTERACTIONS *
1422              **************************/
1423
1424             /* REACTION-FIELD ELECTROSTATICS */
1425             felec            = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
1426
1427             fscal            = felec;
1428
1429             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1430
1431             /* Update vectorial force */
1432             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1433             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1434             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1435             
1436             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1437             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1438             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1439
1440             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1441
1442             /* Inner loop uses 270 flops */
1443         }
1444
1445         /* End of innermost loop */
1446
1447         gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1448                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1449
1450         /* Increment number of inner iterations */
1451         inneriter                  += j_index_end - j_index_start;
1452
1453         /* Outer loop uses 18 flops */
1454     }
1455
1456     /* Increment number of outer iterations */
1457     outeriter        += nri;
1458
1459     /* Update outer/inner flops */
1460
1461     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
1462 }