Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
110     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
111     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
112     real             rswitch_scalar,d_scalar;
113     __m128           dummy_mask,cutoff_mask;
114     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
115     __m128           one     = _mm_set1_ps(1.0);
116     __m128           two     = _mm_set1_ps(2.0);
117     x                = xx[0];
118     f                = ff[0];
119
120     nri              = nlist->nri;
121     iinr             = nlist->iinr;
122     jindex           = nlist->jindex;
123     jjnr             = nlist->jjnr;
124     shiftidx         = nlist->shift;
125     gid              = nlist->gid;
126     shiftvec         = fr->shift_vec[0];
127     fshift           = fr->fshift[0];
128     facel            = _mm_set1_ps(fr->ic->epsfac);
129     charge           = mdatoms->chargeA;
130     krf              = _mm_set1_ps(fr->ic->k_rf);
131     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
132     crf              = _mm_set1_ps(fr->ic->c_rf);
133     nvdwtype         = fr->ntype;
134     vdwparam         = fr->nbfp;
135     vdwtype          = mdatoms->typeA;
136
137     /* Setup water-specific parameters */
138     inr              = nlist->iinr[0];
139     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
140     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
141     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
142     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
143
144     jq0              = _mm_set1_ps(charge[inr+0]);
145     jq1              = _mm_set1_ps(charge[inr+1]);
146     jq2              = _mm_set1_ps(charge[inr+2]);
147     vdwjidx0A        = 2*vdwtype[inr+0];
148     qq00             = _mm_mul_ps(iq0,jq0);
149     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
150     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
151     qq01             = _mm_mul_ps(iq0,jq1);
152     qq02             = _mm_mul_ps(iq0,jq2);
153     qq10             = _mm_mul_ps(iq1,jq0);
154     qq11             = _mm_mul_ps(iq1,jq1);
155     qq12             = _mm_mul_ps(iq1,jq2);
156     qq20             = _mm_mul_ps(iq2,jq0);
157     qq21             = _mm_mul_ps(iq2,jq1);
158     qq22             = _mm_mul_ps(iq2,jq2);
159
160     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
161     rcutoff_scalar   = fr->ic->rcoulomb;
162     rcutoff          = _mm_set1_ps(rcutoff_scalar);
163     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
164
165     rswitch_scalar   = fr->ic->rvdw_switch;
166     rswitch          = _mm_set1_ps(rswitch_scalar);
167     /* Setup switch parameters */
168     d_scalar         = rcutoff_scalar-rswitch_scalar;
169     d                = _mm_set1_ps(d_scalar);
170     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
171     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
172     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
173     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
174     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176
177     /* Avoid stupid compiler warnings */
178     jnrA = jnrB = jnrC = jnrD = 0;
179     j_coord_offsetA = 0;
180     j_coord_offsetB = 0;
181     j_coord_offsetC = 0;
182     j_coord_offsetD = 0;
183
184     outeriter        = 0;
185     inneriter        = 0;
186
187     for(iidx=0;iidx<4*DIM;iidx++)
188     {
189         scratch[iidx] = 0.0;
190     }
191
192     /* Start outer loop over neighborlists */
193     for(iidx=0; iidx<nri; iidx++)
194     {
195         /* Load shift vector for this list */
196         i_shift_offset   = DIM*shiftidx[iidx];
197
198         /* Load limits for loop over neighbors */
199         j_index_start    = jindex[iidx];
200         j_index_end      = jindex[iidx+1];
201
202         /* Get outer coordinate index */
203         inr              = iinr[iidx];
204         i_coord_offset   = DIM*inr;
205
206         /* Load i particle coords and add shift vector */
207         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
208                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
209
210         fix0             = _mm_setzero_ps();
211         fiy0             = _mm_setzero_ps();
212         fiz0             = _mm_setzero_ps();
213         fix1             = _mm_setzero_ps();
214         fiy1             = _mm_setzero_ps();
215         fiz1             = _mm_setzero_ps();
216         fix2             = _mm_setzero_ps();
217         fiy2             = _mm_setzero_ps();
218         fiz2             = _mm_setzero_ps();
219
220         /* Reset potential sums */
221         velecsum         = _mm_setzero_ps();
222         vvdwsum          = _mm_setzero_ps();
223
224         /* Start inner kernel loop */
225         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
226         {
227
228             /* Get j neighbor index, and coordinate index */
229             jnrA             = jjnr[jidx];
230             jnrB             = jjnr[jidx+1];
231             jnrC             = jjnr[jidx+2];
232             jnrD             = jjnr[jidx+3];
233             j_coord_offsetA  = DIM*jnrA;
234             j_coord_offsetB  = DIM*jnrB;
235             j_coord_offsetC  = DIM*jnrC;
236             j_coord_offsetD  = DIM*jnrD;
237
238             /* load j atom coordinates */
239             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
240                                               x+j_coord_offsetC,x+j_coord_offsetD,
241                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
242
243             /* Calculate displacement vector */
244             dx00             = _mm_sub_ps(ix0,jx0);
245             dy00             = _mm_sub_ps(iy0,jy0);
246             dz00             = _mm_sub_ps(iz0,jz0);
247             dx01             = _mm_sub_ps(ix0,jx1);
248             dy01             = _mm_sub_ps(iy0,jy1);
249             dz01             = _mm_sub_ps(iz0,jz1);
250             dx02             = _mm_sub_ps(ix0,jx2);
251             dy02             = _mm_sub_ps(iy0,jy2);
252             dz02             = _mm_sub_ps(iz0,jz2);
253             dx10             = _mm_sub_ps(ix1,jx0);
254             dy10             = _mm_sub_ps(iy1,jy0);
255             dz10             = _mm_sub_ps(iz1,jz0);
256             dx11             = _mm_sub_ps(ix1,jx1);
257             dy11             = _mm_sub_ps(iy1,jy1);
258             dz11             = _mm_sub_ps(iz1,jz1);
259             dx12             = _mm_sub_ps(ix1,jx2);
260             dy12             = _mm_sub_ps(iy1,jy2);
261             dz12             = _mm_sub_ps(iz1,jz2);
262             dx20             = _mm_sub_ps(ix2,jx0);
263             dy20             = _mm_sub_ps(iy2,jy0);
264             dz20             = _mm_sub_ps(iz2,jz0);
265             dx21             = _mm_sub_ps(ix2,jx1);
266             dy21             = _mm_sub_ps(iy2,jy1);
267             dz21             = _mm_sub_ps(iz2,jz1);
268             dx22             = _mm_sub_ps(ix2,jx2);
269             dy22             = _mm_sub_ps(iy2,jy2);
270             dz22             = _mm_sub_ps(iz2,jz2);
271
272             /* Calculate squared distance and things based on it */
273             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
274             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
275             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
276             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
277             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
278             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
279             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
280             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
281             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
282
283             rinv00           = avx128fma_invsqrt_f(rsq00);
284             rinv01           = avx128fma_invsqrt_f(rsq01);
285             rinv02           = avx128fma_invsqrt_f(rsq02);
286             rinv10           = avx128fma_invsqrt_f(rsq10);
287             rinv11           = avx128fma_invsqrt_f(rsq11);
288             rinv12           = avx128fma_invsqrt_f(rsq12);
289             rinv20           = avx128fma_invsqrt_f(rsq20);
290             rinv21           = avx128fma_invsqrt_f(rsq21);
291             rinv22           = avx128fma_invsqrt_f(rsq22);
292
293             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
294             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
295             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
296             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
297             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
298             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
299             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
300             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
301             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
302
303             fjx0             = _mm_setzero_ps();
304             fjy0             = _mm_setzero_ps();
305             fjz0             = _mm_setzero_ps();
306             fjx1             = _mm_setzero_ps();
307             fjy1             = _mm_setzero_ps();
308             fjz1             = _mm_setzero_ps();
309             fjx2             = _mm_setzero_ps();
310             fjy2             = _mm_setzero_ps();
311             fjz2             = _mm_setzero_ps();
312
313             /**************************
314              * CALCULATE INTERACTIONS *
315              **************************/
316
317             if (gmx_mm_any_lt(rsq00,rcutoff2))
318             {
319
320             r00              = _mm_mul_ps(rsq00,rinv00);
321
322             /* REACTION-FIELD ELECTROSTATICS */
323             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_macc_ps(krf,rsq00,rinv00),crf));
324             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
325
326             /* LENNARD-JONES DISPERSION/REPULSION */
327
328             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
329             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
330             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
331             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
332             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
333
334             d                = _mm_sub_ps(r00,rswitch);
335             d                = _mm_max_ps(d,_mm_setzero_ps());
336             d2               = _mm_mul_ps(d,d);
337             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
338
339             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
340
341             /* Evaluate switch function */
342             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
343             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
344             vvdw             = _mm_mul_ps(vvdw,sw);
345             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
346
347             /* Update potential sum for this i atom from the interaction with this j atom. */
348             velec            = _mm_and_ps(velec,cutoff_mask);
349             velecsum         = _mm_add_ps(velecsum,velec);
350             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
351             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
352
353             fscal            = _mm_add_ps(felec,fvdw);
354
355             fscal            = _mm_and_ps(fscal,cutoff_mask);
356
357              /* Update vectorial force */
358             fix0             = _mm_macc_ps(dx00,fscal,fix0);
359             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
360             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
361
362             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
363             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
364             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
365
366             }
367
368             /**************************
369              * CALCULATE INTERACTIONS *
370              **************************/
371
372             if (gmx_mm_any_lt(rsq01,rcutoff2))
373             {
374
375             /* REACTION-FIELD ELECTROSTATICS */
376             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_macc_ps(krf,rsq01,rinv01),crf));
377             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
378
379             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
380
381             /* Update potential sum for this i atom from the interaction with this j atom. */
382             velec            = _mm_and_ps(velec,cutoff_mask);
383             velecsum         = _mm_add_ps(velecsum,velec);
384
385             fscal            = felec;
386
387             fscal            = _mm_and_ps(fscal,cutoff_mask);
388
389              /* Update vectorial force */
390             fix0             = _mm_macc_ps(dx01,fscal,fix0);
391             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
392             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
393
394             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
395             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
396             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
397
398             }
399
400             /**************************
401              * CALCULATE INTERACTIONS *
402              **************************/
403
404             if (gmx_mm_any_lt(rsq02,rcutoff2))
405             {
406
407             /* REACTION-FIELD ELECTROSTATICS */
408             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_macc_ps(krf,rsq02,rinv02),crf));
409             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
410
411             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velec            = _mm_and_ps(velec,cutoff_mask);
415             velecsum         = _mm_add_ps(velecsum,velec);
416
417             fscal            = felec;
418
419             fscal            = _mm_and_ps(fscal,cutoff_mask);
420
421              /* Update vectorial force */
422             fix0             = _mm_macc_ps(dx02,fscal,fix0);
423             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
424             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
425
426             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
427             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
428             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
429
430             }
431
432             /**************************
433              * CALCULATE INTERACTIONS *
434              **************************/
435
436             if (gmx_mm_any_lt(rsq10,rcutoff2))
437             {
438
439             /* REACTION-FIELD ELECTROSTATICS */
440             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
441             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
442
443             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
444
445             /* Update potential sum for this i atom from the interaction with this j atom. */
446             velec            = _mm_and_ps(velec,cutoff_mask);
447             velecsum         = _mm_add_ps(velecsum,velec);
448
449             fscal            = felec;
450
451             fscal            = _mm_and_ps(fscal,cutoff_mask);
452
453              /* Update vectorial force */
454             fix1             = _mm_macc_ps(dx10,fscal,fix1);
455             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
456             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
457
458             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
459             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
460             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
461
462             }
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             if (gmx_mm_any_lt(rsq11,rcutoff2))
469             {
470
471             /* REACTION-FIELD ELECTROSTATICS */
472             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
473             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
474
475             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velec            = _mm_and_ps(velec,cutoff_mask);
479             velecsum         = _mm_add_ps(velecsum,velec);
480
481             fscal            = felec;
482
483             fscal            = _mm_and_ps(fscal,cutoff_mask);
484
485              /* Update vectorial force */
486             fix1             = _mm_macc_ps(dx11,fscal,fix1);
487             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
488             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
489
490             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
491             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
492             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
493
494             }
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             if (gmx_mm_any_lt(rsq12,rcutoff2))
501             {
502
503             /* REACTION-FIELD ELECTROSTATICS */
504             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
505             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
506
507             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velec            = _mm_and_ps(velec,cutoff_mask);
511             velecsum         = _mm_add_ps(velecsum,velec);
512
513             fscal            = felec;
514
515             fscal            = _mm_and_ps(fscal,cutoff_mask);
516
517              /* Update vectorial force */
518             fix1             = _mm_macc_ps(dx12,fscal,fix1);
519             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
520             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
521
522             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
523             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
524             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
525
526             }
527
528             /**************************
529              * CALCULATE INTERACTIONS *
530              **************************/
531
532             if (gmx_mm_any_lt(rsq20,rcutoff2))
533             {
534
535             /* REACTION-FIELD ELECTROSTATICS */
536             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
537             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
538
539             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
540
541             /* Update potential sum for this i atom from the interaction with this j atom. */
542             velec            = _mm_and_ps(velec,cutoff_mask);
543             velecsum         = _mm_add_ps(velecsum,velec);
544
545             fscal            = felec;
546
547             fscal            = _mm_and_ps(fscal,cutoff_mask);
548
549              /* Update vectorial force */
550             fix2             = _mm_macc_ps(dx20,fscal,fix2);
551             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
552             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
553
554             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
555             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
556             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
557
558             }
559
560             /**************************
561              * CALCULATE INTERACTIONS *
562              **************************/
563
564             if (gmx_mm_any_lt(rsq21,rcutoff2))
565             {
566
567             /* REACTION-FIELD ELECTROSTATICS */
568             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
569             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
570
571             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
572
573             /* Update potential sum for this i atom from the interaction with this j atom. */
574             velec            = _mm_and_ps(velec,cutoff_mask);
575             velecsum         = _mm_add_ps(velecsum,velec);
576
577             fscal            = felec;
578
579             fscal            = _mm_and_ps(fscal,cutoff_mask);
580
581              /* Update vectorial force */
582             fix2             = _mm_macc_ps(dx21,fscal,fix2);
583             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
584             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
585
586             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
587             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
588             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
589
590             }
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             if (gmx_mm_any_lt(rsq22,rcutoff2))
597             {
598
599             /* REACTION-FIELD ELECTROSTATICS */
600             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
601             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
602
603             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
604
605             /* Update potential sum for this i atom from the interaction with this j atom. */
606             velec            = _mm_and_ps(velec,cutoff_mask);
607             velecsum         = _mm_add_ps(velecsum,velec);
608
609             fscal            = felec;
610
611             fscal            = _mm_and_ps(fscal,cutoff_mask);
612
613              /* Update vectorial force */
614             fix2             = _mm_macc_ps(dx22,fscal,fix2);
615             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
616             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
617
618             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
619             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
620             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
621
622             }
623
624             fjptrA             = f+j_coord_offsetA;
625             fjptrB             = f+j_coord_offsetB;
626             fjptrC             = f+j_coord_offsetC;
627             fjptrD             = f+j_coord_offsetD;
628
629             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
630                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
631
632             /* Inner loop uses 385 flops */
633         }
634
635         if(jidx<j_index_end)
636         {
637
638             /* Get j neighbor index, and coordinate index */
639             jnrlistA         = jjnr[jidx];
640             jnrlistB         = jjnr[jidx+1];
641             jnrlistC         = jjnr[jidx+2];
642             jnrlistD         = jjnr[jidx+3];
643             /* Sign of each element will be negative for non-real atoms.
644              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
645              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
646              */
647             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
648             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
649             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
650             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
651             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
652             j_coord_offsetA  = DIM*jnrA;
653             j_coord_offsetB  = DIM*jnrB;
654             j_coord_offsetC  = DIM*jnrC;
655             j_coord_offsetD  = DIM*jnrD;
656
657             /* load j atom coordinates */
658             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
659                                               x+j_coord_offsetC,x+j_coord_offsetD,
660                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
661
662             /* Calculate displacement vector */
663             dx00             = _mm_sub_ps(ix0,jx0);
664             dy00             = _mm_sub_ps(iy0,jy0);
665             dz00             = _mm_sub_ps(iz0,jz0);
666             dx01             = _mm_sub_ps(ix0,jx1);
667             dy01             = _mm_sub_ps(iy0,jy1);
668             dz01             = _mm_sub_ps(iz0,jz1);
669             dx02             = _mm_sub_ps(ix0,jx2);
670             dy02             = _mm_sub_ps(iy0,jy2);
671             dz02             = _mm_sub_ps(iz0,jz2);
672             dx10             = _mm_sub_ps(ix1,jx0);
673             dy10             = _mm_sub_ps(iy1,jy0);
674             dz10             = _mm_sub_ps(iz1,jz0);
675             dx11             = _mm_sub_ps(ix1,jx1);
676             dy11             = _mm_sub_ps(iy1,jy1);
677             dz11             = _mm_sub_ps(iz1,jz1);
678             dx12             = _mm_sub_ps(ix1,jx2);
679             dy12             = _mm_sub_ps(iy1,jy2);
680             dz12             = _mm_sub_ps(iz1,jz2);
681             dx20             = _mm_sub_ps(ix2,jx0);
682             dy20             = _mm_sub_ps(iy2,jy0);
683             dz20             = _mm_sub_ps(iz2,jz0);
684             dx21             = _mm_sub_ps(ix2,jx1);
685             dy21             = _mm_sub_ps(iy2,jy1);
686             dz21             = _mm_sub_ps(iz2,jz1);
687             dx22             = _mm_sub_ps(ix2,jx2);
688             dy22             = _mm_sub_ps(iy2,jy2);
689             dz22             = _mm_sub_ps(iz2,jz2);
690
691             /* Calculate squared distance and things based on it */
692             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
693             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
694             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
695             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
696             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
697             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
698             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
699             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
700             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
701
702             rinv00           = avx128fma_invsqrt_f(rsq00);
703             rinv01           = avx128fma_invsqrt_f(rsq01);
704             rinv02           = avx128fma_invsqrt_f(rsq02);
705             rinv10           = avx128fma_invsqrt_f(rsq10);
706             rinv11           = avx128fma_invsqrt_f(rsq11);
707             rinv12           = avx128fma_invsqrt_f(rsq12);
708             rinv20           = avx128fma_invsqrt_f(rsq20);
709             rinv21           = avx128fma_invsqrt_f(rsq21);
710             rinv22           = avx128fma_invsqrt_f(rsq22);
711
712             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
713             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
714             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
715             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
716             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
717             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
718             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
719             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
720             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
721
722             fjx0             = _mm_setzero_ps();
723             fjy0             = _mm_setzero_ps();
724             fjz0             = _mm_setzero_ps();
725             fjx1             = _mm_setzero_ps();
726             fjy1             = _mm_setzero_ps();
727             fjz1             = _mm_setzero_ps();
728             fjx2             = _mm_setzero_ps();
729             fjy2             = _mm_setzero_ps();
730             fjz2             = _mm_setzero_ps();
731
732             /**************************
733              * CALCULATE INTERACTIONS *
734              **************************/
735
736             if (gmx_mm_any_lt(rsq00,rcutoff2))
737             {
738
739             r00              = _mm_mul_ps(rsq00,rinv00);
740             r00              = _mm_andnot_ps(dummy_mask,r00);
741
742             /* REACTION-FIELD ELECTROSTATICS */
743             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_macc_ps(krf,rsq00,rinv00),crf));
744             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
745
746             /* LENNARD-JONES DISPERSION/REPULSION */
747
748             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
749             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
750             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
751             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
752             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
753
754             d                = _mm_sub_ps(r00,rswitch);
755             d                = _mm_max_ps(d,_mm_setzero_ps());
756             d2               = _mm_mul_ps(d,d);
757             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
758
759             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
760
761             /* Evaluate switch function */
762             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
763             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
764             vvdw             = _mm_mul_ps(vvdw,sw);
765             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
766
767             /* Update potential sum for this i atom from the interaction with this j atom. */
768             velec            = _mm_and_ps(velec,cutoff_mask);
769             velec            = _mm_andnot_ps(dummy_mask,velec);
770             velecsum         = _mm_add_ps(velecsum,velec);
771             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
772             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
773             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
774
775             fscal            = _mm_add_ps(felec,fvdw);
776
777             fscal            = _mm_and_ps(fscal,cutoff_mask);
778
779             fscal            = _mm_andnot_ps(dummy_mask,fscal);
780
781              /* Update vectorial force */
782             fix0             = _mm_macc_ps(dx00,fscal,fix0);
783             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
784             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
785
786             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
787             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
788             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
789
790             }
791
792             /**************************
793              * CALCULATE INTERACTIONS *
794              **************************/
795
796             if (gmx_mm_any_lt(rsq01,rcutoff2))
797             {
798
799             /* REACTION-FIELD ELECTROSTATICS */
800             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_macc_ps(krf,rsq01,rinv01),crf));
801             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
802
803             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
804
805             /* Update potential sum for this i atom from the interaction with this j atom. */
806             velec            = _mm_and_ps(velec,cutoff_mask);
807             velec            = _mm_andnot_ps(dummy_mask,velec);
808             velecsum         = _mm_add_ps(velecsum,velec);
809
810             fscal            = felec;
811
812             fscal            = _mm_and_ps(fscal,cutoff_mask);
813
814             fscal            = _mm_andnot_ps(dummy_mask,fscal);
815
816              /* Update vectorial force */
817             fix0             = _mm_macc_ps(dx01,fscal,fix0);
818             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
819             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
820
821             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
822             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
823             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
824
825             }
826
827             /**************************
828              * CALCULATE INTERACTIONS *
829              **************************/
830
831             if (gmx_mm_any_lt(rsq02,rcutoff2))
832             {
833
834             /* REACTION-FIELD ELECTROSTATICS */
835             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_macc_ps(krf,rsq02,rinv02),crf));
836             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
837
838             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
839
840             /* Update potential sum for this i atom from the interaction with this j atom. */
841             velec            = _mm_and_ps(velec,cutoff_mask);
842             velec            = _mm_andnot_ps(dummy_mask,velec);
843             velecsum         = _mm_add_ps(velecsum,velec);
844
845             fscal            = felec;
846
847             fscal            = _mm_and_ps(fscal,cutoff_mask);
848
849             fscal            = _mm_andnot_ps(dummy_mask,fscal);
850
851              /* Update vectorial force */
852             fix0             = _mm_macc_ps(dx02,fscal,fix0);
853             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
854             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
855
856             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
857             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
858             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
859
860             }
861
862             /**************************
863              * CALCULATE INTERACTIONS *
864              **************************/
865
866             if (gmx_mm_any_lt(rsq10,rcutoff2))
867             {
868
869             /* REACTION-FIELD ELECTROSTATICS */
870             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
871             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
872
873             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
874
875             /* Update potential sum for this i atom from the interaction with this j atom. */
876             velec            = _mm_and_ps(velec,cutoff_mask);
877             velec            = _mm_andnot_ps(dummy_mask,velec);
878             velecsum         = _mm_add_ps(velecsum,velec);
879
880             fscal            = felec;
881
882             fscal            = _mm_and_ps(fscal,cutoff_mask);
883
884             fscal            = _mm_andnot_ps(dummy_mask,fscal);
885
886              /* Update vectorial force */
887             fix1             = _mm_macc_ps(dx10,fscal,fix1);
888             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
889             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
890
891             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
892             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
893             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
894
895             }
896
897             /**************************
898              * CALCULATE INTERACTIONS *
899              **************************/
900
901             if (gmx_mm_any_lt(rsq11,rcutoff2))
902             {
903
904             /* REACTION-FIELD ELECTROSTATICS */
905             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
906             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
907
908             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
909
910             /* Update potential sum for this i atom from the interaction with this j atom. */
911             velec            = _mm_and_ps(velec,cutoff_mask);
912             velec            = _mm_andnot_ps(dummy_mask,velec);
913             velecsum         = _mm_add_ps(velecsum,velec);
914
915             fscal            = felec;
916
917             fscal            = _mm_and_ps(fscal,cutoff_mask);
918
919             fscal            = _mm_andnot_ps(dummy_mask,fscal);
920
921              /* Update vectorial force */
922             fix1             = _mm_macc_ps(dx11,fscal,fix1);
923             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
924             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
925
926             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
927             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
928             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
929
930             }
931
932             /**************************
933              * CALCULATE INTERACTIONS *
934              **************************/
935
936             if (gmx_mm_any_lt(rsq12,rcutoff2))
937             {
938
939             /* REACTION-FIELD ELECTROSTATICS */
940             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
941             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
942
943             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
944
945             /* Update potential sum for this i atom from the interaction with this j atom. */
946             velec            = _mm_and_ps(velec,cutoff_mask);
947             velec            = _mm_andnot_ps(dummy_mask,velec);
948             velecsum         = _mm_add_ps(velecsum,velec);
949
950             fscal            = felec;
951
952             fscal            = _mm_and_ps(fscal,cutoff_mask);
953
954             fscal            = _mm_andnot_ps(dummy_mask,fscal);
955
956              /* Update vectorial force */
957             fix1             = _mm_macc_ps(dx12,fscal,fix1);
958             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
959             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
960
961             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
962             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
963             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
964
965             }
966
967             /**************************
968              * CALCULATE INTERACTIONS *
969              **************************/
970
971             if (gmx_mm_any_lt(rsq20,rcutoff2))
972             {
973
974             /* REACTION-FIELD ELECTROSTATICS */
975             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
976             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
977
978             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
979
980             /* Update potential sum for this i atom from the interaction with this j atom. */
981             velec            = _mm_and_ps(velec,cutoff_mask);
982             velec            = _mm_andnot_ps(dummy_mask,velec);
983             velecsum         = _mm_add_ps(velecsum,velec);
984
985             fscal            = felec;
986
987             fscal            = _mm_and_ps(fscal,cutoff_mask);
988
989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
990
991              /* Update vectorial force */
992             fix2             = _mm_macc_ps(dx20,fscal,fix2);
993             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
994             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
995
996             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
997             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
998             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
999
1000             }
1001
1002             /**************************
1003              * CALCULATE INTERACTIONS *
1004              **************************/
1005
1006             if (gmx_mm_any_lt(rsq21,rcutoff2))
1007             {
1008
1009             /* REACTION-FIELD ELECTROSTATICS */
1010             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
1011             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1012
1013             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1014
1015             /* Update potential sum for this i atom from the interaction with this j atom. */
1016             velec            = _mm_and_ps(velec,cutoff_mask);
1017             velec            = _mm_andnot_ps(dummy_mask,velec);
1018             velecsum         = _mm_add_ps(velecsum,velec);
1019
1020             fscal            = felec;
1021
1022             fscal            = _mm_and_ps(fscal,cutoff_mask);
1023
1024             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1025
1026              /* Update vectorial force */
1027             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1028             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1029             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1030
1031             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1032             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1033             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1034
1035             }
1036
1037             /**************************
1038              * CALCULATE INTERACTIONS *
1039              **************************/
1040
1041             if (gmx_mm_any_lt(rsq22,rcutoff2))
1042             {
1043
1044             /* REACTION-FIELD ELECTROSTATICS */
1045             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
1046             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1047
1048             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1049
1050             /* Update potential sum for this i atom from the interaction with this j atom. */
1051             velec            = _mm_and_ps(velec,cutoff_mask);
1052             velec            = _mm_andnot_ps(dummy_mask,velec);
1053             velecsum         = _mm_add_ps(velecsum,velec);
1054
1055             fscal            = felec;
1056
1057             fscal            = _mm_and_ps(fscal,cutoff_mask);
1058
1059             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1060
1061              /* Update vectorial force */
1062             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1063             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1064             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1065
1066             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1067             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1068             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1069
1070             }
1071
1072             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1073             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1074             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1075             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1076
1077             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1078                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1079
1080             /* Inner loop uses 386 flops */
1081         }
1082
1083         /* End of innermost loop */
1084
1085         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1086                                               f+i_coord_offset,fshift+i_shift_offset);
1087
1088         ggid                        = gid[iidx];
1089         /* Update potential energies */
1090         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1091         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1092
1093         /* Increment number of inner iterations */
1094         inneriter                  += j_index_end - j_index_start;
1095
1096         /* Outer loop uses 20 flops */
1097     }
1098
1099     /* Increment number of outer iterations */
1100     outeriter        += nri;
1101
1102     /* Update outer/inner flops */
1103
1104     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*386);
1105 }
1106 /*
1107  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1108  * Electrostatics interaction: ReactionField
1109  * VdW interaction:            LennardJones
1110  * Geometry:                   Water3-Water3
1111  * Calculate force/pot:        Force
1112  */
1113 void
1114 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1115                     (t_nblist                    * gmx_restrict       nlist,
1116                      rvec                        * gmx_restrict          xx,
1117                      rvec                        * gmx_restrict          ff,
1118                      struct t_forcerec           * gmx_restrict          fr,
1119                      t_mdatoms                   * gmx_restrict     mdatoms,
1120                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1121                      t_nrnb                      * gmx_restrict        nrnb)
1122 {
1123     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1124      * just 0 for non-waters.
1125      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1126      * jnr indices corresponding to data put in the four positions in the SIMD register.
1127      */
1128     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1129     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1130     int              jnrA,jnrB,jnrC,jnrD;
1131     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1132     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1133     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1134     real             rcutoff_scalar;
1135     real             *shiftvec,*fshift,*x,*f;
1136     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1137     real             scratch[4*DIM];
1138     __m128           fscal,rcutoff,rcutoff2,jidxall;
1139     int              vdwioffset0;
1140     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1141     int              vdwioffset1;
1142     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1143     int              vdwioffset2;
1144     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1145     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1146     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1147     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1148     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1149     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1150     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1151     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1152     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1153     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1154     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1155     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1156     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1157     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1158     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1159     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1160     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1161     real             *charge;
1162     int              nvdwtype;
1163     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1164     int              *vdwtype;
1165     real             *vdwparam;
1166     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1167     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1168     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1169     real             rswitch_scalar,d_scalar;
1170     __m128           dummy_mask,cutoff_mask;
1171     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1172     __m128           one     = _mm_set1_ps(1.0);
1173     __m128           two     = _mm_set1_ps(2.0);
1174     x                = xx[0];
1175     f                = ff[0];
1176
1177     nri              = nlist->nri;
1178     iinr             = nlist->iinr;
1179     jindex           = nlist->jindex;
1180     jjnr             = nlist->jjnr;
1181     shiftidx         = nlist->shift;
1182     gid              = nlist->gid;
1183     shiftvec         = fr->shift_vec[0];
1184     fshift           = fr->fshift[0];
1185     facel            = _mm_set1_ps(fr->ic->epsfac);
1186     charge           = mdatoms->chargeA;
1187     krf              = _mm_set1_ps(fr->ic->k_rf);
1188     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1189     crf              = _mm_set1_ps(fr->ic->c_rf);
1190     nvdwtype         = fr->ntype;
1191     vdwparam         = fr->nbfp;
1192     vdwtype          = mdatoms->typeA;
1193
1194     /* Setup water-specific parameters */
1195     inr              = nlist->iinr[0];
1196     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1197     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1198     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1199     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1200
1201     jq0              = _mm_set1_ps(charge[inr+0]);
1202     jq1              = _mm_set1_ps(charge[inr+1]);
1203     jq2              = _mm_set1_ps(charge[inr+2]);
1204     vdwjidx0A        = 2*vdwtype[inr+0];
1205     qq00             = _mm_mul_ps(iq0,jq0);
1206     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1207     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1208     qq01             = _mm_mul_ps(iq0,jq1);
1209     qq02             = _mm_mul_ps(iq0,jq2);
1210     qq10             = _mm_mul_ps(iq1,jq0);
1211     qq11             = _mm_mul_ps(iq1,jq1);
1212     qq12             = _mm_mul_ps(iq1,jq2);
1213     qq20             = _mm_mul_ps(iq2,jq0);
1214     qq21             = _mm_mul_ps(iq2,jq1);
1215     qq22             = _mm_mul_ps(iq2,jq2);
1216
1217     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1218     rcutoff_scalar   = fr->ic->rcoulomb;
1219     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1220     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1221
1222     rswitch_scalar   = fr->ic->rvdw_switch;
1223     rswitch          = _mm_set1_ps(rswitch_scalar);
1224     /* Setup switch parameters */
1225     d_scalar         = rcutoff_scalar-rswitch_scalar;
1226     d                = _mm_set1_ps(d_scalar);
1227     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1228     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1229     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1230     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1231     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1232     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1233
1234     /* Avoid stupid compiler warnings */
1235     jnrA = jnrB = jnrC = jnrD = 0;
1236     j_coord_offsetA = 0;
1237     j_coord_offsetB = 0;
1238     j_coord_offsetC = 0;
1239     j_coord_offsetD = 0;
1240
1241     outeriter        = 0;
1242     inneriter        = 0;
1243
1244     for(iidx=0;iidx<4*DIM;iidx++)
1245     {
1246         scratch[iidx] = 0.0;
1247     }
1248
1249     /* Start outer loop over neighborlists */
1250     for(iidx=0; iidx<nri; iidx++)
1251     {
1252         /* Load shift vector for this list */
1253         i_shift_offset   = DIM*shiftidx[iidx];
1254
1255         /* Load limits for loop over neighbors */
1256         j_index_start    = jindex[iidx];
1257         j_index_end      = jindex[iidx+1];
1258
1259         /* Get outer coordinate index */
1260         inr              = iinr[iidx];
1261         i_coord_offset   = DIM*inr;
1262
1263         /* Load i particle coords and add shift vector */
1264         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1265                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1266
1267         fix0             = _mm_setzero_ps();
1268         fiy0             = _mm_setzero_ps();
1269         fiz0             = _mm_setzero_ps();
1270         fix1             = _mm_setzero_ps();
1271         fiy1             = _mm_setzero_ps();
1272         fiz1             = _mm_setzero_ps();
1273         fix2             = _mm_setzero_ps();
1274         fiy2             = _mm_setzero_ps();
1275         fiz2             = _mm_setzero_ps();
1276
1277         /* Start inner kernel loop */
1278         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1279         {
1280
1281             /* Get j neighbor index, and coordinate index */
1282             jnrA             = jjnr[jidx];
1283             jnrB             = jjnr[jidx+1];
1284             jnrC             = jjnr[jidx+2];
1285             jnrD             = jjnr[jidx+3];
1286             j_coord_offsetA  = DIM*jnrA;
1287             j_coord_offsetB  = DIM*jnrB;
1288             j_coord_offsetC  = DIM*jnrC;
1289             j_coord_offsetD  = DIM*jnrD;
1290
1291             /* load j atom coordinates */
1292             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1293                                               x+j_coord_offsetC,x+j_coord_offsetD,
1294                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1295
1296             /* Calculate displacement vector */
1297             dx00             = _mm_sub_ps(ix0,jx0);
1298             dy00             = _mm_sub_ps(iy0,jy0);
1299             dz00             = _mm_sub_ps(iz0,jz0);
1300             dx01             = _mm_sub_ps(ix0,jx1);
1301             dy01             = _mm_sub_ps(iy0,jy1);
1302             dz01             = _mm_sub_ps(iz0,jz1);
1303             dx02             = _mm_sub_ps(ix0,jx2);
1304             dy02             = _mm_sub_ps(iy0,jy2);
1305             dz02             = _mm_sub_ps(iz0,jz2);
1306             dx10             = _mm_sub_ps(ix1,jx0);
1307             dy10             = _mm_sub_ps(iy1,jy0);
1308             dz10             = _mm_sub_ps(iz1,jz0);
1309             dx11             = _mm_sub_ps(ix1,jx1);
1310             dy11             = _mm_sub_ps(iy1,jy1);
1311             dz11             = _mm_sub_ps(iz1,jz1);
1312             dx12             = _mm_sub_ps(ix1,jx2);
1313             dy12             = _mm_sub_ps(iy1,jy2);
1314             dz12             = _mm_sub_ps(iz1,jz2);
1315             dx20             = _mm_sub_ps(ix2,jx0);
1316             dy20             = _mm_sub_ps(iy2,jy0);
1317             dz20             = _mm_sub_ps(iz2,jz0);
1318             dx21             = _mm_sub_ps(ix2,jx1);
1319             dy21             = _mm_sub_ps(iy2,jy1);
1320             dz21             = _mm_sub_ps(iz2,jz1);
1321             dx22             = _mm_sub_ps(ix2,jx2);
1322             dy22             = _mm_sub_ps(iy2,jy2);
1323             dz22             = _mm_sub_ps(iz2,jz2);
1324
1325             /* Calculate squared distance and things based on it */
1326             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1327             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1328             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1329             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1330             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1331             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1332             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1333             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1334             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1335
1336             rinv00           = avx128fma_invsqrt_f(rsq00);
1337             rinv01           = avx128fma_invsqrt_f(rsq01);
1338             rinv02           = avx128fma_invsqrt_f(rsq02);
1339             rinv10           = avx128fma_invsqrt_f(rsq10);
1340             rinv11           = avx128fma_invsqrt_f(rsq11);
1341             rinv12           = avx128fma_invsqrt_f(rsq12);
1342             rinv20           = avx128fma_invsqrt_f(rsq20);
1343             rinv21           = avx128fma_invsqrt_f(rsq21);
1344             rinv22           = avx128fma_invsqrt_f(rsq22);
1345
1346             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1347             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1348             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1349             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1350             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1351             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1352             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1353             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1354             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1355
1356             fjx0             = _mm_setzero_ps();
1357             fjy0             = _mm_setzero_ps();
1358             fjz0             = _mm_setzero_ps();
1359             fjx1             = _mm_setzero_ps();
1360             fjy1             = _mm_setzero_ps();
1361             fjz1             = _mm_setzero_ps();
1362             fjx2             = _mm_setzero_ps();
1363             fjy2             = _mm_setzero_ps();
1364             fjz2             = _mm_setzero_ps();
1365
1366             /**************************
1367              * CALCULATE INTERACTIONS *
1368              **************************/
1369
1370             if (gmx_mm_any_lt(rsq00,rcutoff2))
1371             {
1372
1373             r00              = _mm_mul_ps(rsq00,rinv00);
1374
1375             /* REACTION-FIELD ELECTROSTATICS */
1376             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
1377
1378             /* LENNARD-JONES DISPERSION/REPULSION */
1379
1380             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1381             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1382             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1383             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1384             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1385
1386             d                = _mm_sub_ps(r00,rswitch);
1387             d                = _mm_max_ps(d,_mm_setzero_ps());
1388             d2               = _mm_mul_ps(d,d);
1389             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1390
1391             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1392
1393             /* Evaluate switch function */
1394             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1395             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1396             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1397
1398             fscal            = _mm_add_ps(felec,fvdw);
1399
1400             fscal            = _mm_and_ps(fscal,cutoff_mask);
1401
1402              /* Update vectorial force */
1403             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1404             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1405             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1406
1407             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1408             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1409             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1410
1411             }
1412
1413             /**************************
1414              * CALCULATE INTERACTIONS *
1415              **************************/
1416
1417             if (gmx_mm_any_lt(rsq01,rcutoff2))
1418             {
1419
1420             /* REACTION-FIELD ELECTROSTATICS */
1421             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
1422
1423             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1424
1425             fscal            = felec;
1426
1427             fscal            = _mm_and_ps(fscal,cutoff_mask);
1428
1429              /* Update vectorial force */
1430             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1431             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1432             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1433
1434             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1435             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1436             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1437
1438             }
1439
1440             /**************************
1441              * CALCULATE INTERACTIONS *
1442              **************************/
1443
1444             if (gmx_mm_any_lt(rsq02,rcutoff2))
1445             {
1446
1447             /* REACTION-FIELD ELECTROSTATICS */
1448             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
1449
1450             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1451
1452             fscal            = felec;
1453
1454             fscal            = _mm_and_ps(fscal,cutoff_mask);
1455
1456              /* Update vectorial force */
1457             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1458             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1459             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1460
1461             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1462             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1463             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1464
1465             }
1466
1467             /**************************
1468              * CALCULATE INTERACTIONS *
1469              **************************/
1470
1471             if (gmx_mm_any_lt(rsq10,rcutoff2))
1472             {
1473
1474             /* REACTION-FIELD ELECTROSTATICS */
1475             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
1476
1477             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1478
1479             fscal            = felec;
1480
1481             fscal            = _mm_and_ps(fscal,cutoff_mask);
1482
1483              /* Update vectorial force */
1484             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1485             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1486             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1487
1488             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1489             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1490             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1491
1492             }
1493
1494             /**************************
1495              * CALCULATE INTERACTIONS *
1496              **************************/
1497
1498             if (gmx_mm_any_lt(rsq11,rcutoff2))
1499             {
1500
1501             /* REACTION-FIELD ELECTROSTATICS */
1502             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1503
1504             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1505
1506             fscal            = felec;
1507
1508             fscal            = _mm_and_ps(fscal,cutoff_mask);
1509
1510              /* Update vectorial force */
1511             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1512             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1513             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1514
1515             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1516             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1517             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1518
1519             }
1520
1521             /**************************
1522              * CALCULATE INTERACTIONS *
1523              **************************/
1524
1525             if (gmx_mm_any_lt(rsq12,rcutoff2))
1526             {
1527
1528             /* REACTION-FIELD ELECTROSTATICS */
1529             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1530
1531             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1532
1533             fscal            = felec;
1534
1535             fscal            = _mm_and_ps(fscal,cutoff_mask);
1536
1537              /* Update vectorial force */
1538             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1539             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1540             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1541
1542             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1543             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1544             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1545
1546             }
1547
1548             /**************************
1549              * CALCULATE INTERACTIONS *
1550              **************************/
1551
1552             if (gmx_mm_any_lt(rsq20,rcutoff2))
1553             {
1554
1555             /* REACTION-FIELD ELECTROSTATICS */
1556             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1557
1558             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1559
1560             fscal            = felec;
1561
1562             fscal            = _mm_and_ps(fscal,cutoff_mask);
1563
1564              /* Update vectorial force */
1565             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1566             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1567             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1568
1569             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1570             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1571             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1572
1573             }
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             if (gmx_mm_any_lt(rsq21,rcutoff2))
1580             {
1581
1582             /* REACTION-FIELD ELECTROSTATICS */
1583             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1584
1585             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1586
1587             fscal            = felec;
1588
1589             fscal            = _mm_and_ps(fscal,cutoff_mask);
1590
1591              /* Update vectorial force */
1592             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1593             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1594             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1595
1596             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1597             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1598             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1599
1600             }
1601
1602             /**************************
1603              * CALCULATE INTERACTIONS *
1604              **************************/
1605
1606             if (gmx_mm_any_lt(rsq22,rcutoff2))
1607             {
1608
1609             /* REACTION-FIELD ELECTROSTATICS */
1610             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1611
1612             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1613
1614             fscal            = felec;
1615
1616             fscal            = _mm_and_ps(fscal,cutoff_mask);
1617
1618              /* Update vectorial force */
1619             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1620             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1621             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1622
1623             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1624             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1625             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1626
1627             }
1628
1629             fjptrA             = f+j_coord_offsetA;
1630             fjptrB             = f+j_coord_offsetB;
1631             fjptrC             = f+j_coord_offsetC;
1632             fjptrD             = f+j_coord_offsetD;
1633
1634             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1635                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1636
1637             /* Inner loop uses 328 flops */
1638         }
1639
1640         if(jidx<j_index_end)
1641         {
1642
1643             /* Get j neighbor index, and coordinate index */
1644             jnrlistA         = jjnr[jidx];
1645             jnrlistB         = jjnr[jidx+1];
1646             jnrlistC         = jjnr[jidx+2];
1647             jnrlistD         = jjnr[jidx+3];
1648             /* Sign of each element will be negative for non-real atoms.
1649              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1650              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1651              */
1652             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1653             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1654             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1655             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1656             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1657             j_coord_offsetA  = DIM*jnrA;
1658             j_coord_offsetB  = DIM*jnrB;
1659             j_coord_offsetC  = DIM*jnrC;
1660             j_coord_offsetD  = DIM*jnrD;
1661
1662             /* load j atom coordinates */
1663             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1664                                               x+j_coord_offsetC,x+j_coord_offsetD,
1665                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1666
1667             /* Calculate displacement vector */
1668             dx00             = _mm_sub_ps(ix0,jx0);
1669             dy00             = _mm_sub_ps(iy0,jy0);
1670             dz00             = _mm_sub_ps(iz0,jz0);
1671             dx01             = _mm_sub_ps(ix0,jx1);
1672             dy01             = _mm_sub_ps(iy0,jy1);
1673             dz01             = _mm_sub_ps(iz0,jz1);
1674             dx02             = _mm_sub_ps(ix0,jx2);
1675             dy02             = _mm_sub_ps(iy0,jy2);
1676             dz02             = _mm_sub_ps(iz0,jz2);
1677             dx10             = _mm_sub_ps(ix1,jx0);
1678             dy10             = _mm_sub_ps(iy1,jy0);
1679             dz10             = _mm_sub_ps(iz1,jz0);
1680             dx11             = _mm_sub_ps(ix1,jx1);
1681             dy11             = _mm_sub_ps(iy1,jy1);
1682             dz11             = _mm_sub_ps(iz1,jz1);
1683             dx12             = _mm_sub_ps(ix1,jx2);
1684             dy12             = _mm_sub_ps(iy1,jy2);
1685             dz12             = _mm_sub_ps(iz1,jz2);
1686             dx20             = _mm_sub_ps(ix2,jx0);
1687             dy20             = _mm_sub_ps(iy2,jy0);
1688             dz20             = _mm_sub_ps(iz2,jz0);
1689             dx21             = _mm_sub_ps(ix2,jx1);
1690             dy21             = _mm_sub_ps(iy2,jy1);
1691             dz21             = _mm_sub_ps(iz2,jz1);
1692             dx22             = _mm_sub_ps(ix2,jx2);
1693             dy22             = _mm_sub_ps(iy2,jy2);
1694             dz22             = _mm_sub_ps(iz2,jz2);
1695
1696             /* Calculate squared distance and things based on it */
1697             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1698             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1699             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1700             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1701             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1702             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1703             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1704             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1705             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1706
1707             rinv00           = avx128fma_invsqrt_f(rsq00);
1708             rinv01           = avx128fma_invsqrt_f(rsq01);
1709             rinv02           = avx128fma_invsqrt_f(rsq02);
1710             rinv10           = avx128fma_invsqrt_f(rsq10);
1711             rinv11           = avx128fma_invsqrt_f(rsq11);
1712             rinv12           = avx128fma_invsqrt_f(rsq12);
1713             rinv20           = avx128fma_invsqrt_f(rsq20);
1714             rinv21           = avx128fma_invsqrt_f(rsq21);
1715             rinv22           = avx128fma_invsqrt_f(rsq22);
1716
1717             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1718             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1719             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1720             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1721             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1722             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1723             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1724             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1725             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1726
1727             fjx0             = _mm_setzero_ps();
1728             fjy0             = _mm_setzero_ps();
1729             fjz0             = _mm_setzero_ps();
1730             fjx1             = _mm_setzero_ps();
1731             fjy1             = _mm_setzero_ps();
1732             fjz1             = _mm_setzero_ps();
1733             fjx2             = _mm_setzero_ps();
1734             fjy2             = _mm_setzero_ps();
1735             fjz2             = _mm_setzero_ps();
1736
1737             /**************************
1738              * CALCULATE INTERACTIONS *
1739              **************************/
1740
1741             if (gmx_mm_any_lt(rsq00,rcutoff2))
1742             {
1743
1744             r00              = _mm_mul_ps(rsq00,rinv00);
1745             r00              = _mm_andnot_ps(dummy_mask,r00);
1746
1747             /* REACTION-FIELD ELECTROSTATICS */
1748             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
1749
1750             /* LENNARD-JONES DISPERSION/REPULSION */
1751
1752             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1753             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1754             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1755             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1756             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1757
1758             d                = _mm_sub_ps(r00,rswitch);
1759             d                = _mm_max_ps(d,_mm_setzero_ps());
1760             d2               = _mm_mul_ps(d,d);
1761             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1762
1763             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1764
1765             /* Evaluate switch function */
1766             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1767             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1768             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1769
1770             fscal            = _mm_add_ps(felec,fvdw);
1771
1772             fscal            = _mm_and_ps(fscal,cutoff_mask);
1773
1774             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1775
1776              /* Update vectorial force */
1777             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1778             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1779             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1780
1781             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1782             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1783             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1784
1785             }
1786
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             if (gmx_mm_any_lt(rsq01,rcutoff2))
1792             {
1793
1794             /* REACTION-FIELD ELECTROSTATICS */
1795             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
1796
1797             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1798
1799             fscal            = felec;
1800
1801             fscal            = _mm_and_ps(fscal,cutoff_mask);
1802
1803             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1804
1805              /* Update vectorial force */
1806             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1807             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1808             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1809
1810             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1811             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1812             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1813
1814             }
1815
1816             /**************************
1817              * CALCULATE INTERACTIONS *
1818              **************************/
1819
1820             if (gmx_mm_any_lt(rsq02,rcutoff2))
1821             {
1822
1823             /* REACTION-FIELD ELECTROSTATICS */
1824             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
1825
1826             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1827
1828             fscal            = felec;
1829
1830             fscal            = _mm_and_ps(fscal,cutoff_mask);
1831
1832             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1833
1834              /* Update vectorial force */
1835             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1836             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1837             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1838
1839             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1840             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1841             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1842
1843             }
1844
1845             /**************************
1846              * CALCULATE INTERACTIONS *
1847              **************************/
1848
1849             if (gmx_mm_any_lt(rsq10,rcutoff2))
1850             {
1851
1852             /* REACTION-FIELD ELECTROSTATICS */
1853             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
1854
1855             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1856
1857             fscal            = felec;
1858
1859             fscal            = _mm_and_ps(fscal,cutoff_mask);
1860
1861             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1862
1863              /* Update vectorial force */
1864             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1865             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1866             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1867
1868             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1869             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1870             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1871
1872             }
1873
1874             /**************************
1875              * CALCULATE INTERACTIONS *
1876              **************************/
1877
1878             if (gmx_mm_any_lt(rsq11,rcutoff2))
1879             {
1880
1881             /* REACTION-FIELD ELECTROSTATICS */
1882             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1883
1884             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1885
1886             fscal            = felec;
1887
1888             fscal            = _mm_and_ps(fscal,cutoff_mask);
1889
1890             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1891
1892              /* Update vectorial force */
1893             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1894             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1895             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1896
1897             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1898             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1899             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1900
1901             }
1902
1903             /**************************
1904              * CALCULATE INTERACTIONS *
1905              **************************/
1906
1907             if (gmx_mm_any_lt(rsq12,rcutoff2))
1908             {
1909
1910             /* REACTION-FIELD ELECTROSTATICS */
1911             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1912
1913             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1914
1915             fscal            = felec;
1916
1917             fscal            = _mm_and_ps(fscal,cutoff_mask);
1918
1919             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1920
1921              /* Update vectorial force */
1922             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1923             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1924             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1925
1926             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1927             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1928             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1929
1930             }
1931
1932             /**************************
1933              * CALCULATE INTERACTIONS *
1934              **************************/
1935
1936             if (gmx_mm_any_lt(rsq20,rcutoff2))
1937             {
1938
1939             /* REACTION-FIELD ELECTROSTATICS */
1940             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1941
1942             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1943
1944             fscal            = felec;
1945
1946             fscal            = _mm_and_ps(fscal,cutoff_mask);
1947
1948             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1949
1950              /* Update vectorial force */
1951             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1952             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1953             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1954
1955             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1956             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1957             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1958
1959             }
1960
1961             /**************************
1962              * CALCULATE INTERACTIONS *
1963              **************************/
1964
1965             if (gmx_mm_any_lt(rsq21,rcutoff2))
1966             {
1967
1968             /* REACTION-FIELD ELECTROSTATICS */
1969             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1970
1971             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1972
1973             fscal            = felec;
1974
1975             fscal            = _mm_and_ps(fscal,cutoff_mask);
1976
1977             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1978
1979              /* Update vectorial force */
1980             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1981             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1982             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1983
1984             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1985             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1986             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1987
1988             }
1989
1990             /**************************
1991              * CALCULATE INTERACTIONS *
1992              **************************/
1993
1994             if (gmx_mm_any_lt(rsq22,rcutoff2))
1995             {
1996
1997             /* REACTION-FIELD ELECTROSTATICS */
1998             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1999
2000             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2001
2002             fscal            = felec;
2003
2004             fscal            = _mm_and_ps(fscal,cutoff_mask);
2005
2006             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2007
2008              /* Update vectorial force */
2009             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2010             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2011             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2012
2013             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2014             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2015             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2016
2017             }
2018
2019             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2020             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2021             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2022             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2023
2024             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2025                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2026
2027             /* Inner loop uses 329 flops */
2028         }
2029
2030         /* End of innermost loop */
2031
2032         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2033                                               f+i_coord_offset,fshift+i_shift_offset);
2034
2035         /* Increment number of inner iterations */
2036         inneriter                  += j_index_end - j_index_start;
2037
2038         /* Outer loop uses 18 flops */
2039     }
2040
2041     /* Increment number of outer iterations */
2042     outeriter        += nri;
2043
2044     /* Update outer/inner flops */
2045
2046     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*329);
2047 }