Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwNone_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            None
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     __m128i          ewitab;
106     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
108     real             *ewtab;
109     __m128           dummy_mask,cutoff_mask;
110     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111     __m128           one     = _mm_set1_ps(1.0);
112     __m128           two     = _mm_set1_ps(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm_set1_ps(fr->ic->epsfac);
125     charge           = mdatoms->chargeA;
126
127     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
128     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
129     beta2            = _mm_mul_ps(beta,beta);
130     beta3            = _mm_mul_ps(beta,beta2);
131     ewtab            = fr->ic->tabq_coul_FDV0;
132     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
133     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
134
135     /* Setup water-specific parameters */
136     inr              = nlist->iinr[0];
137     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
138     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140
141     jq0              = _mm_set1_ps(charge[inr+0]);
142     jq1              = _mm_set1_ps(charge[inr+1]);
143     jq2              = _mm_set1_ps(charge[inr+2]);
144     qq00             = _mm_mul_ps(iq0,jq0);
145     qq01             = _mm_mul_ps(iq0,jq1);
146     qq02             = _mm_mul_ps(iq0,jq2);
147     qq10             = _mm_mul_ps(iq1,jq0);
148     qq11             = _mm_mul_ps(iq1,jq1);
149     qq12             = _mm_mul_ps(iq1,jq2);
150     qq20             = _mm_mul_ps(iq2,jq0);
151     qq21             = _mm_mul_ps(iq2,jq1);
152     qq22             = _mm_mul_ps(iq2,jq2);
153
154     /* Avoid stupid compiler warnings */
155     jnrA = jnrB = jnrC = jnrD = 0;
156     j_coord_offsetA = 0;
157     j_coord_offsetB = 0;
158     j_coord_offsetC = 0;
159     j_coord_offsetD = 0;
160
161     outeriter        = 0;
162     inneriter        = 0;
163
164     for(iidx=0;iidx<4*DIM;iidx++)
165     {
166         scratch[iidx] = 0.0;
167     }
168
169     /* Start outer loop over neighborlists */
170     for(iidx=0; iidx<nri; iidx++)
171     {
172         /* Load shift vector for this list */
173         i_shift_offset   = DIM*shiftidx[iidx];
174
175         /* Load limits for loop over neighbors */
176         j_index_start    = jindex[iidx];
177         j_index_end      = jindex[iidx+1];
178
179         /* Get outer coordinate index */
180         inr              = iinr[iidx];
181         i_coord_offset   = DIM*inr;
182
183         /* Load i particle coords and add shift vector */
184         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
185                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
186
187         fix0             = _mm_setzero_ps();
188         fiy0             = _mm_setzero_ps();
189         fiz0             = _mm_setzero_ps();
190         fix1             = _mm_setzero_ps();
191         fiy1             = _mm_setzero_ps();
192         fiz1             = _mm_setzero_ps();
193         fix2             = _mm_setzero_ps();
194         fiy2             = _mm_setzero_ps();
195         fiz2             = _mm_setzero_ps();
196
197         /* Reset potential sums */
198         velecsum         = _mm_setzero_ps();
199
200         /* Start inner kernel loop */
201         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
202         {
203
204             /* Get j neighbor index, and coordinate index */
205             jnrA             = jjnr[jidx];
206             jnrB             = jjnr[jidx+1];
207             jnrC             = jjnr[jidx+2];
208             jnrD             = jjnr[jidx+3];
209             j_coord_offsetA  = DIM*jnrA;
210             j_coord_offsetB  = DIM*jnrB;
211             j_coord_offsetC  = DIM*jnrC;
212             j_coord_offsetD  = DIM*jnrD;
213
214             /* load j atom coordinates */
215             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
216                                               x+j_coord_offsetC,x+j_coord_offsetD,
217                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
218
219             /* Calculate displacement vector */
220             dx00             = _mm_sub_ps(ix0,jx0);
221             dy00             = _mm_sub_ps(iy0,jy0);
222             dz00             = _mm_sub_ps(iz0,jz0);
223             dx01             = _mm_sub_ps(ix0,jx1);
224             dy01             = _mm_sub_ps(iy0,jy1);
225             dz01             = _mm_sub_ps(iz0,jz1);
226             dx02             = _mm_sub_ps(ix0,jx2);
227             dy02             = _mm_sub_ps(iy0,jy2);
228             dz02             = _mm_sub_ps(iz0,jz2);
229             dx10             = _mm_sub_ps(ix1,jx0);
230             dy10             = _mm_sub_ps(iy1,jy0);
231             dz10             = _mm_sub_ps(iz1,jz0);
232             dx11             = _mm_sub_ps(ix1,jx1);
233             dy11             = _mm_sub_ps(iy1,jy1);
234             dz11             = _mm_sub_ps(iz1,jz1);
235             dx12             = _mm_sub_ps(ix1,jx2);
236             dy12             = _mm_sub_ps(iy1,jy2);
237             dz12             = _mm_sub_ps(iz1,jz2);
238             dx20             = _mm_sub_ps(ix2,jx0);
239             dy20             = _mm_sub_ps(iy2,jy0);
240             dz20             = _mm_sub_ps(iz2,jz0);
241             dx21             = _mm_sub_ps(ix2,jx1);
242             dy21             = _mm_sub_ps(iy2,jy1);
243             dz21             = _mm_sub_ps(iz2,jz1);
244             dx22             = _mm_sub_ps(ix2,jx2);
245             dy22             = _mm_sub_ps(iy2,jy2);
246             dz22             = _mm_sub_ps(iz2,jz2);
247
248             /* Calculate squared distance and things based on it */
249             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
250             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
251             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
252             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
253             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
254             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
255             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
256             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
257             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
258
259             rinv00           = avx128fma_invsqrt_f(rsq00);
260             rinv01           = avx128fma_invsqrt_f(rsq01);
261             rinv02           = avx128fma_invsqrt_f(rsq02);
262             rinv10           = avx128fma_invsqrt_f(rsq10);
263             rinv11           = avx128fma_invsqrt_f(rsq11);
264             rinv12           = avx128fma_invsqrt_f(rsq12);
265             rinv20           = avx128fma_invsqrt_f(rsq20);
266             rinv21           = avx128fma_invsqrt_f(rsq21);
267             rinv22           = avx128fma_invsqrt_f(rsq22);
268
269             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
270             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
271             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
272             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
273             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
274             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
275             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
276             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
277             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
278
279             fjx0             = _mm_setzero_ps();
280             fjy0             = _mm_setzero_ps();
281             fjz0             = _mm_setzero_ps();
282             fjx1             = _mm_setzero_ps();
283             fjy1             = _mm_setzero_ps();
284             fjz1             = _mm_setzero_ps();
285             fjx2             = _mm_setzero_ps();
286             fjy2             = _mm_setzero_ps();
287             fjz2             = _mm_setzero_ps();
288
289             /**************************
290              * CALCULATE INTERACTIONS *
291              **************************/
292
293             r00              = _mm_mul_ps(rsq00,rinv00);
294
295             /* EWALD ELECTROSTATICS */
296
297             /* Analytical PME correction */
298             zeta2            = _mm_mul_ps(beta2,rsq00);
299             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
300             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
301             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
302             felec            = _mm_mul_ps(qq00,felec);
303             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
304             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
305             velec            = _mm_mul_ps(qq00,velec);
306
307             /* Update potential sum for this i atom from the interaction with this j atom. */
308             velecsum         = _mm_add_ps(velecsum,velec);
309
310             fscal            = felec;
311
312              /* Update vectorial force */
313             fix0             = _mm_macc_ps(dx00,fscal,fix0);
314             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
315             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
316
317             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
318             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
319             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
320
321             /**************************
322              * CALCULATE INTERACTIONS *
323              **************************/
324
325             r01              = _mm_mul_ps(rsq01,rinv01);
326
327             /* EWALD ELECTROSTATICS */
328
329             /* Analytical PME correction */
330             zeta2            = _mm_mul_ps(beta2,rsq01);
331             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
332             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
333             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
334             felec            = _mm_mul_ps(qq01,felec);
335             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
336             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
337             velec            = _mm_mul_ps(qq01,velec);
338
339             /* Update potential sum for this i atom from the interaction with this j atom. */
340             velecsum         = _mm_add_ps(velecsum,velec);
341
342             fscal            = felec;
343
344              /* Update vectorial force */
345             fix0             = _mm_macc_ps(dx01,fscal,fix0);
346             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
347             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
348
349             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
350             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
351             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
352
353             /**************************
354              * CALCULATE INTERACTIONS *
355              **************************/
356
357             r02              = _mm_mul_ps(rsq02,rinv02);
358
359             /* EWALD ELECTROSTATICS */
360
361             /* Analytical PME correction */
362             zeta2            = _mm_mul_ps(beta2,rsq02);
363             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
364             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
365             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
366             felec            = _mm_mul_ps(qq02,felec);
367             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
368             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
369             velec            = _mm_mul_ps(qq02,velec);
370
371             /* Update potential sum for this i atom from the interaction with this j atom. */
372             velecsum         = _mm_add_ps(velecsum,velec);
373
374             fscal            = felec;
375
376              /* Update vectorial force */
377             fix0             = _mm_macc_ps(dx02,fscal,fix0);
378             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
379             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
380
381             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
382             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
383             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
384
385             /**************************
386              * CALCULATE INTERACTIONS *
387              **************************/
388
389             r10              = _mm_mul_ps(rsq10,rinv10);
390
391             /* EWALD ELECTROSTATICS */
392
393             /* Analytical PME correction */
394             zeta2            = _mm_mul_ps(beta2,rsq10);
395             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
396             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
397             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
398             felec            = _mm_mul_ps(qq10,felec);
399             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
400             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
401             velec            = _mm_mul_ps(qq10,velec);
402
403             /* Update potential sum for this i atom from the interaction with this j atom. */
404             velecsum         = _mm_add_ps(velecsum,velec);
405
406             fscal            = felec;
407
408              /* Update vectorial force */
409             fix1             = _mm_macc_ps(dx10,fscal,fix1);
410             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
411             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
412
413             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
414             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
415             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
416
417             /**************************
418              * CALCULATE INTERACTIONS *
419              **************************/
420
421             r11              = _mm_mul_ps(rsq11,rinv11);
422
423             /* EWALD ELECTROSTATICS */
424
425             /* Analytical PME correction */
426             zeta2            = _mm_mul_ps(beta2,rsq11);
427             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
428             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
429             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
430             felec            = _mm_mul_ps(qq11,felec);
431             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
432             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
433             velec            = _mm_mul_ps(qq11,velec);
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velecsum         = _mm_add_ps(velecsum,velec);
437
438             fscal            = felec;
439
440              /* Update vectorial force */
441             fix1             = _mm_macc_ps(dx11,fscal,fix1);
442             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
443             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
444
445             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
446             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
447             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             r12              = _mm_mul_ps(rsq12,rinv12);
454
455             /* EWALD ELECTROSTATICS */
456
457             /* Analytical PME correction */
458             zeta2            = _mm_mul_ps(beta2,rsq12);
459             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
460             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
461             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
462             felec            = _mm_mul_ps(qq12,felec);
463             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
464             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
465             velec            = _mm_mul_ps(qq12,velec);
466
467             /* Update potential sum for this i atom from the interaction with this j atom. */
468             velecsum         = _mm_add_ps(velecsum,velec);
469
470             fscal            = felec;
471
472              /* Update vectorial force */
473             fix1             = _mm_macc_ps(dx12,fscal,fix1);
474             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
475             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
476
477             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
478             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
479             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
480
481             /**************************
482              * CALCULATE INTERACTIONS *
483              **************************/
484
485             r20              = _mm_mul_ps(rsq20,rinv20);
486
487             /* EWALD ELECTROSTATICS */
488
489             /* Analytical PME correction */
490             zeta2            = _mm_mul_ps(beta2,rsq20);
491             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
492             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
493             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
494             felec            = _mm_mul_ps(qq20,felec);
495             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
496             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
497             velec            = _mm_mul_ps(qq20,velec);
498
499             /* Update potential sum for this i atom from the interaction with this j atom. */
500             velecsum         = _mm_add_ps(velecsum,velec);
501
502             fscal            = felec;
503
504              /* Update vectorial force */
505             fix2             = _mm_macc_ps(dx20,fscal,fix2);
506             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
507             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
508
509             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
510             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
511             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             r21              = _mm_mul_ps(rsq21,rinv21);
518
519             /* EWALD ELECTROSTATICS */
520
521             /* Analytical PME correction */
522             zeta2            = _mm_mul_ps(beta2,rsq21);
523             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
524             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
525             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
526             felec            = _mm_mul_ps(qq21,felec);
527             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
528             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
529             velec            = _mm_mul_ps(qq21,velec);
530
531             /* Update potential sum for this i atom from the interaction with this j atom. */
532             velecsum         = _mm_add_ps(velecsum,velec);
533
534             fscal            = felec;
535
536              /* Update vectorial force */
537             fix2             = _mm_macc_ps(dx21,fscal,fix2);
538             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
539             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
540
541             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
542             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
543             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
544
545             /**************************
546              * CALCULATE INTERACTIONS *
547              **************************/
548
549             r22              = _mm_mul_ps(rsq22,rinv22);
550
551             /* EWALD ELECTROSTATICS */
552
553             /* Analytical PME correction */
554             zeta2            = _mm_mul_ps(beta2,rsq22);
555             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
556             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
557             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
558             felec            = _mm_mul_ps(qq22,felec);
559             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
560             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
561             velec            = _mm_mul_ps(qq22,velec);
562
563             /* Update potential sum for this i atom from the interaction with this j atom. */
564             velecsum         = _mm_add_ps(velecsum,velec);
565
566             fscal            = felec;
567
568              /* Update vectorial force */
569             fix2             = _mm_macc_ps(dx22,fscal,fix2);
570             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
571             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
572
573             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
574             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
575             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
576
577             fjptrA             = f+j_coord_offsetA;
578             fjptrB             = f+j_coord_offsetB;
579             fjptrC             = f+j_coord_offsetC;
580             fjptrD             = f+j_coord_offsetD;
581
582             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
583                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
584
585             /* Inner loop uses 261 flops */
586         }
587
588         if(jidx<j_index_end)
589         {
590
591             /* Get j neighbor index, and coordinate index */
592             jnrlistA         = jjnr[jidx];
593             jnrlistB         = jjnr[jidx+1];
594             jnrlistC         = jjnr[jidx+2];
595             jnrlistD         = jjnr[jidx+3];
596             /* Sign of each element will be negative for non-real atoms.
597              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
598              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
599              */
600             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
601             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
602             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
603             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
604             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
605             j_coord_offsetA  = DIM*jnrA;
606             j_coord_offsetB  = DIM*jnrB;
607             j_coord_offsetC  = DIM*jnrC;
608             j_coord_offsetD  = DIM*jnrD;
609
610             /* load j atom coordinates */
611             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
612                                               x+j_coord_offsetC,x+j_coord_offsetD,
613                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
614
615             /* Calculate displacement vector */
616             dx00             = _mm_sub_ps(ix0,jx0);
617             dy00             = _mm_sub_ps(iy0,jy0);
618             dz00             = _mm_sub_ps(iz0,jz0);
619             dx01             = _mm_sub_ps(ix0,jx1);
620             dy01             = _mm_sub_ps(iy0,jy1);
621             dz01             = _mm_sub_ps(iz0,jz1);
622             dx02             = _mm_sub_ps(ix0,jx2);
623             dy02             = _mm_sub_ps(iy0,jy2);
624             dz02             = _mm_sub_ps(iz0,jz2);
625             dx10             = _mm_sub_ps(ix1,jx0);
626             dy10             = _mm_sub_ps(iy1,jy0);
627             dz10             = _mm_sub_ps(iz1,jz0);
628             dx11             = _mm_sub_ps(ix1,jx1);
629             dy11             = _mm_sub_ps(iy1,jy1);
630             dz11             = _mm_sub_ps(iz1,jz1);
631             dx12             = _mm_sub_ps(ix1,jx2);
632             dy12             = _mm_sub_ps(iy1,jy2);
633             dz12             = _mm_sub_ps(iz1,jz2);
634             dx20             = _mm_sub_ps(ix2,jx0);
635             dy20             = _mm_sub_ps(iy2,jy0);
636             dz20             = _mm_sub_ps(iz2,jz0);
637             dx21             = _mm_sub_ps(ix2,jx1);
638             dy21             = _mm_sub_ps(iy2,jy1);
639             dz21             = _mm_sub_ps(iz2,jz1);
640             dx22             = _mm_sub_ps(ix2,jx2);
641             dy22             = _mm_sub_ps(iy2,jy2);
642             dz22             = _mm_sub_ps(iz2,jz2);
643
644             /* Calculate squared distance and things based on it */
645             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
646             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
647             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
648             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
649             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
650             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
651             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
652             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
653             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
654
655             rinv00           = avx128fma_invsqrt_f(rsq00);
656             rinv01           = avx128fma_invsqrt_f(rsq01);
657             rinv02           = avx128fma_invsqrt_f(rsq02);
658             rinv10           = avx128fma_invsqrt_f(rsq10);
659             rinv11           = avx128fma_invsqrt_f(rsq11);
660             rinv12           = avx128fma_invsqrt_f(rsq12);
661             rinv20           = avx128fma_invsqrt_f(rsq20);
662             rinv21           = avx128fma_invsqrt_f(rsq21);
663             rinv22           = avx128fma_invsqrt_f(rsq22);
664
665             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
666             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
667             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
668             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
669             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
670             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
671             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
672             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
673             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
674
675             fjx0             = _mm_setzero_ps();
676             fjy0             = _mm_setzero_ps();
677             fjz0             = _mm_setzero_ps();
678             fjx1             = _mm_setzero_ps();
679             fjy1             = _mm_setzero_ps();
680             fjz1             = _mm_setzero_ps();
681             fjx2             = _mm_setzero_ps();
682             fjy2             = _mm_setzero_ps();
683             fjz2             = _mm_setzero_ps();
684
685             /**************************
686              * CALCULATE INTERACTIONS *
687              **************************/
688
689             r00              = _mm_mul_ps(rsq00,rinv00);
690             r00              = _mm_andnot_ps(dummy_mask,r00);
691
692             /* EWALD ELECTROSTATICS */
693
694             /* Analytical PME correction */
695             zeta2            = _mm_mul_ps(beta2,rsq00);
696             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
697             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
698             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
699             felec            = _mm_mul_ps(qq00,felec);
700             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
701             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
702             velec            = _mm_mul_ps(qq00,velec);
703
704             /* Update potential sum for this i atom from the interaction with this j atom. */
705             velec            = _mm_andnot_ps(dummy_mask,velec);
706             velecsum         = _mm_add_ps(velecsum,velec);
707
708             fscal            = felec;
709
710             fscal            = _mm_andnot_ps(dummy_mask,fscal);
711
712              /* Update vectorial force */
713             fix0             = _mm_macc_ps(dx00,fscal,fix0);
714             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
715             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
716
717             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
718             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
719             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
720
721             /**************************
722              * CALCULATE INTERACTIONS *
723              **************************/
724
725             r01              = _mm_mul_ps(rsq01,rinv01);
726             r01              = _mm_andnot_ps(dummy_mask,r01);
727
728             /* EWALD ELECTROSTATICS */
729
730             /* Analytical PME correction */
731             zeta2            = _mm_mul_ps(beta2,rsq01);
732             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
733             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
734             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
735             felec            = _mm_mul_ps(qq01,felec);
736             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
737             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
738             velec            = _mm_mul_ps(qq01,velec);
739
740             /* Update potential sum for this i atom from the interaction with this j atom. */
741             velec            = _mm_andnot_ps(dummy_mask,velec);
742             velecsum         = _mm_add_ps(velecsum,velec);
743
744             fscal            = felec;
745
746             fscal            = _mm_andnot_ps(dummy_mask,fscal);
747
748              /* Update vectorial force */
749             fix0             = _mm_macc_ps(dx01,fscal,fix0);
750             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
751             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
752
753             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
754             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
755             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
756
757             /**************************
758              * CALCULATE INTERACTIONS *
759              **************************/
760
761             r02              = _mm_mul_ps(rsq02,rinv02);
762             r02              = _mm_andnot_ps(dummy_mask,r02);
763
764             /* EWALD ELECTROSTATICS */
765
766             /* Analytical PME correction */
767             zeta2            = _mm_mul_ps(beta2,rsq02);
768             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
769             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
770             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
771             felec            = _mm_mul_ps(qq02,felec);
772             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
773             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
774             velec            = _mm_mul_ps(qq02,velec);
775
776             /* Update potential sum for this i atom from the interaction with this j atom. */
777             velec            = _mm_andnot_ps(dummy_mask,velec);
778             velecsum         = _mm_add_ps(velecsum,velec);
779
780             fscal            = felec;
781
782             fscal            = _mm_andnot_ps(dummy_mask,fscal);
783
784              /* Update vectorial force */
785             fix0             = _mm_macc_ps(dx02,fscal,fix0);
786             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
787             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
788
789             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
790             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
791             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
792
793             /**************************
794              * CALCULATE INTERACTIONS *
795              **************************/
796
797             r10              = _mm_mul_ps(rsq10,rinv10);
798             r10              = _mm_andnot_ps(dummy_mask,r10);
799
800             /* EWALD ELECTROSTATICS */
801
802             /* Analytical PME correction */
803             zeta2            = _mm_mul_ps(beta2,rsq10);
804             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
805             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
806             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
807             felec            = _mm_mul_ps(qq10,felec);
808             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
809             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
810             velec            = _mm_mul_ps(qq10,velec);
811
812             /* Update potential sum for this i atom from the interaction with this j atom. */
813             velec            = _mm_andnot_ps(dummy_mask,velec);
814             velecsum         = _mm_add_ps(velecsum,velec);
815
816             fscal            = felec;
817
818             fscal            = _mm_andnot_ps(dummy_mask,fscal);
819
820              /* Update vectorial force */
821             fix1             = _mm_macc_ps(dx10,fscal,fix1);
822             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
823             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
824
825             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
826             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
827             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
828
829             /**************************
830              * CALCULATE INTERACTIONS *
831              **************************/
832
833             r11              = _mm_mul_ps(rsq11,rinv11);
834             r11              = _mm_andnot_ps(dummy_mask,r11);
835
836             /* EWALD ELECTROSTATICS */
837
838             /* Analytical PME correction */
839             zeta2            = _mm_mul_ps(beta2,rsq11);
840             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
841             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
842             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
843             felec            = _mm_mul_ps(qq11,felec);
844             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
845             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
846             velec            = _mm_mul_ps(qq11,velec);
847
848             /* Update potential sum for this i atom from the interaction with this j atom. */
849             velec            = _mm_andnot_ps(dummy_mask,velec);
850             velecsum         = _mm_add_ps(velecsum,velec);
851
852             fscal            = felec;
853
854             fscal            = _mm_andnot_ps(dummy_mask,fscal);
855
856              /* Update vectorial force */
857             fix1             = _mm_macc_ps(dx11,fscal,fix1);
858             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
859             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
860
861             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
862             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
863             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
864
865             /**************************
866              * CALCULATE INTERACTIONS *
867              **************************/
868
869             r12              = _mm_mul_ps(rsq12,rinv12);
870             r12              = _mm_andnot_ps(dummy_mask,r12);
871
872             /* EWALD ELECTROSTATICS */
873
874             /* Analytical PME correction */
875             zeta2            = _mm_mul_ps(beta2,rsq12);
876             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
877             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
878             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
879             felec            = _mm_mul_ps(qq12,felec);
880             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
881             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
882             velec            = _mm_mul_ps(qq12,velec);
883
884             /* Update potential sum for this i atom from the interaction with this j atom. */
885             velec            = _mm_andnot_ps(dummy_mask,velec);
886             velecsum         = _mm_add_ps(velecsum,velec);
887
888             fscal            = felec;
889
890             fscal            = _mm_andnot_ps(dummy_mask,fscal);
891
892              /* Update vectorial force */
893             fix1             = _mm_macc_ps(dx12,fscal,fix1);
894             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
895             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
896
897             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
898             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
899             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
900
901             /**************************
902              * CALCULATE INTERACTIONS *
903              **************************/
904
905             r20              = _mm_mul_ps(rsq20,rinv20);
906             r20              = _mm_andnot_ps(dummy_mask,r20);
907
908             /* EWALD ELECTROSTATICS */
909
910             /* Analytical PME correction */
911             zeta2            = _mm_mul_ps(beta2,rsq20);
912             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
913             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
914             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
915             felec            = _mm_mul_ps(qq20,felec);
916             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
917             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
918             velec            = _mm_mul_ps(qq20,velec);
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm_andnot_ps(dummy_mask,velec);
922             velecsum         = _mm_add_ps(velecsum,velec);
923
924             fscal            = felec;
925
926             fscal            = _mm_andnot_ps(dummy_mask,fscal);
927
928              /* Update vectorial force */
929             fix2             = _mm_macc_ps(dx20,fscal,fix2);
930             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
931             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
932
933             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
934             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
935             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
936
937             /**************************
938              * CALCULATE INTERACTIONS *
939              **************************/
940
941             r21              = _mm_mul_ps(rsq21,rinv21);
942             r21              = _mm_andnot_ps(dummy_mask,r21);
943
944             /* EWALD ELECTROSTATICS */
945
946             /* Analytical PME correction */
947             zeta2            = _mm_mul_ps(beta2,rsq21);
948             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
949             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
950             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
951             felec            = _mm_mul_ps(qq21,felec);
952             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
953             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
954             velec            = _mm_mul_ps(qq21,velec);
955
956             /* Update potential sum for this i atom from the interaction with this j atom. */
957             velec            = _mm_andnot_ps(dummy_mask,velec);
958             velecsum         = _mm_add_ps(velecsum,velec);
959
960             fscal            = felec;
961
962             fscal            = _mm_andnot_ps(dummy_mask,fscal);
963
964              /* Update vectorial force */
965             fix2             = _mm_macc_ps(dx21,fscal,fix2);
966             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
967             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
968
969             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
970             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
971             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
972
973             /**************************
974              * CALCULATE INTERACTIONS *
975              **************************/
976
977             r22              = _mm_mul_ps(rsq22,rinv22);
978             r22              = _mm_andnot_ps(dummy_mask,r22);
979
980             /* EWALD ELECTROSTATICS */
981
982             /* Analytical PME correction */
983             zeta2            = _mm_mul_ps(beta2,rsq22);
984             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
985             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
986             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
987             felec            = _mm_mul_ps(qq22,felec);
988             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
989             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
990             velec            = _mm_mul_ps(qq22,velec);
991
992             /* Update potential sum for this i atom from the interaction with this j atom. */
993             velec            = _mm_andnot_ps(dummy_mask,velec);
994             velecsum         = _mm_add_ps(velecsum,velec);
995
996             fscal            = felec;
997
998             fscal            = _mm_andnot_ps(dummy_mask,fscal);
999
1000              /* Update vectorial force */
1001             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1002             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1003             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1004
1005             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1006             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1007             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1008
1009             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1010             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1011             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1012             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1013
1014             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1015                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1016
1017             /* Inner loop uses 270 flops */
1018         }
1019
1020         /* End of innermost loop */
1021
1022         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1023                                               f+i_coord_offset,fshift+i_shift_offset);
1024
1025         ggid                        = gid[iidx];
1026         /* Update potential energies */
1027         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1028
1029         /* Increment number of inner iterations */
1030         inneriter                  += j_index_end - j_index_start;
1031
1032         /* Outer loop uses 19 flops */
1033     }
1034
1035     /* Increment number of outer iterations */
1036     outeriter        += nri;
1037
1038     /* Update outer/inner flops */
1039
1040     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*270);
1041 }
1042 /*
1043  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_single
1044  * Electrostatics interaction: Ewald
1045  * VdW interaction:            None
1046  * Geometry:                   Water3-Water3
1047  * Calculate force/pot:        Force
1048  */
1049 void
1050 nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_single
1051                     (t_nblist                    * gmx_restrict       nlist,
1052                      rvec                        * gmx_restrict          xx,
1053                      rvec                        * gmx_restrict          ff,
1054                      struct t_forcerec           * gmx_restrict          fr,
1055                      t_mdatoms                   * gmx_restrict     mdatoms,
1056                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1057                      t_nrnb                      * gmx_restrict        nrnb)
1058 {
1059     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1060      * just 0 for non-waters.
1061      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1062      * jnr indices corresponding to data put in the four positions in the SIMD register.
1063      */
1064     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1065     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1066     int              jnrA,jnrB,jnrC,jnrD;
1067     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1068     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1069     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1070     real             rcutoff_scalar;
1071     real             *shiftvec,*fshift,*x,*f;
1072     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1073     real             scratch[4*DIM];
1074     __m128           fscal,rcutoff,rcutoff2,jidxall;
1075     int              vdwioffset0;
1076     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1077     int              vdwioffset1;
1078     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1079     int              vdwioffset2;
1080     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1081     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1082     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1083     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1084     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1085     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1086     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1087     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1088     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1089     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1090     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1091     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1092     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1093     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1094     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1095     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1096     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1097     real             *charge;
1098     __m128i          ewitab;
1099     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1100     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1101     real             *ewtab;
1102     __m128           dummy_mask,cutoff_mask;
1103     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1104     __m128           one     = _mm_set1_ps(1.0);
1105     __m128           two     = _mm_set1_ps(2.0);
1106     x                = xx[0];
1107     f                = ff[0];
1108
1109     nri              = nlist->nri;
1110     iinr             = nlist->iinr;
1111     jindex           = nlist->jindex;
1112     jjnr             = nlist->jjnr;
1113     shiftidx         = nlist->shift;
1114     gid              = nlist->gid;
1115     shiftvec         = fr->shift_vec[0];
1116     fshift           = fr->fshift[0];
1117     facel            = _mm_set1_ps(fr->ic->epsfac);
1118     charge           = mdatoms->chargeA;
1119
1120     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1121     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1122     beta2            = _mm_mul_ps(beta,beta);
1123     beta3            = _mm_mul_ps(beta,beta2);
1124     ewtab            = fr->ic->tabq_coul_F;
1125     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1126     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1127
1128     /* Setup water-specific parameters */
1129     inr              = nlist->iinr[0];
1130     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1131     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1132     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1133
1134     jq0              = _mm_set1_ps(charge[inr+0]);
1135     jq1              = _mm_set1_ps(charge[inr+1]);
1136     jq2              = _mm_set1_ps(charge[inr+2]);
1137     qq00             = _mm_mul_ps(iq0,jq0);
1138     qq01             = _mm_mul_ps(iq0,jq1);
1139     qq02             = _mm_mul_ps(iq0,jq2);
1140     qq10             = _mm_mul_ps(iq1,jq0);
1141     qq11             = _mm_mul_ps(iq1,jq1);
1142     qq12             = _mm_mul_ps(iq1,jq2);
1143     qq20             = _mm_mul_ps(iq2,jq0);
1144     qq21             = _mm_mul_ps(iq2,jq1);
1145     qq22             = _mm_mul_ps(iq2,jq2);
1146
1147     /* Avoid stupid compiler warnings */
1148     jnrA = jnrB = jnrC = jnrD = 0;
1149     j_coord_offsetA = 0;
1150     j_coord_offsetB = 0;
1151     j_coord_offsetC = 0;
1152     j_coord_offsetD = 0;
1153
1154     outeriter        = 0;
1155     inneriter        = 0;
1156
1157     for(iidx=0;iidx<4*DIM;iidx++)
1158     {
1159         scratch[iidx] = 0.0;
1160     }
1161
1162     /* Start outer loop over neighborlists */
1163     for(iidx=0; iidx<nri; iidx++)
1164     {
1165         /* Load shift vector for this list */
1166         i_shift_offset   = DIM*shiftidx[iidx];
1167
1168         /* Load limits for loop over neighbors */
1169         j_index_start    = jindex[iidx];
1170         j_index_end      = jindex[iidx+1];
1171
1172         /* Get outer coordinate index */
1173         inr              = iinr[iidx];
1174         i_coord_offset   = DIM*inr;
1175
1176         /* Load i particle coords and add shift vector */
1177         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1178                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1179
1180         fix0             = _mm_setzero_ps();
1181         fiy0             = _mm_setzero_ps();
1182         fiz0             = _mm_setzero_ps();
1183         fix1             = _mm_setzero_ps();
1184         fiy1             = _mm_setzero_ps();
1185         fiz1             = _mm_setzero_ps();
1186         fix2             = _mm_setzero_ps();
1187         fiy2             = _mm_setzero_ps();
1188         fiz2             = _mm_setzero_ps();
1189
1190         /* Start inner kernel loop */
1191         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1192         {
1193
1194             /* Get j neighbor index, and coordinate index */
1195             jnrA             = jjnr[jidx];
1196             jnrB             = jjnr[jidx+1];
1197             jnrC             = jjnr[jidx+2];
1198             jnrD             = jjnr[jidx+3];
1199             j_coord_offsetA  = DIM*jnrA;
1200             j_coord_offsetB  = DIM*jnrB;
1201             j_coord_offsetC  = DIM*jnrC;
1202             j_coord_offsetD  = DIM*jnrD;
1203
1204             /* load j atom coordinates */
1205             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1206                                               x+j_coord_offsetC,x+j_coord_offsetD,
1207                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1208
1209             /* Calculate displacement vector */
1210             dx00             = _mm_sub_ps(ix0,jx0);
1211             dy00             = _mm_sub_ps(iy0,jy0);
1212             dz00             = _mm_sub_ps(iz0,jz0);
1213             dx01             = _mm_sub_ps(ix0,jx1);
1214             dy01             = _mm_sub_ps(iy0,jy1);
1215             dz01             = _mm_sub_ps(iz0,jz1);
1216             dx02             = _mm_sub_ps(ix0,jx2);
1217             dy02             = _mm_sub_ps(iy0,jy2);
1218             dz02             = _mm_sub_ps(iz0,jz2);
1219             dx10             = _mm_sub_ps(ix1,jx0);
1220             dy10             = _mm_sub_ps(iy1,jy0);
1221             dz10             = _mm_sub_ps(iz1,jz0);
1222             dx11             = _mm_sub_ps(ix1,jx1);
1223             dy11             = _mm_sub_ps(iy1,jy1);
1224             dz11             = _mm_sub_ps(iz1,jz1);
1225             dx12             = _mm_sub_ps(ix1,jx2);
1226             dy12             = _mm_sub_ps(iy1,jy2);
1227             dz12             = _mm_sub_ps(iz1,jz2);
1228             dx20             = _mm_sub_ps(ix2,jx0);
1229             dy20             = _mm_sub_ps(iy2,jy0);
1230             dz20             = _mm_sub_ps(iz2,jz0);
1231             dx21             = _mm_sub_ps(ix2,jx1);
1232             dy21             = _mm_sub_ps(iy2,jy1);
1233             dz21             = _mm_sub_ps(iz2,jz1);
1234             dx22             = _mm_sub_ps(ix2,jx2);
1235             dy22             = _mm_sub_ps(iy2,jy2);
1236             dz22             = _mm_sub_ps(iz2,jz2);
1237
1238             /* Calculate squared distance and things based on it */
1239             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1240             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1241             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1242             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1243             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1244             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1245             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1246             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1247             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1248
1249             rinv00           = avx128fma_invsqrt_f(rsq00);
1250             rinv01           = avx128fma_invsqrt_f(rsq01);
1251             rinv02           = avx128fma_invsqrt_f(rsq02);
1252             rinv10           = avx128fma_invsqrt_f(rsq10);
1253             rinv11           = avx128fma_invsqrt_f(rsq11);
1254             rinv12           = avx128fma_invsqrt_f(rsq12);
1255             rinv20           = avx128fma_invsqrt_f(rsq20);
1256             rinv21           = avx128fma_invsqrt_f(rsq21);
1257             rinv22           = avx128fma_invsqrt_f(rsq22);
1258
1259             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1260             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1261             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1262             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1263             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1264             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1265             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1266             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1267             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1268
1269             fjx0             = _mm_setzero_ps();
1270             fjy0             = _mm_setzero_ps();
1271             fjz0             = _mm_setzero_ps();
1272             fjx1             = _mm_setzero_ps();
1273             fjy1             = _mm_setzero_ps();
1274             fjz1             = _mm_setzero_ps();
1275             fjx2             = _mm_setzero_ps();
1276             fjy2             = _mm_setzero_ps();
1277             fjz2             = _mm_setzero_ps();
1278
1279             /**************************
1280              * CALCULATE INTERACTIONS *
1281              **************************/
1282
1283             r00              = _mm_mul_ps(rsq00,rinv00);
1284
1285             /* EWALD ELECTROSTATICS */
1286
1287             /* Analytical PME correction */
1288             zeta2            = _mm_mul_ps(beta2,rsq00);
1289             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1290             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1291             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1292             felec            = _mm_mul_ps(qq00,felec);
1293
1294             fscal            = felec;
1295
1296              /* Update vectorial force */
1297             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1298             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1299             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1300
1301             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1302             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1303             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1304
1305             /**************************
1306              * CALCULATE INTERACTIONS *
1307              **************************/
1308
1309             r01              = _mm_mul_ps(rsq01,rinv01);
1310
1311             /* EWALD ELECTROSTATICS */
1312
1313             /* Analytical PME correction */
1314             zeta2            = _mm_mul_ps(beta2,rsq01);
1315             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1316             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1317             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1318             felec            = _mm_mul_ps(qq01,felec);
1319
1320             fscal            = felec;
1321
1322              /* Update vectorial force */
1323             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1324             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1325             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1326
1327             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1328             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1329             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1330
1331             /**************************
1332              * CALCULATE INTERACTIONS *
1333              **************************/
1334
1335             r02              = _mm_mul_ps(rsq02,rinv02);
1336
1337             /* EWALD ELECTROSTATICS */
1338
1339             /* Analytical PME correction */
1340             zeta2            = _mm_mul_ps(beta2,rsq02);
1341             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1342             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1343             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1344             felec            = _mm_mul_ps(qq02,felec);
1345
1346             fscal            = felec;
1347
1348              /* Update vectorial force */
1349             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1350             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1351             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1352
1353             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1354             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1355             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1356
1357             /**************************
1358              * CALCULATE INTERACTIONS *
1359              **************************/
1360
1361             r10              = _mm_mul_ps(rsq10,rinv10);
1362
1363             /* EWALD ELECTROSTATICS */
1364
1365             /* Analytical PME correction */
1366             zeta2            = _mm_mul_ps(beta2,rsq10);
1367             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1368             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1369             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1370             felec            = _mm_mul_ps(qq10,felec);
1371
1372             fscal            = felec;
1373
1374              /* Update vectorial force */
1375             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1376             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1377             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1378
1379             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1380             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1381             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1382
1383             /**************************
1384              * CALCULATE INTERACTIONS *
1385              **************************/
1386
1387             r11              = _mm_mul_ps(rsq11,rinv11);
1388
1389             /* EWALD ELECTROSTATICS */
1390
1391             /* Analytical PME correction */
1392             zeta2            = _mm_mul_ps(beta2,rsq11);
1393             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1394             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1395             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1396             felec            = _mm_mul_ps(qq11,felec);
1397
1398             fscal            = felec;
1399
1400              /* Update vectorial force */
1401             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1402             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1403             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1404
1405             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1406             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1407             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1408
1409             /**************************
1410              * CALCULATE INTERACTIONS *
1411              **************************/
1412
1413             r12              = _mm_mul_ps(rsq12,rinv12);
1414
1415             /* EWALD ELECTROSTATICS */
1416
1417             /* Analytical PME correction */
1418             zeta2            = _mm_mul_ps(beta2,rsq12);
1419             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1420             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1421             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1422             felec            = _mm_mul_ps(qq12,felec);
1423
1424             fscal            = felec;
1425
1426              /* Update vectorial force */
1427             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1428             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1429             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1430
1431             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1432             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1433             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1434
1435             /**************************
1436              * CALCULATE INTERACTIONS *
1437              **************************/
1438
1439             r20              = _mm_mul_ps(rsq20,rinv20);
1440
1441             /* EWALD ELECTROSTATICS */
1442
1443             /* Analytical PME correction */
1444             zeta2            = _mm_mul_ps(beta2,rsq20);
1445             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1446             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1447             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1448             felec            = _mm_mul_ps(qq20,felec);
1449
1450             fscal            = felec;
1451
1452              /* Update vectorial force */
1453             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1454             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1455             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1456
1457             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1458             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1459             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1460
1461             /**************************
1462              * CALCULATE INTERACTIONS *
1463              **************************/
1464
1465             r21              = _mm_mul_ps(rsq21,rinv21);
1466
1467             /* EWALD ELECTROSTATICS */
1468
1469             /* Analytical PME correction */
1470             zeta2            = _mm_mul_ps(beta2,rsq21);
1471             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1472             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1473             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1474             felec            = _mm_mul_ps(qq21,felec);
1475
1476             fscal            = felec;
1477
1478              /* Update vectorial force */
1479             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1480             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1481             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1482
1483             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1484             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1485             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1486
1487             /**************************
1488              * CALCULATE INTERACTIONS *
1489              **************************/
1490
1491             r22              = _mm_mul_ps(rsq22,rinv22);
1492
1493             /* EWALD ELECTROSTATICS */
1494
1495             /* Analytical PME correction */
1496             zeta2            = _mm_mul_ps(beta2,rsq22);
1497             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1498             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1499             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1500             felec            = _mm_mul_ps(qq22,felec);
1501
1502             fscal            = felec;
1503
1504              /* Update vectorial force */
1505             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1506             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1507             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1508
1509             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1510             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1511             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1512
1513             fjptrA             = f+j_coord_offsetA;
1514             fjptrB             = f+j_coord_offsetB;
1515             fjptrC             = f+j_coord_offsetC;
1516             fjptrD             = f+j_coord_offsetD;
1517
1518             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1519                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1520
1521             /* Inner loop uses 252 flops */
1522         }
1523
1524         if(jidx<j_index_end)
1525         {
1526
1527             /* Get j neighbor index, and coordinate index */
1528             jnrlistA         = jjnr[jidx];
1529             jnrlistB         = jjnr[jidx+1];
1530             jnrlistC         = jjnr[jidx+2];
1531             jnrlistD         = jjnr[jidx+3];
1532             /* Sign of each element will be negative for non-real atoms.
1533              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1534              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1535              */
1536             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1537             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1538             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1539             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1540             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1541             j_coord_offsetA  = DIM*jnrA;
1542             j_coord_offsetB  = DIM*jnrB;
1543             j_coord_offsetC  = DIM*jnrC;
1544             j_coord_offsetD  = DIM*jnrD;
1545
1546             /* load j atom coordinates */
1547             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1548                                               x+j_coord_offsetC,x+j_coord_offsetD,
1549                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1550
1551             /* Calculate displacement vector */
1552             dx00             = _mm_sub_ps(ix0,jx0);
1553             dy00             = _mm_sub_ps(iy0,jy0);
1554             dz00             = _mm_sub_ps(iz0,jz0);
1555             dx01             = _mm_sub_ps(ix0,jx1);
1556             dy01             = _mm_sub_ps(iy0,jy1);
1557             dz01             = _mm_sub_ps(iz0,jz1);
1558             dx02             = _mm_sub_ps(ix0,jx2);
1559             dy02             = _mm_sub_ps(iy0,jy2);
1560             dz02             = _mm_sub_ps(iz0,jz2);
1561             dx10             = _mm_sub_ps(ix1,jx0);
1562             dy10             = _mm_sub_ps(iy1,jy0);
1563             dz10             = _mm_sub_ps(iz1,jz0);
1564             dx11             = _mm_sub_ps(ix1,jx1);
1565             dy11             = _mm_sub_ps(iy1,jy1);
1566             dz11             = _mm_sub_ps(iz1,jz1);
1567             dx12             = _mm_sub_ps(ix1,jx2);
1568             dy12             = _mm_sub_ps(iy1,jy2);
1569             dz12             = _mm_sub_ps(iz1,jz2);
1570             dx20             = _mm_sub_ps(ix2,jx0);
1571             dy20             = _mm_sub_ps(iy2,jy0);
1572             dz20             = _mm_sub_ps(iz2,jz0);
1573             dx21             = _mm_sub_ps(ix2,jx1);
1574             dy21             = _mm_sub_ps(iy2,jy1);
1575             dz21             = _mm_sub_ps(iz2,jz1);
1576             dx22             = _mm_sub_ps(ix2,jx2);
1577             dy22             = _mm_sub_ps(iy2,jy2);
1578             dz22             = _mm_sub_ps(iz2,jz2);
1579
1580             /* Calculate squared distance and things based on it */
1581             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1582             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1583             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1584             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1585             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1586             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1587             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1588             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1589             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1590
1591             rinv00           = avx128fma_invsqrt_f(rsq00);
1592             rinv01           = avx128fma_invsqrt_f(rsq01);
1593             rinv02           = avx128fma_invsqrt_f(rsq02);
1594             rinv10           = avx128fma_invsqrt_f(rsq10);
1595             rinv11           = avx128fma_invsqrt_f(rsq11);
1596             rinv12           = avx128fma_invsqrt_f(rsq12);
1597             rinv20           = avx128fma_invsqrt_f(rsq20);
1598             rinv21           = avx128fma_invsqrt_f(rsq21);
1599             rinv22           = avx128fma_invsqrt_f(rsq22);
1600
1601             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1602             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1603             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1604             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1605             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1606             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1607             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1608             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1609             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1610
1611             fjx0             = _mm_setzero_ps();
1612             fjy0             = _mm_setzero_ps();
1613             fjz0             = _mm_setzero_ps();
1614             fjx1             = _mm_setzero_ps();
1615             fjy1             = _mm_setzero_ps();
1616             fjz1             = _mm_setzero_ps();
1617             fjx2             = _mm_setzero_ps();
1618             fjy2             = _mm_setzero_ps();
1619             fjz2             = _mm_setzero_ps();
1620
1621             /**************************
1622              * CALCULATE INTERACTIONS *
1623              **************************/
1624
1625             r00              = _mm_mul_ps(rsq00,rinv00);
1626             r00              = _mm_andnot_ps(dummy_mask,r00);
1627
1628             /* EWALD ELECTROSTATICS */
1629
1630             /* Analytical PME correction */
1631             zeta2            = _mm_mul_ps(beta2,rsq00);
1632             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1633             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1634             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1635             felec            = _mm_mul_ps(qq00,felec);
1636
1637             fscal            = felec;
1638
1639             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1640
1641              /* Update vectorial force */
1642             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1643             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1644             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1645
1646             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1647             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1648             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1649
1650             /**************************
1651              * CALCULATE INTERACTIONS *
1652              **************************/
1653
1654             r01              = _mm_mul_ps(rsq01,rinv01);
1655             r01              = _mm_andnot_ps(dummy_mask,r01);
1656
1657             /* EWALD ELECTROSTATICS */
1658
1659             /* Analytical PME correction */
1660             zeta2            = _mm_mul_ps(beta2,rsq01);
1661             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1662             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1663             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1664             felec            = _mm_mul_ps(qq01,felec);
1665
1666             fscal            = felec;
1667
1668             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1669
1670              /* Update vectorial force */
1671             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1672             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1673             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1674
1675             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1676             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1677             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1678
1679             /**************************
1680              * CALCULATE INTERACTIONS *
1681              **************************/
1682
1683             r02              = _mm_mul_ps(rsq02,rinv02);
1684             r02              = _mm_andnot_ps(dummy_mask,r02);
1685
1686             /* EWALD ELECTROSTATICS */
1687
1688             /* Analytical PME correction */
1689             zeta2            = _mm_mul_ps(beta2,rsq02);
1690             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1691             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1692             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1693             felec            = _mm_mul_ps(qq02,felec);
1694
1695             fscal            = felec;
1696
1697             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1698
1699              /* Update vectorial force */
1700             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1701             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1702             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1703
1704             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1705             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1706             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1707
1708             /**************************
1709              * CALCULATE INTERACTIONS *
1710              **************************/
1711
1712             r10              = _mm_mul_ps(rsq10,rinv10);
1713             r10              = _mm_andnot_ps(dummy_mask,r10);
1714
1715             /* EWALD ELECTROSTATICS */
1716
1717             /* Analytical PME correction */
1718             zeta2            = _mm_mul_ps(beta2,rsq10);
1719             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1720             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1721             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1722             felec            = _mm_mul_ps(qq10,felec);
1723
1724             fscal            = felec;
1725
1726             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1727
1728              /* Update vectorial force */
1729             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1730             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1731             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1732
1733             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1734             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1735             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1736
1737             /**************************
1738              * CALCULATE INTERACTIONS *
1739              **************************/
1740
1741             r11              = _mm_mul_ps(rsq11,rinv11);
1742             r11              = _mm_andnot_ps(dummy_mask,r11);
1743
1744             /* EWALD ELECTROSTATICS */
1745
1746             /* Analytical PME correction */
1747             zeta2            = _mm_mul_ps(beta2,rsq11);
1748             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1749             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1750             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1751             felec            = _mm_mul_ps(qq11,felec);
1752
1753             fscal            = felec;
1754
1755             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1756
1757              /* Update vectorial force */
1758             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1759             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1760             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1761
1762             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1763             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1764             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1765
1766             /**************************
1767              * CALCULATE INTERACTIONS *
1768              **************************/
1769
1770             r12              = _mm_mul_ps(rsq12,rinv12);
1771             r12              = _mm_andnot_ps(dummy_mask,r12);
1772
1773             /* EWALD ELECTROSTATICS */
1774
1775             /* Analytical PME correction */
1776             zeta2            = _mm_mul_ps(beta2,rsq12);
1777             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1778             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1779             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1780             felec            = _mm_mul_ps(qq12,felec);
1781
1782             fscal            = felec;
1783
1784             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1785
1786              /* Update vectorial force */
1787             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1788             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1789             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1790
1791             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1792             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1793             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1794
1795             /**************************
1796              * CALCULATE INTERACTIONS *
1797              **************************/
1798
1799             r20              = _mm_mul_ps(rsq20,rinv20);
1800             r20              = _mm_andnot_ps(dummy_mask,r20);
1801
1802             /* EWALD ELECTROSTATICS */
1803
1804             /* Analytical PME correction */
1805             zeta2            = _mm_mul_ps(beta2,rsq20);
1806             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1807             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1808             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1809             felec            = _mm_mul_ps(qq20,felec);
1810
1811             fscal            = felec;
1812
1813             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1814
1815              /* Update vectorial force */
1816             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1817             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1818             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1819
1820             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1821             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1822             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1823
1824             /**************************
1825              * CALCULATE INTERACTIONS *
1826              **************************/
1827
1828             r21              = _mm_mul_ps(rsq21,rinv21);
1829             r21              = _mm_andnot_ps(dummy_mask,r21);
1830
1831             /* EWALD ELECTROSTATICS */
1832
1833             /* Analytical PME correction */
1834             zeta2            = _mm_mul_ps(beta2,rsq21);
1835             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1836             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1837             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1838             felec            = _mm_mul_ps(qq21,felec);
1839
1840             fscal            = felec;
1841
1842             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1843
1844              /* Update vectorial force */
1845             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1846             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1847             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1848
1849             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1850             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1851             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1852
1853             /**************************
1854              * CALCULATE INTERACTIONS *
1855              **************************/
1856
1857             r22              = _mm_mul_ps(rsq22,rinv22);
1858             r22              = _mm_andnot_ps(dummy_mask,r22);
1859
1860             /* EWALD ELECTROSTATICS */
1861
1862             /* Analytical PME correction */
1863             zeta2            = _mm_mul_ps(beta2,rsq22);
1864             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1865             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1866             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1867             felec            = _mm_mul_ps(qq22,felec);
1868
1869             fscal            = felec;
1870
1871             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1872
1873              /* Update vectorial force */
1874             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1875             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1876             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1877
1878             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1879             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1880             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1881
1882             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1883             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1884             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1885             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1886
1887             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1888                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1889
1890             /* Inner loop uses 261 flops */
1891         }
1892
1893         /* End of innermost loop */
1894
1895         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1896                                               f+i_coord_offset,fshift+i_shift_offset);
1897
1898         /* Increment number of inner iterations */
1899         inneriter                  += j_index_end - j_index_start;
1900
1901         /* Outer loop uses 18 flops */
1902     }
1903
1904     /* Increment number of outer iterations */
1905     outeriter        += nri;
1906
1907     /* Update outer/inner flops */
1908
1909     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*261);
1910 }