Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
51  * Electrostatics interaction: Coulomb
52  * VdW interaction:            CubicSplineTable
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
110     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
111     __m128i          vfitab;
112     __m128i          ifour       = _mm_set1_epi32(4);
113     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
114     real             *vftab;
115     __m128           dummy_mask,cutoff_mask;
116     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117     __m128           one     = _mm_set1_ps(1.0);
118     __m128           two     = _mm_set1_ps(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm_set1_ps(fr->ic->epsfac);
131     charge           = mdatoms->chargeA;
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     vftab            = kernel_data->table_vdw->data;
137     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
142     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
143     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
144     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm_set1_ps(charge[inr+0]);
147     jq1              = _mm_set1_ps(charge[inr+1]);
148     jq2              = _mm_set1_ps(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm_mul_ps(iq0,jq0);
151     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
152     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
153     qq01             = _mm_mul_ps(iq0,jq1);
154     qq02             = _mm_mul_ps(iq0,jq2);
155     qq10             = _mm_mul_ps(iq1,jq0);
156     qq11             = _mm_mul_ps(iq1,jq1);
157     qq12             = _mm_mul_ps(iq1,jq2);
158     qq20             = _mm_mul_ps(iq2,jq0);
159     qq21             = _mm_mul_ps(iq2,jq1);
160     qq22             = _mm_mul_ps(iq2,jq2);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = jnrC = jnrD = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166     j_coord_offsetC = 0;
167     j_coord_offsetD = 0;
168
169     outeriter        = 0;
170     inneriter        = 0;
171
172     for(iidx=0;iidx<4*DIM;iidx++)
173     {
174         scratch[iidx] = 0.0;
175     }
176
177     /* Start outer loop over neighborlists */
178     for(iidx=0; iidx<nri; iidx++)
179     {
180         /* Load shift vector for this list */
181         i_shift_offset   = DIM*shiftidx[iidx];
182
183         /* Load limits for loop over neighbors */
184         j_index_start    = jindex[iidx];
185         j_index_end      = jindex[iidx+1];
186
187         /* Get outer coordinate index */
188         inr              = iinr[iidx];
189         i_coord_offset   = DIM*inr;
190
191         /* Load i particle coords and add shift vector */
192         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
194
195         fix0             = _mm_setzero_ps();
196         fiy0             = _mm_setzero_ps();
197         fiz0             = _mm_setzero_ps();
198         fix1             = _mm_setzero_ps();
199         fiy1             = _mm_setzero_ps();
200         fiz1             = _mm_setzero_ps();
201         fix2             = _mm_setzero_ps();
202         fiy2             = _mm_setzero_ps();
203         fiz2             = _mm_setzero_ps();
204
205         /* Reset potential sums */
206         velecsum         = _mm_setzero_ps();
207         vvdwsum          = _mm_setzero_ps();
208
209         /* Start inner kernel loop */
210         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
211         {
212
213             /* Get j neighbor index, and coordinate index */
214             jnrA             = jjnr[jidx];
215             jnrB             = jjnr[jidx+1];
216             jnrC             = jjnr[jidx+2];
217             jnrD             = jjnr[jidx+3];
218             j_coord_offsetA  = DIM*jnrA;
219             j_coord_offsetB  = DIM*jnrB;
220             j_coord_offsetC  = DIM*jnrC;
221             j_coord_offsetD  = DIM*jnrD;
222
223             /* load j atom coordinates */
224             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
225                                               x+j_coord_offsetC,x+j_coord_offsetD,
226                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
227
228             /* Calculate displacement vector */
229             dx00             = _mm_sub_ps(ix0,jx0);
230             dy00             = _mm_sub_ps(iy0,jy0);
231             dz00             = _mm_sub_ps(iz0,jz0);
232             dx01             = _mm_sub_ps(ix0,jx1);
233             dy01             = _mm_sub_ps(iy0,jy1);
234             dz01             = _mm_sub_ps(iz0,jz1);
235             dx02             = _mm_sub_ps(ix0,jx2);
236             dy02             = _mm_sub_ps(iy0,jy2);
237             dz02             = _mm_sub_ps(iz0,jz2);
238             dx10             = _mm_sub_ps(ix1,jx0);
239             dy10             = _mm_sub_ps(iy1,jy0);
240             dz10             = _mm_sub_ps(iz1,jz0);
241             dx11             = _mm_sub_ps(ix1,jx1);
242             dy11             = _mm_sub_ps(iy1,jy1);
243             dz11             = _mm_sub_ps(iz1,jz1);
244             dx12             = _mm_sub_ps(ix1,jx2);
245             dy12             = _mm_sub_ps(iy1,jy2);
246             dz12             = _mm_sub_ps(iz1,jz2);
247             dx20             = _mm_sub_ps(ix2,jx0);
248             dy20             = _mm_sub_ps(iy2,jy0);
249             dz20             = _mm_sub_ps(iz2,jz0);
250             dx21             = _mm_sub_ps(ix2,jx1);
251             dy21             = _mm_sub_ps(iy2,jy1);
252             dz21             = _mm_sub_ps(iz2,jz1);
253             dx22             = _mm_sub_ps(ix2,jx2);
254             dy22             = _mm_sub_ps(iy2,jy2);
255             dz22             = _mm_sub_ps(iz2,jz2);
256
257             /* Calculate squared distance and things based on it */
258             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
259             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
260             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
261             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
262             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
263             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
264             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
265             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
266             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
267
268             rinv00           = avx128fma_invsqrt_f(rsq00);
269             rinv01           = avx128fma_invsqrt_f(rsq01);
270             rinv02           = avx128fma_invsqrt_f(rsq02);
271             rinv10           = avx128fma_invsqrt_f(rsq10);
272             rinv11           = avx128fma_invsqrt_f(rsq11);
273             rinv12           = avx128fma_invsqrt_f(rsq12);
274             rinv20           = avx128fma_invsqrt_f(rsq20);
275             rinv21           = avx128fma_invsqrt_f(rsq21);
276             rinv22           = avx128fma_invsqrt_f(rsq22);
277
278             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
279             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
280             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
281             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
282             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
283             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
284             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
285             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
286             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
287
288             fjx0             = _mm_setzero_ps();
289             fjy0             = _mm_setzero_ps();
290             fjz0             = _mm_setzero_ps();
291             fjx1             = _mm_setzero_ps();
292             fjy1             = _mm_setzero_ps();
293             fjz1             = _mm_setzero_ps();
294             fjx2             = _mm_setzero_ps();
295             fjy2             = _mm_setzero_ps();
296             fjz2             = _mm_setzero_ps();
297
298             /**************************
299              * CALCULATE INTERACTIONS *
300              **************************/
301
302             r00              = _mm_mul_ps(rsq00,rinv00);
303
304             /* Calculate table index by multiplying r with table scale and truncate to integer */
305             rt               = _mm_mul_ps(r00,vftabscale);
306             vfitab           = _mm_cvttps_epi32(rt);
307 #ifdef __XOP__
308             vfeps            = _mm_frcz_ps(rt);
309 #else
310             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
311 #endif
312             twovfeps         = _mm_add_ps(vfeps,vfeps);
313             vfitab           = _mm_slli_epi32(vfitab,3);
314
315             /* COULOMB ELECTROSTATICS */
316             velec            = _mm_mul_ps(qq00,rinv00);
317             felec            = _mm_mul_ps(velec,rinvsq00);
318
319             /* CUBIC SPLINE TABLE DISPERSION */
320             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
321             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
322             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
323             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
324             _MM_TRANSPOSE4_PS(Y,F,G,H);
325             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
326             VV               = _mm_macc_ps(vfeps,Fp,Y);
327             vvdw6            = _mm_mul_ps(c6_00,VV);
328             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
329             fvdw6            = _mm_mul_ps(c6_00,FF);
330
331             /* CUBIC SPLINE TABLE REPULSION */
332             vfitab           = _mm_add_epi32(vfitab,ifour);
333             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
334             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
335             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
336             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
337             _MM_TRANSPOSE4_PS(Y,F,G,H);
338             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
339             VV               = _mm_macc_ps(vfeps,Fp,Y);
340             vvdw12           = _mm_mul_ps(c12_00,VV);
341             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
342             fvdw12           = _mm_mul_ps(c12_00,FF);
343             vvdw             = _mm_add_ps(vvdw12,vvdw6);
344             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
345
346             /* Update potential sum for this i atom from the interaction with this j atom. */
347             velecsum         = _mm_add_ps(velecsum,velec);
348             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
349
350             fscal            = _mm_add_ps(felec,fvdw);
351
352              /* Update vectorial force */
353             fix0             = _mm_macc_ps(dx00,fscal,fix0);
354             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
355             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
356
357             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
358             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
359             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
360
361             /**************************
362              * CALCULATE INTERACTIONS *
363              **************************/
364
365             /* COULOMB ELECTROSTATICS */
366             velec            = _mm_mul_ps(qq01,rinv01);
367             felec            = _mm_mul_ps(velec,rinvsq01);
368
369             /* Update potential sum for this i atom from the interaction with this j atom. */
370             velecsum         = _mm_add_ps(velecsum,velec);
371
372             fscal            = felec;
373
374              /* Update vectorial force */
375             fix0             = _mm_macc_ps(dx01,fscal,fix0);
376             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
377             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
378
379             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
380             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
381             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
382
383             /**************************
384              * CALCULATE INTERACTIONS *
385              **************************/
386
387             /* COULOMB ELECTROSTATICS */
388             velec            = _mm_mul_ps(qq02,rinv02);
389             felec            = _mm_mul_ps(velec,rinvsq02);
390
391             /* Update potential sum for this i atom from the interaction with this j atom. */
392             velecsum         = _mm_add_ps(velecsum,velec);
393
394             fscal            = felec;
395
396              /* Update vectorial force */
397             fix0             = _mm_macc_ps(dx02,fscal,fix0);
398             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
399             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
400
401             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
402             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
403             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
404
405             /**************************
406              * CALCULATE INTERACTIONS *
407              **************************/
408
409             /* COULOMB ELECTROSTATICS */
410             velec            = _mm_mul_ps(qq10,rinv10);
411             felec            = _mm_mul_ps(velec,rinvsq10);
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velecsum         = _mm_add_ps(velecsum,velec);
415
416             fscal            = felec;
417
418              /* Update vectorial force */
419             fix1             = _mm_macc_ps(dx10,fscal,fix1);
420             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
421             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
422
423             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
424             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
425             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
426
427             /**************************
428              * CALCULATE INTERACTIONS *
429              **************************/
430
431             /* COULOMB ELECTROSTATICS */
432             velec            = _mm_mul_ps(qq11,rinv11);
433             felec            = _mm_mul_ps(velec,rinvsq11);
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velecsum         = _mm_add_ps(velecsum,velec);
437
438             fscal            = felec;
439
440              /* Update vectorial force */
441             fix1             = _mm_macc_ps(dx11,fscal,fix1);
442             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
443             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
444
445             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
446             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
447             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             /* COULOMB ELECTROSTATICS */
454             velec            = _mm_mul_ps(qq12,rinv12);
455             felec            = _mm_mul_ps(velec,rinvsq12);
456
457             /* Update potential sum for this i atom from the interaction with this j atom. */
458             velecsum         = _mm_add_ps(velecsum,velec);
459
460             fscal            = felec;
461
462              /* Update vectorial force */
463             fix1             = _mm_macc_ps(dx12,fscal,fix1);
464             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
465             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
466
467             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
468             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
469             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
470
471             /**************************
472              * CALCULATE INTERACTIONS *
473              **************************/
474
475             /* COULOMB ELECTROSTATICS */
476             velec            = _mm_mul_ps(qq20,rinv20);
477             felec            = _mm_mul_ps(velec,rinvsq20);
478
479             /* Update potential sum for this i atom from the interaction with this j atom. */
480             velecsum         = _mm_add_ps(velecsum,velec);
481
482             fscal            = felec;
483
484              /* Update vectorial force */
485             fix2             = _mm_macc_ps(dx20,fscal,fix2);
486             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
487             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
488
489             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
490             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
491             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
492
493             /**************************
494              * CALCULATE INTERACTIONS *
495              **************************/
496
497             /* COULOMB ELECTROSTATICS */
498             velec            = _mm_mul_ps(qq21,rinv21);
499             felec            = _mm_mul_ps(velec,rinvsq21);
500
501             /* Update potential sum for this i atom from the interaction with this j atom. */
502             velecsum         = _mm_add_ps(velecsum,velec);
503
504             fscal            = felec;
505
506              /* Update vectorial force */
507             fix2             = _mm_macc_ps(dx21,fscal,fix2);
508             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
509             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
510
511             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
512             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
513             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
514
515             /**************************
516              * CALCULATE INTERACTIONS *
517              **************************/
518
519             /* COULOMB ELECTROSTATICS */
520             velec            = _mm_mul_ps(qq22,rinv22);
521             felec            = _mm_mul_ps(velec,rinvsq22);
522
523             /* Update potential sum for this i atom from the interaction with this j atom. */
524             velecsum         = _mm_add_ps(velecsum,velec);
525
526             fscal            = felec;
527
528              /* Update vectorial force */
529             fix2             = _mm_macc_ps(dx22,fscal,fix2);
530             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
531             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
532
533             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
534             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
535             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
536
537             fjptrA             = f+j_coord_offsetA;
538             fjptrB             = f+j_coord_offsetB;
539             fjptrC             = f+j_coord_offsetC;
540             fjptrD             = f+j_coord_offsetD;
541
542             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
543                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
544
545             /* Inner loop uses 314 flops */
546         }
547
548         if(jidx<j_index_end)
549         {
550
551             /* Get j neighbor index, and coordinate index */
552             jnrlistA         = jjnr[jidx];
553             jnrlistB         = jjnr[jidx+1];
554             jnrlistC         = jjnr[jidx+2];
555             jnrlistD         = jjnr[jidx+3];
556             /* Sign of each element will be negative for non-real atoms.
557              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
558              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
559              */
560             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
561             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
562             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
563             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
564             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
565             j_coord_offsetA  = DIM*jnrA;
566             j_coord_offsetB  = DIM*jnrB;
567             j_coord_offsetC  = DIM*jnrC;
568             j_coord_offsetD  = DIM*jnrD;
569
570             /* load j atom coordinates */
571             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
572                                               x+j_coord_offsetC,x+j_coord_offsetD,
573                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
574
575             /* Calculate displacement vector */
576             dx00             = _mm_sub_ps(ix0,jx0);
577             dy00             = _mm_sub_ps(iy0,jy0);
578             dz00             = _mm_sub_ps(iz0,jz0);
579             dx01             = _mm_sub_ps(ix0,jx1);
580             dy01             = _mm_sub_ps(iy0,jy1);
581             dz01             = _mm_sub_ps(iz0,jz1);
582             dx02             = _mm_sub_ps(ix0,jx2);
583             dy02             = _mm_sub_ps(iy0,jy2);
584             dz02             = _mm_sub_ps(iz0,jz2);
585             dx10             = _mm_sub_ps(ix1,jx0);
586             dy10             = _mm_sub_ps(iy1,jy0);
587             dz10             = _mm_sub_ps(iz1,jz0);
588             dx11             = _mm_sub_ps(ix1,jx1);
589             dy11             = _mm_sub_ps(iy1,jy1);
590             dz11             = _mm_sub_ps(iz1,jz1);
591             dx12             = _mm_sub_ps(ix1,jx2);
592             dy12             = _mm_sub_ps(iy1,jy2);
593             dz12             = _mm_sub_ps(iz1,jz2);
594             dx20             = _mm_sub_ps(ix2,jx0);
595             dy20             = _mm_sub_ps(iy2,jy0);
596             dz20             = _mm_sub_ps(iz2,jz0);
597             dx21             = _mm_sub_ps(ix2,jx1);
598             dy21             = _mm_sub_ps(iy2,jy1);
599             dz21             = _mm_sub_ps(iz2,jz1);
600             dx22             = _mm_sub_ps(ix2,jx2);
601             dy22             = _mm_sub_ps(iy2,jy2);
602             dz22             = _mm_sub_ps(iz2,jz2);
603
604             /* Calculate squared distance and things based on it */
605             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
606             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
607             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
608             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
609             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
610             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
611             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
612             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
613             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
614
615             rinv00           = avx128fma_invsqrt_f(rsq00);
616             rinv01           = avx128fma_invsqrt_f(rsq01);
617             rinv02           = avx128fma_invsqrt_f(rsq02);
618             rinv10           = avx128fma_invsqrt_f(rsq10);
619             rinv11           = avx128fma_invsqrt_f(rsq11);
620             rinv12           = avx128fma_invsqrt_f(rsq12);
621             rinv20           = avx128fma_invsqrt_f(rsq20);
622             rinv21           = avx128fma_invsqrt_f(rsq21);
623             rinv22           = avx128fma_invsqrt_f(rsq22);
624
625             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
626             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
627             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
628             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
629             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
630             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
631             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
632             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
633             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
634
635             fjx0             = _mm_setzero_ps();
636             fjy0             = _mm_setzero_ps();
637             fjz0             = _mm_setzero_ps();
638             fjx1             = _mm_setzero_ps();
639             fjy1             = _mm_setzero_ps();
640             fjz1             = _mm_setzero_ps();
641             fjx2             = _mm_setzero_ps();
642             fjy2             = _mm_setzero_ps();
643             fjz2             = _mm_setzero_ps();
644
645             /**************************
646              * CALCULATE INTERACTIONS *
647              **************************/
648
649             r00              = _mm_mul_ps(rsq00,rinv00);
650             r00              = _mm_andnot_ps(dummy_mask,r00);
651
652             /* Calculate table index by multiplying r with table scale and truncate to integer */
653             rt               = _mm_mul_ps(r00,vftabscale);
654             vfitab           = _mm_cvttps_epi32(rt);
655 #ifdef __XOP__
656             vfeps            = _mm_frcz_ps(rt);
657 #else
658             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
659 #endif
660             twovfeps         = _mm_add_ps(vfeps,vfeps);
661             vfitab           = _mm_slli_epi32(vfitab,3);
662
663             /* COULOMB ELECTROSTATICS */
664             velec            = _mm_mul_ps(qq00,rinv00);
665             felec            = _mm_mul_ps(velec,rinvsq00);
666
667             /* CUBIC SPLINE TABLE DISPERSION */
668             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
669             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
670             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
671             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
672             _MM_TRANSPOSE4_PS(Y,F,G,H);
673             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
674             VV               = _mm_macc_ps(vfeps,Fp,Y);
675             vvdw6            = _mm_mul_ps(c6_00,VV);
676             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
677             fvdw6            = _mm_mul_ps(c6_00,FF);
678
679             /* CUBIC SPLINE TABLE REPULSION */
680             vfitab           = _mm_add_epi32(vfitab,ifour);
681             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
682             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
683             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
684             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
685             _MM_TRANSPOSE4_PS(Y,F,G,H);
686             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
687             VV               = _mm_macc_ps(vfeps,Fp,Y);
688             vvdw12           = _mm_mul_ps(c12_00,VV);
689             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
690             fvdw12           = _mm_mul_ps(c12_00,FF);
691             vvdw             = _mm_add_ps(vvdw12,vvdw6);
692             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
693
694             /* Update potential sum for this i atom from the interaction with this j atom. */
695             velec            = _mm_andnot_ps(dummy_mask,velec);
696             velecsum         = _mm_add_ps(velecsum,velec);
697             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
698             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
699
700             fscal            = _mm_add_ps(felec,fvdw);
701
702             fscal            = _mm_andnot_ps(dummy_mask,fscal);
703
704              /* Update vectorial force */
705             fix0             = _mm_macc_ps(dx00,fscal,fix0);
706             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
707             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
708
709             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
710             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
711             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
712
713             /**************************
714              * CALCULATE INTERACTIONS *
715              **************************/
716
717             /* COULOMB ELECTROSTATICS */
718             velec            = _mm_mul_ps(qq01,rinv01);
719             felec            = _mm_mul_ps(velec,rinvsq01);
720
721             /* Update potential sum for this i atom from the interaction with this j atom. */
722             velec            = _mm_andnot_ps(dummy_mask,velec);
723             velecsum         = _mm_add_ps(velecsum,velec);
724
725             fscal            = felec;
726
727             fscal            = _mm_andnot_ps(dummy_mask,fscal);
728
729              /* Update vectorial force */
730             fix0             = _mm_macc_ps(dx01,fscal,fix0);
731             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
732             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
733
734             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
735             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
736             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
737
738             /**************************
739              * CALCULATE INTERACTIONS *
740              **************************/
741
742             /* COULOMB ELECTROSTATICS */
743             velec            = _mm_mul_ps(qq02,rinv02);
744             felec            = _mm_mul_ps(velec,rinvsq02);
745
746             /* Update potential sum for this i atom from the interaction with this j atom. */
747             velec            = _mm_andnot_ps(dummy_mask,velec);
748             velecsum         = _mm_add_ps(velecsum,velec);
749
750             fscal            = felec;
751
752             fscal            = _mm_andnot_ps(dummy_mask,fscal);
753
754              /* Update vectorial force */
755             fix0             = _mm_macc_ps(dx02,fscal,fix0);
756             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
757             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
758
759             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
760             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
761             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
762
763             /**************************
764              * CALCULATE INTERACTIONS *
765              **************************/
766
767             /* COULOMB ELECTROSTATICS */
768             velec            = _mm_mul_ps(qq10,rinv10);
769             felec            = _mm_mul_ps(velec,rinvsq10);
770
771             /* Update potential sum for this i atom from the interaction with this j atom. */
772             velec            = _mm_andnot_ps(dummy_mask,velec);
773             velecsum         = _mm_add_ps(velecsum,velec);
774
775             fscal            = felec;
776
777             fscal            = _mm_andnot_ps(dummy_mask,fscal);
778
779              /* Update vectorial force */
780             fix1             = _mm_macc_ps(dx10,fscal,fix1);
781             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
782             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
783
784             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
785             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
786             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
787
788             /**************************
789              * CALCULATE INTERACTIONS *
790              **************************/
791
792             /* COULOMB ELECTROSTATICS */
793             velec            = _mm_mul_ps(qq11,rinv11);
794             felec            = _mm_mul_ps(velec,rinvsq11);
795
796             /* Update potential sum for this i atom from the interaction with this j atom. */
797             velec            = _mm_andnot_ps(dummy_mask,velec);
798             velecsum         = _mm_add_ps(velecsum,velec);
799
800             fscal            = felec;
801
802             fscal            = _mm_andnot_ps(dummy_mask,fscal);
803
804              /* Update vectorial force */
805             fix1             = _mm_macc_ps(dx11,fscal,fix1);
806             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
807             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
808
809             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
810             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
811             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
812
813             /**************************
814              * CALCULATE INTERACTIONS *
815              **************************/
816
817             /* COULOMB ELECTROSTATICS */
818             velec            = _mm_mul_ps(qq12,rinv12);
819             felec            = _mm_mul_ps(velec,rinvsq12);
820
821             /* Update potential sum for this i atom from the interaction with this j atom. */
822             velec            = _mm_andnot_ps(dummy_mask,velec);
823             velecsum         = _mm_add_ps(velecsum,velec);
824
825             fscal            = felec;
826
827             fscal            = _mm_andnot_ps(dummy_mask,fscal);
828
829              /* Update vectorial force */
830             fix1             = _mm_macc_ps(dx12,fscal,fix1);
831             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
832             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
833
834             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
835             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
836             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
837
838             /**************************
839              * CALCULATE INTERACTIONS *
840              **************************/
841
842             /* COULOMB ELECTROSTATICS */
843             velec            = _mm_mul_ps(qq20,rinv20);
844             felec            = _mm_mul_ps(velec,rinvsq20);
845
846             /* Update potential sum for this i atom from the interaction with this j atom. */
847             velec            = _mm_andnot_ps(dummy_mask,velec);
848             velecsum         = _mm_add_ps(velecsum,velec);
849
850             fscal            = felec;
851
852             fscal            = _mm_andnot_ps(dummy_mask,fscal);
853
854              /* Update vectorial force */
855             fix2             = _mm_macc_ps(dx20,fscal,fix2);
856             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
857             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
858
859             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
860             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
861             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
862
863             /**************************
864              * CALCULATE INTERACTIONS *
865              **************************/
866
867             /* COULOMB ELECTROSTATICS */
868             velec            = _mm_mul_ps(qq21,rinv21);
869             felec            = _mm_mul_ps(velec,rinvsq21);
870
871             /* Update potential sum for this i atom from the interaction with this j atom. */
872             velec            = _mm_andnot_ps(dummy_mask,velec);
873             velecsum         = _mm_add_ps(velecsum,velec);
874
875             fscal            = felec;
876
877             fscal            = _mm_andnot_ps(dummy_mask,fscal);
878
879              /* Update vectorial force */
880             fix2             = _mm_macc_ps(dx21,fscal,fix2);
881             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
882             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
883
884             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
885             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
886             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
887
888             /**************************
889              * CALCULATE INTERACTIONS *
890              **************************/
891
892             /* COULOMB ELECTROSTATICS */
893             velec            = _mm_mul_ps(qq22,rinv22);
894             felec            = _mm_mul_ps(velec,rinvsq22);
895
896             /* Update potential sum for this i atom from the interaction with this j atom. */
897             velec            = _mm_andnot_ps(dummy_mask,velec);
898             velecsum         = _mm_add_ps(velecsum,velec);
899
900             fscal            = felec;
901
902             fscal            = _mm_andnot_ps(dummy_mask,fscal);
903
904              /* Update vectorial force */
905             fix2             = _mm_macc_ps(dx22,fscal,fix2);
906             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
907             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
908
909             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
910             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
911             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
912
913             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
914             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
915             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
916             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
917
918             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
919                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
920
921             /* Inner loop uses 315 flops */
922         }
923
924         /* End of innermost loop */
925
926         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
927                                               f+i_coord_offset,fshift+i_shift_offset);
928
929         ggid                        = gid[iidx];
930         /* Update potential energies */
931         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
932         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
933
934         /* Increment number of inner iterations */
935         inneriter                  += j_index_end - j_index_start;
936
937         /* Outer loop uses 20 flops */
938     }
939
940     /* Increment number of outer iterations */
941     outeriter        += nri;
942
943     /* Update outer/inner flops */
944
945     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*315);
946 }
947 /*
948  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single
949  * Electrostatics interaction: Coulomb
950  * VdW interaction:            CubicSplineTable
951  * Geometry:                   Water3-Water3
952  * Calculate force/pot:        Force
953  */
954 void
955 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single
956                     (t_nblist                    * gmx_restrict       nlist,
957                      rvec                        * gmx_restrict          xx,
958                      rvec                        * gmx_restrict          ff,
959                      struct t_forcerec           * gmx_restrict          fr,
960                      t_mdatoms                   * gmx_restrict     mdatoms,
961                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
962                      t_nrnb                      * gmx_restrict        nrnb)
963 {
964     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
965      * just 0 for non-waters.
966      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
967      * jnr indices corresponding to data put in the four positions in the SIMD register.
968      */
969     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
970     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
971     int              jnrA,jnrB,jnrC,jnrD;
972     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
973     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
974     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
975     real             rcutoff_scalar;
976     real             *shiftvec,*fshift,*x,*f;
977     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
978     real             scratch[4*DIM];
979     __m128           fscal,rcutoff,rcutoff2,jidxall;
980     int              vdwioffset0;
981     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
982     int              vdwioffset1;
983     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
984     int              vdwioffset2;
985     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
986     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
987     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
988     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
989     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
990     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
991     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
992     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
993     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
994     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
995     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
996     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
997     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
998     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
999     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1000     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1001     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1002     real             *charge;
1003     int              nvdwtype;
1004     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1005     int              *vdwtype;
1006     real             *vdwparam;
1007     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1008     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1009     __m128i          vfitab;
1010     __m128i          ifour       = _mm_set1_epi32(4);
1011     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1012     real             *vftab;
1013     __m128           dummy_mask,cutoff_mask;
1014     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1015     __m128           one     = _mm_set1_ps(1.0);
1016     __m128           two     = _mm_set1_ps(2.0);
1017     x                = xx[0];
1018     f                = ff[0];
1019
1020     nri              = nlist->nri;
1021     iinr             = nlist->iinr;
1022     jindex           = nlist->jindex;
1023     jjnr             = nlist->jjnr;
1024     shiftidx         = nlist->shift;
1025     gid              = nlist->gid;
1026     shiftvec         = fr->shift_vec[0];
1027     fshift           = fr->fshift[0];
1028     facel            = _mm_set1_ps(fr->ic->epsfac);
1029     charge           = mdatoms->chargeA;
1030     nvdwtype         = fr->ntype;
1031     vdwparam         = fr->nbfp;
1032     vdwtype          = mdatoms->typeA;
1033
1034     vftab            = kernel_data->table_vdw->data;
1035     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1036
1037     /* Setup water-specific parameters */
1038     inr              = nlist->iinr[0];
1039     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1040     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1041     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1042     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1043
1044     jq0              = _mm_set1_ps(charge[inr+0]);
1045     jq1              = _mm_set1_ps(charge[inr+1]);
1046     jq2              = _mm_set1_ps(charge[inr+2]);
1047     vdwjidx0A        = 2*vdwtype[inr+0];
1048     qq00             = _mm_mul_ps(iq0,jq0);
1049     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1050     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1051     qq01             = _mm_mul_ps(iq0,jq1);
1052     qq02             = _mm_mul_ps(iq0,jq2);
1053     qq10             = _mm_mul_ps(iq1,jq0);
1054     qq11             = _mm_mul_ps(iq1,jq1);
1055     qq12             = _mm_mul_ps(iq1,jq2);
1056     qq20             = _mm_mul_ps(iq2,jq0);
1057     qq21             = _mm_mul_ps(iq2,jq1);
1058     qq22             = _mm_mul_ps(iq2,jq2);
1059
1060     /* Avoid stupid compiler warnings */
1061     jnrA = jnrB = jnrC = jnrD = 0;
1062     j_coord_offsetA = 0;
1063     j_coord_offsetB = 0;
1064     j_coord_offsetC = 0;
1065     j_coord_offsetD = 0;
1066
1067     outeriter        = 0;
1068     inneriter        = 0;
1069
1070     for(iidx=0;iidx<4*DIM;iidx++)
1071     {
1072         scratch[iidx] = 0.0;
1073     }
1074
1075     /* Start outer loop over neighborlists */
1076     for(iidx=0; iidx<nri; iidx++)
1077     {
1078         /* Load shift vector for this list */
1079         i_shift_offset   = DIM*shiftidx[iidx];
1080
1081         /* Load limits for loop over neighbors */
1082         j_index_start    = jindex[iidx];
1083         j_index_end      = jindex[iidx+1];
1084
1085         /* Get outer coordinate index */
1086         inr              = iinr[iidx];
1087         i_coord_offset   = DIM*inr;
1088
1089         /* Load i particle coords and add shift vector */
1090         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1091                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1092
1093         fix0             = _mm_setzero_ps();
1094         fiy0             = _mm_setzero_ps();
1095         fiz0             = _mm_setzero_ps();
1096         fix1             = _mm_setzero_ps();
1097         fiy1             = _mm_setzero_ps();
1098         fiz1             = _mm_setzero_ps();
1099         fix2             = _mm_setzero_ps();
1100         fiy2             = _mm_setzero_ps();
1101         fiz2             = _mm_setzero_ps();
1102
1103         /* Start inner kernel loop */
1104         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1105         {
1106
1107             /* Get j neighbor index, and coordinate index */
1108             jnrA             = jjnr[jidx];
1109             jnrB             = jjnr[jidx+1];
1110             jnrC             = jjnr[jidx+2];
1111             jnrD             = jjnr[jidx+3];
1112             j_coord_offsetA  = DIM*jnrA;
1113             j_coord_offsetB  = DIM*jnrB;
1114             j_coord_offsetC  = DIM*jnrC;
1115             j_coord_offsetD  = DIM*jnrD;
1116
1117             /* load j atom coordinates */
1118             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1119                                               x+j_coord_offsetC,x+j_coord_offsetD,
1120                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1121
1122             /* Calculate displacement vector */
1123             dx00             = _mm_sub_ps(ix0,jx0);
1124             dy00             = _mm_sub_ps(iy0,jy0);
1125             dz00             = _mm_sub_ps(iz0,jz0);
1126             dx01             = _mm_sub_ps(ix0,jx1);
1127             dy01             = _mm_sub_ps(iy0,jy1);
1128             dz01             = _mm_sub_ps(iz0,jz1);
1129             dx02             = _mm_sub_ps(ix0,jx2);
1130             dy02             = _mm_sub_ps(iy0,jy2);
1131             dz02             = _mm_sub_ps(iz0,jz2);
1132             dx10             = _mm_sub_ps(ix1,jx0);
1133             dy10             = _mm_sub_ps(iy1,jy0);
1134             dz10             = _mm_sub_ps(iz1,jz0);
1135             dx11             = _mm_sub_ps(ix1,jx1);
1136             dy11             = _mm_sub_ps(iy1,jy1);
1137             dz11             = _mm_sub_ps(iz1,jz1);
1138             dx12             = _mm_sub_ps(ix1,jx2);
1139             dy12             = _mm_sub_ps(iy1,jy2);
1140             dz12             = _mm_sub_ps(iz1,jz2);
1141             dx20             = _mm_sub_ps(ix2,jx0);
1142             dy20             = _mm_sub_ps(iy2,jy0);
1143             dz20             = _mm_sub_ps(iz2,jz0);
1144             dx21             = _mm_sub_ps(ix2,jx1);
1145             dy21             = _mm_sub_ps(iy2,jy1);
1146             dz21             = _mm_sub_ps(iz2,jz1);
1147             dx22             = _mm_sub_ps(ix2,jx2);
1148             dy22             = _mm_sub_ps(iy2,jy2);
1149             dz22             = _mm_sub_ps(iz2,jz2);
1150
1151             /* Calculate squared distance and things based on it */
1152             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1153             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1154             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1155             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1156             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1157             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1158             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1159             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1160             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1161
1162             rinv00           = avx128fma_invsqrt_f(rsq00);
1163             rinv01           = avx128fma_invsqrt_f(rsq01);
1164             rinv02           = avx128fma_invsqrt_f(rsq02);
1165             rinv10           = avx128fma_invsqrt_f(rsq10);
1166             rinv11           = avx128fma_invsqrt_f(rsq11);
1167             rinv12           = avx128fma_invsqrt_f(rsq12);
1168             rinv20           = avx128fma_invsqrt_f(rsq20);
1169             rinv21           = avx128fma_invsqrt_f(rsq21);
1170             rinv22           = avx128fma_invsqrt_f(rsq22);
1171
1172             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1173             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1174             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1175             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1176             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1177             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1178             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1179             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1180             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1181
1182             fjx0             = _mm_setzero_ps();
1183             fjy0             = _mm_setzero_ps();
1184             fjz0             = _mm_setzero_ps();
1185             fjx1             = _mm_setzero_ps();
1186             fjy1             = _mm_setzero_ps();
1187             fjz1             = _mm_setzero_ps();
1188             fjx2             = _mm_setzero_ps();
1189             fjy2             = _mm_setzero_ps();
1190             fjz2             = _mm_setzero_ps();
1191
1192             /**************************
1193              * CALCULATE INTERACTIONS *
1194              **************************/
1195
1196             r00              = _mm_mul_ps(rsq00,rinv00);
1197
1198             /* Calculate table index by multiplying r with table scale and truncate to integer */
1199             rt               = _mm_mul_ps(r00,vftabscale);
1200             vfitab           = _mm_cvttps_epi32(rt);
1201 #ifdef __XOP__
1202             vfeps            = _mm_frcz_ps(rt);
1203 #else
1204             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1205 #endif
1206             twovfeps         = _mm_add_ps(vfeps,vfeps);
1207             vfitab           = _mm_slli_epi32(vfitab,3);
1208
1209             /* COULOMB ELECTROSTATICS */
1210             velec            = _mm_mul_ps(qq00,rinv00);
1211             felec            = _mm_mul_ps(velec,rinvsq00);
1212
1213             /* CUBIC SPLINE TABLE DISPERSION */
1214             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1215             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1216             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1217             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1218             _MM_TRANSPOSE4_PS(Y,F,G,H);
1219             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1220             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1221             fvdw6            = _mm_mul_ps(c6_00,FF);
1222
1223             /* CUBIC SPLINE TABLE REPULSION */
1224             vfitab           = _mm_add_epi32(vfitab,ifour);
1225             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1226             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1227             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1228             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1229             _MM_TRANSPOSE4_PS(Y,F,G,H);
1230             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1231             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1232             fvdw12           = _mm_mul_ps(c12_00,FF);
1233             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1234
1235             fscal            = _mm_add_ps(felec,fvdw);
1236
1237              /* Update vectorial force */
1238             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1239             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1240             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1241
1242             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1243             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1244             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1245
1246             /**************************
1247              * CALCULATE INTERACTIONS *
1248              **************************/
1249
1250             /* COULOMB ELECTROSTATICS */
1251             velec            = _mm_mul_ps(qq01,rinv01);
1252             felec            = _mm_mul_ps(velec,rinvsq01);
1253
1254             fscal            = felec;
1255
1256              /* Update vectorial force */
1257             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1258             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1259             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1260
1261             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1262             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1263             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1264
1265             /**************************
1266              * CALCULATE INTERACTIONS *
1267              **************************/
1268
1269             /* COULOMB ELECTROSTATICS */
1270             velec            = _mm_mul_ps(qq02,rinv02);
1271             felec            = _mm_mul_ps(velec,rinvsq02);
1272
1273             fscal            = felec;
1274
1275              /* Update vectorial force */
1276             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1277             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1278             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1279
1280             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1281             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1282             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1283
1284             /**************************
1285              * CALCULATE INTERACTIONS *
1286              **************************/
1287
1288             /* COULOMB ELECTROSTATICS */
1289             velec            = _mm_mul_ps(qq10,rinv10);
1290             felec            = _mm_mul_ps(velec,rinvsq10);
1291
1292             fscal            = felec;
1293
1294              /* Update vectorial force */
1295             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1296             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1297             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1298
1299             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1300             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1301             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1302
1303             /**************************
1304              * CALCULATE INTERACTIONS *
1305              **************************/
1306
1307             /* COULOMB ELECTROSTATICS */
1308             velec            = _mm_mul_ps(qq11,rinv11);
1309             felec            = _mm_mul_ps(velec,rinvsq11);
1310
1311             fscal            = felec;
1312
1313              /* Update vectorial force */
1314             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1315             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1316             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1317
1318             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1319             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1320             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1321
1322             /**************************
1323              * CALCULATE INTERACTIONS *
1324              **************************/
1325
1326             /* COULOMB ELECTROSTATICS */
1327             velec            = _mm_mul_ps(qq12,rinv12);
1328             felec            = _mm_mul_ps(velec,rinvsq12);
1329
1330             fscal            = felec;
1331
1332              /* Update vectorial force */
1333             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1334             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1335             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1336
1337             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1338             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1339             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1340
1341             /**************************
1342              * CALCULATE INTERACTIONS *
1343              **************************/
1344
1345             /* COULOMB ELECTROSTATICS */
1346             velec            = _mm_mul_ps(qq20,rinv20);
1347             felec            = _mm_mul_ps(velec,rinvsq20);
1348
1349             fscal            = felec;
1350
1351              /* Update vectorial force */
1352             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1353             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1354             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1355
1356             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1357             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1358             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1359
1360             /**************************
1361              * CALCULATE INTERACTIONS *
1362              **************************/
1363
1364             /* COULOMB ELECTROSTATICS */
1365             velec            = _mm_mul_ps(qq21,rinv21);
1366             felec            = _mm_mul_ps(velec,rinvsq21);
1367
1368             fscal            = felec;
1369
1370              /* Update vectorial force */
1371             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1372             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1373             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1374
1375             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1376             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1377             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1378
1379             /**************************
1380              * CALCULATE INTERACTIONS *
1381              **************************/
1382
1383             /* COULOMB ELECTROSTATICS */
1384             velec            = _mm_mul_ps(qq22,rinv22);
1385             felec            = _mm_mul_ps(velec,rinvsq22);
1386
1387             fscal            = felec;
1388
1389              /* Update vectorial force */
1390             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1391             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1392             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1393
1394             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1395             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1396             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1397
1398             fjptrA             = f+j_coord_offsetA;
1399             fjptrB             = f+j_coord_offsetB;
1400             fjptrC             = f+j_coord_offsetC;
1401             fjptrD             = f+j_coord_offsetD;
1402
1403             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1404                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1405
1406             /* Inner loop uses 297 flops */
1407         }
1408
1409         if(jidx<j_index_end)
1410         {
1411
1412             /* Get j neighbor index, and coordinate index */
1413             jnrlistA         = jjnr[jidx];
1414             jnrlistB         = jjnr[jidx+1];
1415             jnrlistC         = jjnr[jidx+2];
1416             jnrlistD         = jjnr[jidx+3];
1417             /* Sign of each element will be negative for non-real atoms.
1418              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1419              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1420              */
1421             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1422             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1423             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1424             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1425             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1426             j_coord_offsetA  = DIM*jnrA;
1427             j_coord_offsetB  = DIM*jnrB;
1428             j_coord_offsetC  = DIM*jnrC;
1429             j_coord_offsetD  = DIM*jnrD;
1430
1431             /* load j atom coordinates */
1432             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1433                                               x+j_coord_offsetC,x+j_coord_offsetD,
1434                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1435
1436             /* Calculate displacement vector */
1437             dx00             = _mm_sub_ps(ix0,jx0);
1438             dy00             = _mm_sub_ps(iy0,jy0);
1439             dz00             = _mm_sub_ps(iz0,jz0);
1440             dx01             = _mm_sub_ps(ix0,jx1);
1441             dy01             = _mm_sub_ps(iy0,jy1);
1442             dz01             = _mm_sub_ps(iz0,jz1);
1443             dx02             = _mm_sub_ps(ix0,jx2);
1444             dy02             = _mm_sub_ps(iy0,jy2);
1445             dz02             = _mm_sub_ps(iz0,jz2);
1446             dx10             = _mm_sub_ps(ix1,jx0);
1447             dy10             = _mm_sub_ps(iy1,jy0);
1448             dz10             = _mm_sub_ps(iz1,jz0);
1449             dx11             = _mm_sub_ps(ix1,jx1);
1450             dy11             = _mm_sub_ps(iy1,jy1);
1451             dz11             = _mm_sub_ps(iz1,jz1);
1452             dx12             = _mm_sub_ps(ix1,jx2);
1453             dy12             = _mm_sub_ps(iy1,jy2);
1454             dz12             = _mm_sub_ps(iz1,jz2);
1455             dx20             = _mm_sub_ps(ix2,jx0);
1456             dy20             = _mm_sub_ps(iy2,jy0);
1457             dz20             = _mm_sub_ps(iz2,jz0);
1458             dx21             = _mm_sub_ps(ix2,jx1);
1459             dy21             = _mm_sub_ps(iy2,jy1);
1460             dz21             = _mm_sub_ps(iz2,jz1);
1461             dx22             = _mm_sub_ps(ix2,jx2);
1462             dy22             = _mm_sub_ps(iy2,jy2);
1463             dz22             = _mm_sub_ps(iz2,jz2);
1464
1465             /* Calculate squared distance and things based on it */
1466             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1467             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1468             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1469             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1470             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1471             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1472             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1473             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1474             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1475
1476             rinv00           = avx128fma_invsqrt_f(rsq00);
1477             rinv01           = avx128fma_invsqrt_f(rsq01);
1478             rinv02           = avx128fma_invsqrt_f(rsq02);
1479             rinv10           = avx128fma_invsqrt_f(rsq10);
1480             rinv11           = avx128fma_invsqrt_f(rsq11);
1481             rinv12           = avx128fma_invsqrt_f(rsq12);
1482             rinv20           = avx128fma_invsqrt_f(rsq20);
1483             rinv21           = avx128fma_invsqrt_f(rsq21);
1484             rinv22           = avx128fma_invsqrt_f(rsq22);
1485
1486             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1487             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1488             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1489             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1490             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1491             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1492             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1493             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1494             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1495
1496             fjx0             = _mm_setzero_ps();
1497             fjy0             = _mm_setzero_ps();
1498             fjz0             = _mm_setzero_ps();
1499             fjx1             = _mm_setzero_ps();
1500             fjy1             = _mm_setzero_ps();
1501             fjz1             = _mm_setzero_ps();
1502             fjx2             = _mm_setzero_ps();
1503             fjy2             = _mm_setzero_ps();
1504             fjz2             = _mm_setzero_ps();
1505
1506             /**************************
1507              * CALCULATE INTERACTIONS *
1508              **************************/
1509
1510             r00              = _mm_mul_ps(rsq00,rinv00);
1511             r00              = _mm_andnot_ps(dummy_mask,r00);
1512
1513             /* Calculate table index by multiplying r with table scale and truncate to integer */
1514             rt               = _mm_mul_ps(r00,vftabscale);
1515             vfitab           = _mm_cvttps_epi32(rt);
1516 #ifdef __XOP__
1517             vfeps            = _mm_frcz_ps(rt);
1518 #else
1519             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1520 #endif
1521             twovfeps         = _mm_add_ps(vfeps,vfeps);
1522             vfitab           = _mm_slli_epi32(vfitab,3);
1523
1524             /* COULOMB ELECTROSTATICS */
1525             velec            = _mm_mul_ps(qq00,rinv00);
1526             felec            = _mm_mul_ps(velec,rinvsq00);
1527
1528             /* CUBIC SPLINE TABLE DISPERSION */
1529             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1530             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1531             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1532             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1533             _MM_TRANSPOSE4_PS(Y,F,G,H);
1534             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1535             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1536             fvdw6            = _mm_mul_ps(c6_00,FF);
1537
1538             /* CUBIC SPLINE TABLE REPULSION */
1539             vfitab           = _mm_add_epi32(vfitab,ifour);
1540             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1541             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1542             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1543             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1544             _MM_TRANSPOSE4_PS(Y,F,G,H);
1545             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1546             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1547             fvdw12           = _mm_mul_ps(c12_00,FF);
1548             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1549
1550             fscal            = _mm_add_ps(felec,fvdw);
1551
1552             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1553
1554              /* Update vectorial force */
1555             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1556             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1557             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1558
1559             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1560             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1561             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1562
1563             /**************************
1564              * CALCULATE INTERACTIONS *
1565              **************************/
1566
1567             /* COULOMB ELECTROSTATICS */
1568             velec            = _mm_mul_ps(qq01,rinv01);
1569             felec            = _mm_mul_ps(velec,rinvsq01);
1570
1571             fscal            = felec;
1572
1573             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1574
1575              /* Update vectorial force */
1576             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1577             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1578             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1579
1580             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1581             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1582             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1583
1584             /**************************
1585              * CALCULATE INTERACTIONS *
1586              **************************/
1587
1588             /* COULOMB ELECTROSTATICS */
1589             velec            = _mm_mul_ps(qq02,rinv02);
1590             felec            = _mm_mul_ps(velec,rinvsq02);
1591
1592             fscal            = felec;
1593
1594             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1595
1596              /* Update vectorial force */
1597             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1598             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1599             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1600
1601             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1602             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1603             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1604
1605             /**************************
1606              * CALCULATE INTERACTIONS *
1607              **************************/
1608
1609             /* COULOMB ELECTROSTATICS */
1610             velec            = _mm_mul_ps(qq10,rinv10);
1611             felec            = _mm_mul_ps(velec,rinvsq10);
1612
1613             fscal            = felec;
1614
1615             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1616
1617              /* Update vectorial force */
1618             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1619             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1620             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1621
1622             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1623             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1624             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             /* COULOMB ELECTROSTATICS */
1631             velec            = _mm_mul_ps(qq11,rinv11);
1632             felec            = _mm_mul_ps(velec,rinvsq11);
1633
1634             fscal            = felec;
1635
1636             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1637
1638              /* Update vectorial force */
1639             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1640             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1641             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1642
1643             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1644             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1645             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1646
1647             /**************************
1648              * CALCULATE INTERACTIONS *
1649              **************************/
1650
1651             /* COULOMB ELECTROSTATICS */
1652             velec            = _mm_mul_ps(qq12,rinv12);
1653             felec            = _mm_mul_ps(velec,rinvsq12);
1654
1655             fscal            = felec;
1656
1657             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1658
1659              /* Update vectorial force */
1660             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1661             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1662             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1663
1664             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1665             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1666             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1667
1668             /**************************
1669              * CALCULATE INTERACTIONS *
1670              **************************/
1671
1672             /* COULOMB ELECTROSTATICS */
1673             velec            = _mm_mul_ps(qq20,rinv20);
1674             felec            = _mm_mul_ps(velec,rinvsq20);
1675
1676             fscal            = felec;
1677
1678             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1679
1680              /* Update vectorial force */
1681             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1682             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1683             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1684
1685             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1686             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1687             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1688
1689             /**************************
1690              * CALCULATE INTERACTIONS *
1691              **************************/
1692
1693             /* COULOMB ELECTROSTATICS */
1694             velec            = _mm_mul_ps(qq21,rinv21);
1695             felec            = _mm_mul_ps(velec,rinvsq21);
1696
1697             fscal            = felec;
1698
1699             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1700
1701              /* Update vectorial force */
1702             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1703             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1704             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1705
1706             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1707             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1708             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1709
1710             /**************************
1711              * CALCULATE INTERACTIONS *
1712              **************************/
1713
1714             /* COULOMB ELECTROSTATICS */
1715             velec            = _mm_mul_ps(qq22,rinv22);
1716             felec            = _mm_mul_ps(velec,rinvsq22);
1717
1718             fscal            = felec;
1719
1720             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1721
1722              /* Update vectorial force */
1723             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1724             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1725             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1726
1727             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1728             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1729             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1730
1731             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1732             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1733             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1734             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1735
1736             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1737                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1738
1739             /* Inner loop uses 298 flops */
1740         }
1741
1742         /* End of innermost loop */
1743
1744         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1745                                               f+i_coord_offset,fshift+i_shift_offset);
1746
1747         /* Increment number of inner iterations */
1748         inneriter                  += j_index_end - j_index_start;
1749
1750         /* Outer loop uses 18 flops */
1751     }
1752
1753     /* Increment number of outer iterations */
1754     outeriter        += nri;
1755
1756     /* Update outer/inner flops */
1757
1758     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);
1759 }