Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128i          ewitab;
113     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
115     real             *ewtab;
116     __m128           dummy_mask,cutoff_mask;
117     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118     __m128           one     = _mm_set1_ps(1.0);
119     __m128           two     = _mm_set1_ps(2.0);
120     x                = xx[0];
121     f                = ff[0];
122
123     nri              = nlist->nri;
124     iinr             = nlist->iinr;
125     jindex           = nlist->jindex;
126     jjnr             = nlist->jjnr;
127     shiftidx         = nlist->shift;
128     gid              = nlist->gid;
129     shiftvec         = fr->shift_vec[0];
130     fshift           = fr->fshift[0];
131     facel            = _mm_set1_ps(fr->epsfac);
132     charge           = mdatoms->chargeA;
133     nvdwtype         = fr->ntype;
134     vdwparam         = fr->nbfp;
135     vdwtype          = mdatoms->typeA;
136
137     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
138     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
139     beta2            = _mm_mul_ps(beta,beta);
140     beta3            = _mm_mul_ps(beta,beta2);
141     ewtab            = fr->ic->tabq_coul_FDV0;
142     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
143     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
148     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
151
152     jq0              = _mm_set1_ps(charge[inr+0]);
153     jq1              = _mm_set1_ps(charge[inr+1]);
154     jq2              = _mm_set1_ps(charge[inr+2]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     qq00             = _mm_mul_ps(iq0,jq0);
157     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
158     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
159     qq01             = _mm_mul_ps(iq0,jq1);
160     qq02             = _mm_mul_ps(iq0,jq2);
161     qq10             = _mm_mul_ps(iq1,jq0);
162     qq11             = _mm_mul_ps(iq1,jq1);
163     qq12             = _mm_mul_ps(iq1,jq2);
164     qq20             = _mm_mul_ps(iq2,jq0);
165     qq21             = _mm_mul_ps(iq2,jq1);
166     qq22             = _mm_mul_ps(iq2,jq2);
167
168     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169     rcutoff_scalar   = fr->rcoulomb;
170     rcutoff          = _mm_set1_ps(rcutoff_scalar);
171     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
172
173     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
174     rvdw             = _mm_set1_ps(fr->rvdw);
175
176     /* Avoid stupid compiler warnings */
177     jnrA = jnrB = jnrC = jnrD = 0;
178     j_coord_offsetA = 0;
179     j_coord_offsetB = 0;
180     j_coord_offsetC = 0;
181     j_coord_offsetD = 0;
182
183     outeriter        = 0;
184     inneriter        = 0;
185
186     for(iidx=0;iidx<4*DIM;iidx++)
187     {
188         scratch[iidx] = 0.0;
189     }
190
191     /* Start outer loop over neighborlists */
192     for(iidx=0; iidx<nri; iidx++)
193     {
194         /* Load shift vector for this list */
195         i_shift_offset   = DIM*shiftidx[iidx];
196
197         /* Load limits for loop over neighbors */
198         j_index_start    = jindex[iidx];
199         j_index_end      = jindex[iidx+1];
200
201         /* Get outer coordinate index */
202         inr              = iinr[iidx];
203         i_coord_offset   = DIM*inr;
204
205         /* Load i particle coords and add shift vector */
206         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
207                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
208
209         fix0             = _mm_setzero_ps();
210         fiy0             = _mm_setzero_ps();
211         fiz0             = _mm_setzero_ps();
212         fix1             = _mm_setzero_ps();
213         fiy1             = _mm_setzero_ps();
214         fiz1             = _mm_setzero_ps();
215         fix2             = _mm_setzero_ps();
216         fiy2             = _mm_setzero_ps();
217         fiz2             = _mm_setzero_ps();
218
219         /* Reset potential sums */
220         velecsum         = _mm_setzero_ps();
221         vvdwsum          = _mm_setzero_ps();
222
223         /* Start inner kernel loop */
224         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
225         {
226
227             /* Get j neighbor index, and coordinate index */
228             jnrA             = jjnr[jidx];
229             jnrB             = jjnr[jidx+1];
230             jnrC             = jjnr[jidx+2];
231             jnrD             = jjnr[jidx+3];
232             j_coord_offsetA  = DIM*jnrA;
233             j_coord_offsetB  = DIM*jnrB;
234             j_coord_offsetC  = DIM*jnrC;
235             j_coord_offsetD  = DIM*jnrD;
236
237             /* load j atom coordinates */
238             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
239                                               x+j_coord_offsetC,x+j_coord_offsetD,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
241
242             /* Calculate displacement vector */
243             dx00             = _mm_sub_ps(ix0,jx0);
244             dy00             = _mm_sub_ps(iy0,jy0);
245             dz00             = _mm_sub_ps(iz0,jz0);
246             dx01             = _mm_sub_ps(ix0,jx1);
247             dy01             = _mm_sub_ps(iy0,jy1);
248             dz01             = _mm_sub_ps(iz0,jz1);
249             dx02             = _mm_sub_ps(ix0,jx2);
250             dy02             = _mm_sub_ps(iy0,jy2);
251             dz02             = _mm_sub_ps(iz0,jz2);
252             dx10             = _mm_sub_ps(ix1,jx0);
253             dy10             = _mm_sub_ps(iy1,jy0);
254             dz10             = _mm_sub_ps(iz1,jz0);
255             dx11             = _mm_sub_ps(ix1,jx1);
256             dy11             = _mm_sub_ps(iy1,jy1);
257             dz11             = _mm_sub_ps(iz1,jz1);
258             dx12             = _mm_sub_ps(ix1,jx2);
259             dy12             = _mm_sub_ps(iy1,jy2);
260             dz12             = _mm_sub_ps(iz1,jz2);
261             dx20             = _mm_sub_ps(ix2,jx0);
262             dy20             = _mm_sub_ps(iy2,jy0);
263             dz20             = _mm_sub_ps(iz2,jz0);
264             dx21             = _mm_sub_ps(ix2,jx1);
265             dy21             = _mm_sub_ps(iy2,jy1);
266             dz21             = _mm_sub_ps(iz2,jz1);
267             dx22             = _mm_sub_ps(ix2,jx2);
268             dy22             = _mm_sub_ps(iy2,jy2);
269             dz22             = _mm_sub_ps(iz2,jz2);
270
271             /* Calculate squared distance and things based on it */
272             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
273             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
274             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
275             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
276             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
277             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
278             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
279             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
280             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
281
282             rinv00           = gmx_mm_invsqrt_ps(rsq00);
283             rinv01           = gmx_mm_invsqrt_ps(rsq01);
284             rinv02           = gmx_mm_invsqrt_ps(rsq02);
285             rinv10           = gmx_mm_invsqrt_ps(rsq10);
286             rinv11           = gmx_mm_invsqrt_ps(rsq11);
287             rinv12           = gmx_mm_invsqrt_ps(rsq12);
288             rinv20           = gmx_mm_invsqrt_ps(rsq20);
289             rinv21           = gmx_mm_invsqrt_ps(rsq21);
290             rinv22           = gmx_mm_invsqrt_ps(rsq22);
291
292             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
293             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
294             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
295             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
296             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
297             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
298             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
299             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
300             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
301
302             fjx0             = _mm_setzero_ps();
303             fjy0             = _mm_setzero_ps();
304             fjz0             = _mm_setzero_ps();
305             fjx1             = _mm_setzero_ps();
306             fjy1             = _mm_setzero_ps();
307             fjz1             = _mm_setzero_ps();
308             fjx2             = _mm_setzero_ps();
309             fjy2             = _mm_setzero_ps();
310             fjz2             = _mm_setzero_ps();
311
312             /**************************
313              * CALCULATE INTERACTIONS *
314              **************************/
315
316             if (gmx_mm_any_lt(rsq00,rcutoff2))
317             {
318
319             r00              = _mm_mul_ps(rsq00,rinv00);
320
321             /* EWALD ELECTROSTATICS */
322
323             /* Analytical PME correction */
324             zeta2            = _mm_mul_ps(beta2,rsq00);
325             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
326             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
327             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
328             felec            = _mm_mul_ps(qq00,felec);
329             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
330             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
331             velec            = _mm_mul_ps(qq00,velec);
332
333             /* LENNARD-JONES DISPERSION/REPULSION */
334
335             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
336             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
337             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
338             vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
339                                           _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
340             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
341
342             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
343
344             /* Update potential sum for this i atom from the interaction with this j atom. */
345             velec            = _mm_and_ps(velec,cutoff_mask);
346             velecsum         = _mm_add_ps(velecsum,velec);
347             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
348             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
349
350             fscal            = _mm_add_ps(felec,fvdw);
351
352             fscal            = _mm_and_ps(fscal,cutoff_mask);
353
354              /* Update vectorial force */
355             fix0             = _mm_macc_ps(dx00,fscal,fix0);
356             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
357             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
358
359             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
360             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
361             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
362
363             }
364
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             if (gmx_mm_any_lt(rsq01,rcutoff2))
370             {
371
372             r01              = _mm_mul_ps(rsq01,rinv01);
373
374             /* EWALD ELECTROSTATICS */
375
376             /* Analytical PME correction */
377             zeta2            = _mm_mul_ps(beta2,rsq01);
378             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
379             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
380             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
381             felec            = _mm_mul_ps(qq01,felec);
382             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
383             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
384             velec            = _mm_mul_ps(qq01,velec);
385
386             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
387
388             /* Update potential sum for this i atom from the interaction with this j atom. */
389             velec            = _mm_and_ps(velec,cutoff_mask);
390             velecsum         = _mm_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394             fscal            = _mm_and_ps(fscal,cutoff_mask);
395
396              /* Update vectorial force */
397             fix0             = _mm_macc_ps(dx01,fscal,fix0);
398             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
399             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
400
401             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
402             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
403             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
404
405             }
406
407             /**************************
408              * CALCULATE INTERACTIONS *
409              **************************/
410
411             if (gmx_mm_any_lt(rsq02,rcutoff2))
412             {
413
414             r02              = _mm_mul_ps(rsq02,rinv02);
415
416             /* EWALD ELECTROSTATICS */
417
418             /* Analytical PME correction */
419             zeta2            = _mm_mul_ps(beta2,rsq02);
420             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
421             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
422             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
423             felec            = _mm_mul_ps(qq02,felec);
424             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
425             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
426             velec            = _mm_mul_ps(qq02,velec);
427
428             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
429
430             /* Update potential sum for this i atom from the interaction with this j atom. */
431             velec            = _mm_and_ps(velec,cutoff_mask);
432             velecsum         = _mm_add_ps(velecsum,velec);
433
434             fscal            = felec;
435
436             fscal            = _mm_and_ps(fscal,cutoff_mask);
437
438              /* Update vectorial force */
439             fix0             = _mm_macc_ps(dx02,fscal,fix0);
440             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
441             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
442
443             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
444             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
445             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
446
447             }
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             if (gmx_mm_any_lt(rsq10,rcutoff2))
454             {
455
456             r10              = _mm_mul_ps(rsq10,rinv10);
457
458             /* EWALD ELECTROSTATICS */
459
460             /* Analytical PME correction */
461             zeta2            = _mm_mul_ps(beta2,rsq10);
462             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
463             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
464             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
465             felec            = _mm_mul_ps(qq10,felec);
466             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
467             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
468             velec            = _mm_mul_ps(qq10,velec);
469
470             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
471
472             /* Update potential sum for this i atom from the interaction with this j atom. */
473             velec            = _mm_and_ps(velec,cutoff_mask);
474             velecsum         = _mm_add_ps(velecsum,velec);
475
476             fscal            = felec;
477
478             fscal            = _mm_and_ps(fscal,cutoff_mask);
479
480              /* Update vectorial force */
481             fix1             = _mm_macc_ps(dx10,fscal,fix1);
482             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
483             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
484
485             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
486             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
487             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
488
489             }
490
491             /**************************
492              * CALCULATE INTERACTIONS *
493              **************************/
494
495             if (gmx_mm_any_lt(rsq11,rcutoff2))
496             {
497
498             r11              = _mm_mul_ps(rsq11,rinv11);
499
500             /* EWALD ELECTROSTATICS */
501
502             /* Analytical PME correction */
503             zeta2            = _mm_mul_ps(beta2,rsq11);
504             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
505             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
506             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
507             felec            = _mm_mul_ps(qq11,felec);
508             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
509             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
510             velec            = _mm_mul_ps(qq11,velec);
511
512             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
513
514             /* Update potential sum for this i atom from the interaction with this j atom. */
515             velec            = _mm_and_ps(velec,cutoff_mask);
516             velecsum         = _mm_add_ps(velecsum,velec);
517
518             fscal            = felec;
519
520             fscal            = _mm_and_ps(fscal,cutoff_mask);
521
522              /* Update vectorial force */
523             fix1             = _mm_macc_ps(dx11,fscal,fix1);
524             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
525             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
526
527             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
528             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
529             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
530
531             }
532
533             /**************************
534              * CALCULATE INTERACTIONS *
535              **************************/
536
537             if (gmx_mm_any_lt(rsq12,rcutoff2))
538             {
539
540             r12              = _mm_mul_ps(rsq12,rinv12);
541
542             /* EWALD ELECTROSTATICS */
543
544             /* Analytical PME correction */
545             zeta2            = _mm_mul_ps(beta2,rsq12);
546             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
547             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
548             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
549             felec            = _mm_mul_ps(qq12,felec);
550             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
551             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
552             velec            = _mm_mul_ps(qq12,velec);
553
554             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
555
556             /* Update potential sum for this i atom from the interaction with this j atom. */
557             velec            = _mm_and_ps(velec,cutoff_mask);
558             velecsum         = _mm_add_ps(velecsum,velec);
559
560             fscal            = felec;
561
562             fscal            = _mm_and_ps(fscal,cutoff_mask);
563
564              /* Update vectorial force */
565             fix1             = _mm_macc_ps(dx12,fscal,fix1);
566             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
567             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
568
569             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
570             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
571             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
572
573             }
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             if (gmx_mm_any_lt(rsq20,rcutoff2))
580             {
581
582             r20              = _mm_mul_ps(rsq20,rinv20);
583
584             /* EWALD ELECTROSTATICS */
585
586             /* Analytical PME correction */
587             zeta2            = _mm_mul_ps(beta2,rsq20);
588             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
589             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
590             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
591             felec            = _mm_mul_ps(qq20,felec);
592             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
593             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
594             velec            = _mm_mul_ps(qq20,velec);
595
596             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
597
598             /* Update potential sum for this i atom from the interaction with this j atom. */
599             velec            = _mm_and_ps(velec,cutoff_mask);
600             velecsum         = _mm_add_ps(velecsum,velec);
601
602             fscal            = felec;
603
604             fscal            = _mm_and_ps(fscal,cutoff_mask);
605
606              /* Update vectorial force */
607             fix2             = _mm_macc_ps(dx20,fscal,fix2);
608             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
609             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
610
611             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
612             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
613             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
614
615             }
616
617             /**************************
618              * CALCULATE INTERACTIONS *
619              **************************/
620
621             if (gmx_mm_any_lt(rsq21,rcutoff2))
622             {
623
624             r21              = _mm_mul_ps(rsq21,rinv21);
625
626             /* EWALD ELECTROSTATICS */
627
628             /* Analytical PME correction */
629             zeta2            = _mm_mul_ps(beta2,rsq21);
630             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
631             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
632             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
633             felec            = _mm_mul_ps(qq21,felec);
634             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
635             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
636             velec            = _mm_mul_ps(qq21,velec);
637
638             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
639
640             /* Update potential sum for this i atom from the interaction with this j atom. */
641             velec            = _mm_and_ps(velec,cutoff_mask);
642             velecsum         = _mm_add_ps(velecsum,velec);
643
644             fscal            = felec;
645
646             fscal            = _mm_and_ps(fscal,cutoff_mask);
647
648              /* Update vectorial force */
649             fix2             = _mm_macc_ps(dx21,fscal,fix2);
650             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
651             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
652
653             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
654             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
655             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
656
657             }
658
659             /**************************
660              * CALCULATE INTERACTIONS *
661              **************************/
662
663             if (gmx_mm_any_lt(rsq22,rcutoff2))
664             {
665
666             r22              = _mm_mul_ps(rsq22,rinv22);
667
668             /* EWALD ELECTROSTATICS */
669
670             /* Analytical PME correction */
671             zeta2            = _mm_mul_ps(beta2,rsq22);
672             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
673             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
674             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
675             felec            = _mm_mul_ps(qq22,felec);
676             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
677             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
678             velec            = _mm_mul_ps(qq22,velec);
679
680             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
681
682             /* Update potential sum for this i atom from the interaction with this j atom. */
683             velec            = _mm_and_ps(velec,cutoff_mask);
684             velecsum         = _mm_add_ps(velecsum,velec);
685
686             fscal            = felec;
687
688             fscal            = _mm_and_ps(fscal,cutoff_mask);
689
690              /* Update vectorial force */
691             fix2             = _mm_macc_ps(dx22,fscal,fix2);
692             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
693             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
694
695             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
696             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
697             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
698
699             }
700
701             fjptrA             = f+j_coord_offsetA;
702             fjptrB             = f+j_coord_offsetB;
703             fjptrC             = f+j_coord_offsetC;
704             fjptrD             = f+j_coord_offsetD;
705
706             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
707                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
708
709             /* Inner loop uses 315 flops */
710         }
711
712         if(jidx<j_index_end)
713         {
714
715             /* Get j neighbor index, and coordinate index */
716             jnrlistA         = jjnr[jidx];
717             jnrlistB         = jjnr[jidx+1];
718             jnrlistC         = jjnr[jidx+2];
719             jnrlistD         = jjnr[jidx+3];
720             /* Sign of each element will be negative for non-real atoms.
721              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
722              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
723              */
724             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
725             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
726             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
727             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
728             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
729             j_coord_offsetA  = DIM*jnrA;
730             j_coord_offsetB  = DIM*jnrB;
731             j_coord_offsetC  = DIM*jnrC;
732             j_coord_offsetD  = DIM*jnrD;
733
734             /* load j atom coordinates */
735             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
736                                               x+j_coord_offsetC,x+j_coord_offsetD,
737                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
738
739             /* Calculate displacement vector */
740             dx00             = _mm_sub_ps(ix0,jx0);
741             dy00             = _mm_sub_ps(iy0,jy0);
742             dz00             = _mm_sub_ps(iz0,jz0);
743             dx01             = _mm_sub_ps(ix0,jx1);
744             dy01             = _mm_sub_ps(iy0,jy1);
745             dz01             = _mm_sub_ps(iz0,jz1);
746             dx02             = _mm_sub_ps(ix0,jx2);
747             dy02             = _mm_sub_ps(iy0,jy2);
748             dz02             = _mm_sub_ps(iz0,jz2);
749             dx10             = _mm_sub_ps(ix1,jx0);
750             dy10             = _mm_sub_ps(iy1,jy0);
751             dz10             = _mm_sub_ps(iz1,jz0);
752             dx11             = _mm_sub_ps(ix1,jx1);
753             dy11             = _mm_sub_ps(iy1,jy1);
754             dz11             = _mm_sub_ps(iz1,jz1);
755             dx12             = _mm_sub_ps(ix1,jx2);
756             dy12             = _mm_sub_ps(iy1,jy2);
757             dz12             = _mm_sub_ps(iz1,jz2);
758             dx20             = _mm_sub_ps(ix2,jx0);
759             dy20             = _mm_sub_ps(iy2,jy0);
760             dz20             = _mm_sub_ps(iz2,jz0);
761             dx21             = _mm_sub_ps(ix2,jx1);
762             dy21             = _mm_sub_ps(iy2,jy1);
763             dz21             = _mm_sub_ps(iz2,jz1);
764             dx22             = _mm_sub_ps(ix2,jx2);
765             dy22             = _mm_sub_ps(iy2,jy2);
766             dz22             = _mm_sub_ps(iz2,jz2);
767
768             /* Calculate squared distance and things based on it */
769             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
770             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
771             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
772             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
773             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
774             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
775             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
776             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
777             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
778
779             rinv00           = gmx_mm_invsqrt_ps(rsq00);
780             rinv01           = gmx_mm_invsqrt_ps(rsq01);
781             rinv02           = gmx_mm_invsqrt_ps(rsq02);
782             rinv10           = gmx_mm_invsqrt_ps(rsq10);
783             rinv11           = gmx_mm_invsqrt_ps(rsq11);
784             rinv12           = gmx_mm_invsqrt_ps(rsq12);
785             rinv20           = gmx_mm_invsqrt_ps(rsq20);
786             rinv21           = gmx_mm_invsqrt_ps(rsq21);
787             rinv22           = gmx_mm_invsqrt_ps(rsq22);
788
789             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
790             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
791             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
792             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
793             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
794             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
795             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
796             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
797             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
798
799             fjx0             = _mm_setzero_ps();
800             fjy0             = _mm_setzero_ps();
801             fjz0             = _mm_setzero_ps();
802             fjx1             = _mm_setzero_ps();
803             fjy1             = _mm_setzero_ps();
804             fjz1             = _mm_setzero_ps();
805             fjx2             = _mm_setzero_ps();
806             fjy2             = _mm_setzero_ps();
807             fjz2             = _mm_setzero_ps();
808
809             /**************************
810              * CALCULATE INTERACTIONS *
811              **************************/
812
813             if (gmx_mm_any_lt(rsq00,rcutoff2))
814             {
815
816             r00              = _mm_mul_ps(rsq00,rinv00);
817             r00              = _mm_andnot_ps(dummy_mask,r00);
818
819             /* EWALD ELECTROSTATICS */
820
821             /* Analytical PME correction */
822             zeta2            = _mm_mul_ps(beta2,rsq00);
823             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
824             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
825             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
826             felec            = _mm_mul_ps(qq00,felec);
827             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
828             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
829             velec            = _mm_mul_ps(qq00,velec);
830
831             /* LENNARD-JONES DISPERSION/REPULSION */
832
833             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
834             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
835             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
836             vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
837                                           _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
838             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
839
840             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
841
842             /* Update potential sum for this i atom from the interaction with this j atom. */
843             velec            = _mm_and_ps(velec,cutoff_mask);
844             velec            = _mm_andnot_ps(dummy_mask,velec);
845             velecsum         = _mm_add_ps(velecsum,velec);
846             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
847             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
848             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
849
850             fscal            = _mm_add_ps(felec,fvdw);
851
852             fscal            = _mm_and_ps(fscal,cutoff_mask);
853
854             fscal            = _mm_andnot_ps(dummy_mask,fscal);
855
856              /* Update vectorial force */
857             fix0             = _mm_macc_ps(dx00,fscal,fix0);
858             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
859             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
860
861             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
862             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
863             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
864
865             }
866
867             /**************************
868              * CALCULATE INTERACTIONS *
869              **************************/
870
871             if (gmx_mm_any_lt(rsq01,rcutoff2))
872             {
873
874             r01              = _mm_mul_ps(rsq01,rinv01);
875             r01              = _mm_andnot_ps(dummy_mask,r01);
876
877             /* EWALD ELECTROSTATICS */
878
879             /* Analytical PME correction */
880             zeta2            = _mm_mul_ps(beta2,rsq01);
881             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
882             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
883             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
884             felec            = _mm_mul_ps(qq01,felec);
885             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
886             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
887             velec            = _mm_mul_ps(qq01,velec);
888
889             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
890
891             /* Update potential sum for this i atom from the interaction with this j atom. */
892             velec            = _mm_and_ps(velec,cutoff_mask);
893             velec            = _mm_andnot_ps(dummy_mask,velec);
894             velecsum         = _mm_add_ps(velecsum,velec);
895
896             fscal            = felec;
897
898             fscal            = _mm_and_ps(fscal,cutoff_mask);
899
900             fscal            = _mm_andnot_ps(dummy_mask,fscal);
901
902              /* Update vectorial force */
903             fix0             = _mm_macc_ps(dx01,fscal,fix0);
904             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
905             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
906
907             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
908             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
909             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
910
911             }
912
913             /**************************
914              * CALCULATE INTERACTIONS *
915              **************************/
916
917             if (gmx_mm_any_lt(rsq02,rcutoff2))
918             {
919
920             r02              = _mm_mul_ps(rsq02,rinv02);
921             r02              = _mm_andnot_ps(dummy_mask,r02);
922
923             /* EWALD ELECTROSTATICS */
924
925             /* Analytical PME correction */
926             zeta2            = _mm_mul_ps(beta2,rsq02);
927             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
928             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
929             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
930             felec            = _mm_mul_ps(qq02,felec);
931             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
932             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
933             velec            = _mm_mul_ps(qq02,velec);
934
935             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
936
937             /* Update potential sum for this i atom from the interaction with this j atom. */
938             velec            = _mm_and_ps(velec,cutoff_mask);
939             velec            = _mm_andnot_ps(dummy_mask,velec);
940             velecsum         = _mm_add_ps(velecsum,velec);
941
942             fscal            = felec;
943
944             fscal            = _mm_and_ps(fscal,cutoff_mask);
945
946             fscal            = _mm_andnot_ps(dummy_mask,fscal);
947
948              /* Update vectorial force */
949             fix0             = _mm_macc_ps(dx02,fscal,fix0);
950             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
951             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
952
953             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
954             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
955             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
956
957             }
958
959             /**************************
960              * CALCULATE INTERACTIONS *
961              **************************/
962
963             if (gmx_mm_any_lt(rsq10,rcutoff2))
964             {
965
966             r10              = _mm_mul_ps(rsq10,rinv10);
967             r10              = _mm_andnot_ps(dummy_mask,r10);
968
969             /* EWALD ELECTROSTATICS */
970
971             /* Analytical PME correction */
972             zeta2            = _mm_mul_ps(beta2,rsq10);
973             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
974             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
975             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
976             felec            = _mm_mul_ps(qq10,felec);
977             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
978             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
979             velec            = _mm_mul_ps(qq10,velec);
980
981             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
982
983             /* Update potential sum for this i atom from the interaction with this j atom. */
984             velec            = _mm_and_ps(velec,cutoff_mask);
985             velec            = _mm_andnot_ps(dummy_mask,velec);
986             velecsum         = _mm_add_ps(velecsum,velec);
987
988             fscal            = felec;
989
990             fscal            = _mm_and_ps(fscal,cutoff_mask);
991
992             fscal            = _mm_andnot_ps(dummy_mask,fscal);
993
994              /* Update vectorial force */
995             fix1             = _mm_macc_ps(dx10,fscal,fix1);
996             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
997             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
998
999             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1000             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1001             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1002
1003             }
1004
1005             /**************************
1006              * CALCULATE INTERACTIONS *
1007              **************************/
1008
1009             if (gmx_mm_any_lt(rsq11,rcutoff2))
1010             {
1011
1012             r11              = _mm_mul_ps(rsq11,rinv11);
1013             r11              = _mm_andnot_ps(dummy_mask,r11);
1014
1015             /* EWALD ELECTROSTATICS */
1016
1017             /* Analytical PME correction */
1018             zeta2            = _mm_mul_ps(beta2,rsq11);
1019             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1020             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1021             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1022             felec            = _mm_mul_ps(qq11,felec);
1023             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1024             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
1025             velec            = _mm_mul_ps(qq11,velec);
1026
1027             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1028
1029             /* Update potential sum for this i atom from the interaction with this j atom. */
1030             velec            = _mm_and_ps(velec,cutoff_mask);
1031             velec            = _mm_andnot_ps(dummy_mask,velec);
1032             velecsum         = _mm_add_ps(velecsum,velec);
1033
1034             fscal            = felec;
1035
1036             fscal            = _mm_and_ps(fscal,cutoff_mask);
1037
1038             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1039
1040              /* Update vectorial force */
1041             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1042             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1043             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1044
1045             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1046             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1047             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1048
1049             }
1050
1051             /**************************
1052              * CALCULATE INTERACTIONS *
1053              **************************/
1054
1055             if (gmx_mm_any_lt(rsq12,rcutoff2))
1056             {
1057
1058             r12              = _mm_mul_ps(rsq12,rinv12);
1059             r12              = _mm_andnot_ps(dummy_mask,r12);
1060
1061             /* EWALD ELECTROSTATICS */
1062
1063             /* Analytical PME correction */
1064             zeta2            = _mm_mul_ps(beta2,rsq12);
1065             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1066             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1067             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1068             felec            = _mm_mul_ps(qq12,felec);
1069             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1070             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
1071             velec            = _mm_mul_ps(qq12,velec);
1072
1073             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1074
1075             /* Update potential sum for this i atom from the interaction with this j atom. */
1076             velec            = _mm_and_ps(velec,cutoff_mask);
1077             velec            = _mm_andnot_ps(dummy_mask,velec);
1078             velecsum         = _mm_add_ps(velecsum,velec);
1079
1080             fscal            = felec;
1081
1082             fscal            = _mm_and_ps(fscal,cutoff_mask);
1083
1084             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1085
1086              /* Update vectorial force */
1087             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1088             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1089             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1090
1091             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1092             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1093             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1094
1095             }
1096
1097             /**************************
1098              * CALCULATE INTERACTIONS *
1099              **************************/
1100
1101             if (gmx_mm_any_lt(rsq20,rcutoff2))
1102             {
1103
1104             r20              = _mm_mul_ps(rsq20,rinv20);
1105             r20              = _mm_andnot_ps(dummy_mask,r20);
1106
1107             /* EWALD ELECTROSTATICS */
1108
1109             /* Analytical PME correction */
1110             zeta2            = _mm_mul_ps(beta2,rsq20);
1111             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1112             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1113             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1114             felec            = _mm_mul_ps(qq20,felec);
1115             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1116             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
1117             velec            = _mm_mul_ps(qq20,velec);
1118
1119             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1120
1121             /* Update potential sum for this i atom from the interaction with this j atom. */
1122             velec            = _mm_and_ps(velec,cutoff_mask);
1123             velec            = _mm_andnot_ps(dummy_mask,velec);
1124             velecsum         = _mm_add_ps(velecsum,velec);
1125
1126             fscal            = felec;
1127
1128             fscal            = _mm_and_ps(fscal,cutoff_mask);
1129
1130             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1131
1132              /* Update vectorial force */
1133             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1134             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1135             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1136
1137             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1138             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1139             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1140
1141             }
1142
1143             /**************************
1144              * CALCULATE INTERACTIONS *
1145              **************************/
1146
1147             if (gmx_mm_any_lt(rsq21,rcutoff2))
1148             {
1149
1150             r21              = _mm_mul_ps(rsq21,rinv21);
1151             r21              = _mm_andnot_ps(dummy_mask,r21);
1152
1153             /* EWALD ELECTROSTATICS */
1154
1155             /* Analytical PME correction */
1156             zeta2            = _mm_mul_ps(beta2,rsq21);
1157             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1158             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1159             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1160             felec            = _mm_mul_ps(qq21,felec);
1161             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1162             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
1163             velec            = _mm_mul_ps(qq21,velec);
1164
1165             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1166
1167             /* Update potential sum for this i atom from the interaction with this j atom. */
1168             velec            = _mm_and_ps(velec,cutoff_mask);
1169             velec            = _mm_andnot_ps(dummy_mask,velec);
1170             velecsum         = _mm_add_ps(velecsum,velec);
1171
1172             fscal            = felec;
1173
1174             fscal            = _mm_and_ps(fscal,cutoff_mask);
1175
1176             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1177
1178              /* Update vectorial force */
1179             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1180             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1181             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1182
1183             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1184             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1185             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1186
1187             }
1188
1189             /**************************
1190              * CALCULATE INTERACTIONS *
1191              **************************/
1192
1193             if (gmx_mm_any_lt(rsq22,rcutoff2))
1194             {
1195
1196             r22              = _mm_mul_ps(rsq22,rinv22);
1197             r22              = _mm_andnot_ps(dummy_mask,r22);
1198
1199             /* EWALD ELECTROSTATICS */
1200
1201             /* Analytical PME correction */
1202             zeta2            = _mm_mul_ps(beta2,rsq22);
1203             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1204             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1205             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1206             felec            = _mm_mul_ps(qq22,felec);
1207             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1208             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
1209             velec            = _mm_mul_ps(qq22,velec);
1210
1211             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1212
1213             /* Update potential sum for this i atom from the interaction with this j atom. */
1214             velec            = _mm_and_ps(velec,cutoff_mask);
1215             velec            = _mm_andnot_ps(dummy_mask,velec);
1216             velecsum         = _mm_add_ps(velecsum,velec);
1217
1218             fscal            = felec;
1219
1220             fscal            = _mm_and_ps(fscal,cutoff_mask);
1221
1222             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1223
1224              /* Update vectorial force */
1225             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1226             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1227             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1228
1229             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1230             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1231             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1232
1233             }
1234
1235             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1236             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1237             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1238             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1239
1240             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1241                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1242
1243             /* Inner loop uses 324 flops */
1244         }
1245
1246         /* End of innermost loop */
1247
1248         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1249                                               f+i_coord_offset,fshift+i_shift_offset);
1250
1251         ggid                        = gid[iidx];
1252         /* Update potential energies */
1253         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1254         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1255
1256         /* Increment number of inner iterations */
1257         inneriter                  += j_index_end - j_index_start;
1258
1259         /* Outer loop uses 20 flops */
1260     }
1261
1262     /* Increment number of outer iterations */
1263     outeriter        += nri;
1264
1265     /* Update outer/inner flops */
1266
1267     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*324);
1268 }
1269 /*
1270  * GROMACS nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single
1271  * Electrostatics interaction: Ewald
1272  * VdW interaction:            LennardJones
1273  * Geometry:                   Water3-Water3
1274  * Calculate force/pot:        Force
1275  */
1276 void
1277 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single
1278                     (t_nblist                    * gmx_restrict       nlist,
1279                      rvec                        * gmx_restrict          xx,
1280                      rvec                        * gmx_restrict          ff,
1281                      t_forcerec                  * gmx_restrict          fr,
1282                      t_mdatoms                   * gmx_restrict     mdatoms,
1283                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1284                      t_nrnb                      * gmx_restrict        nrnb)
1285 {
1286     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1287      * just 0 for non-waters.
1288      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1289      * jnr indices corresponding to data put in the four positions in the SIMD register.
1290      */
1291     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1292     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1293     int              jnrA,jnrB,jnrC,jnrD;
1294     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1295     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1296     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1297     real             rcutoff_scalar;
1298     real             *shiftvec,*fshift,*x,*f;
1299     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1300     real             scratch[4*DIM];
1301     __m128           fscal,rcutoff,rcutoff2,jidxall;
1302     int              vdwioffset0;
1303     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1304     int              vdwioffset1;
1305     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1306     int              vdwioffset2;
1307     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1308     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1309     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1310     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1311     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1312     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1313     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1314     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1315     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1316     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1317     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1318     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1319     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1320     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1321     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1322     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1323     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1324     real             *charge;
1325     int              nvdwtype;
1326     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1327     int              *vdwtype;
1328     real             *vdwparam;
1329     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1330     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1331     __m128i          ewitab;
1332     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1333     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1334     real             *ewtab;
1335     __m128           dummy_mask,cutoff_mask;
1336     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1337     __m128           one     = _mm_set1_ps(1.0);
1338     __m128           two     = _mm_set1_ps(2.0);
1339     x                = xx[0];
1340     f                = ff[0];
1341
1342     nri              = nlist->nri;
1343     iinr             = nlist->iinr;
1344     jindex           = nlist->jindex;
1345     jjnr             = nlist->jjnr;
1346     shiftidx         = nlist->shift;
1347     gid              = nlist->gid;
1348     shiftvec         = fr->shift_vec[0];
1349     fshift           = fr->fshift[0];
1350     facel            = _mm_set1_ps(fr->epsfac);
1351     charge           = mdatoms->chargeA;
1352     nvdwtype         = fr->ntype;
1353     vdwparam         = fr->nbfp;
1354     vdwtype          = mdatoms->typeA;
1355
1356     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1357     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1358     beta2            = _mm_mul_ps(beta,beta);
1359     beta3            = _mm_mul_ps(beta,beta2);
1360     ewtab            = fr->ic->tabq_coul_F;
1361     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1362     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1363
1364     /* Setup water-specific parameters */
1365     inr              = nlist->iinr[0];
1366     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1367     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1368     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1369     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1370
1371     jq0              = _mm_set1_ps(charge[inr+0]);
1372     jq1              = _mm_set1_ps(charge[inr+1]);
1373     jq2              = _mm_set1_ps(charge[inr+2]);
1374     vdwjidx0A        = 2*vdwtype[inr+0];
1375     qq00             = _mm_mul_ps(iq0,jq0);
1376     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1377     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1378     qq01             = _mm_mul_ps(iq0,jq1);
1379     qq02             = _mm_mul_ps(iq0,jq2);
1380     qq10             = _mm_mul_ps(iq1,jq0);
1381     qq11             = _mm_mul_ps(iq1,jq1);
1382     qq12             = _mm_mul_ps(iq1,jq2);
1383     qq20             = _mm_mul_ps(iq2,jq0);
1384     qq21             = _mm_mul_ps(iq2,jq1);
1385     qq22             = _mm_mul_ps(iq2,jq2);
1386
1387     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1388     rcutoff_scalar   = fr->rcoulomb;
1389     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1390     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1391
1392     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1393     rvdw             = _mm_set1_ps(fr->rvdw);
1394
1395     /* Avoid stupid compiler warnings */
1396     jnrA = jnrB = jnrC = jnrD = 0;
1397     j_coord_offsetA = 0;
1398     j_coord_offsetB = 0;
1399     j_coord_offsetC = 0;
1400     j_coord_offsetD = 0;
1401
1402     outeriter        = 0;
1403     inneriter        = 0;
1404
1405     for(iidx=0;iidx<4*DIM;iidx++)
1406     {
1407         scratch[iidx] = 0.0;
1408     }
1409
1410     /* Start outer loop over neighborlists */
1411     for(iidx=0; iidx<nri; iidx++)
1412     {
1413         /* Load shift vector for this list */
1414         i_shift_offset   = DIM*shiftidx[iidx];
1415
1416         /* Load limits for loop over neighbors */
1417         j_index_start    = jindex[iidx];
1418         j_index_end      = jindex[iidx+1];
1419
1420         /* Get outer coordinate index */
1421         inr              = iinr[iidx];
1422         i_coord_offset   = DIM*inr;
1423
1424         /* Load i particle coords and add shift vector */
1425         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1426                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1427
1428         fix0             = _mm_setzero_ps();
1429         fiy0             = _mm_setzero_ps();
1430         fiz0             = _mm_setzero_ps();
1431         fix1             = _mm_setzero_ps();
1432         fiy1             = _mm_setzero_ps();
1433         fiz1             = _mm_setzero_ps();
1434         fix2             = _mm_setzero_ps();
1435         fiy2             = _mm_setzero_ps();
1436         fiz2             = _mm_setzero_ps();
1437
1438         /* Start inner kernel loop */
1439         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1440         {
1441
1442             /* Get j neighbor index, and coordinate index */
1443             jnrA             = jjnr[jidx];
1444             jnrB             = jjnr[jidx+1];
1445             jnrC             = jjnr[jidx+2];
1446             jnrD             = jjnr[jidx+3];
1447             j_coord_offsetA  = DIM*jnrA;
1448             j_coord_offsetB  = DIM*jnrB;
1449             j_coord_offsetC  = DIM*jnrC;
1450             j_coord_offsetD  = DIM*jnrD;
1451
1452             /* load j atom coordinates */
1453             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1454                                               x+j_coord_offsetC,x+j_coord_offsetD,
1455                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1456
1457             /* Calculate displacement vector */
1458             dx00             = _mm_sub_ps(ix0,jx0);
1459             dy00             = _mm_sub_ps(iy0,jy0);
1460             dz00             = _mm_sub_ps(iz0,jz0);
1461             dx01             = _mm_sub_ps(ix0,jx1);
1462             dy01             = _mm_sub_ps(iy0,jy1);
1463             dz01             = _mm_sub_ps(iz0,jz1);
1464             dx02             = _mm_sub_ps(ix0,jx2);
1465             dy02             = _mm_sub_ps(iy0,jy2);
1466             dz02             = _mm_sub_ps(iz0,jz2);
1467             dx10             = _mm_sub_ps(ix1,jx0);
1468             dy10             = _mm_sub_ps(iy1,jy0);
1469             dz10             = _mm_sub_ps(iz1,jz0);
1470             dx11             = _mm_sub_ps(ix1,jx1);
1471             dy11             = _mm_sub_ps(iy1,jy1);
1472             dz11             = _mm_sub_ps(iz1,jz1);
1473             dx12             = _mm_sub_ps(ix1,jx2);
1474             dy12             = _mm_sub_ps(iy1,jy2);
1475             dz12             = _mm_sub_ps(iz1,jz2);
1476             dx20             = _mm_sub_ps(ix2,jx0);
1477             dy20             = _mm_sub_ps(iy2,jy0);
1478             dz20             = _mm_sub_ps(iz2,jz0);
1479             dx21             = _mm_sub_ps(ix2,jx1);
1480             dy21             = _mm_sub_ps(iy2,jy1);
1481             dz21             = _mm_sub_ps(iz2,jz1);
1482             dx22             = _mm_sub_ps(ix2,jx2);
1483             dy22             = _mm_sub_ps(iy2,jy2);
1484             dz22             = _mm_sub_ps(iz2,jz2);
1485
1486             /* Calculate squared distance and things based on it */
1487             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1488             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1489             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1490             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1491             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1492             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1493             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1494             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1495             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1496
1497             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1498             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1499             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1500             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1501             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1502             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1503             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1504             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1505             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1506
1507             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1508             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1509             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1510             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1511             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1512             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1513             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1514             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1515             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1516
1517             fjx0             = _mm_setzero_ps();
1518             fjy0             = _mm_setzero_ps();
1519             fjz0             = _mm_setzero_ps();
1520             fjx1             = _mm_setzero_ps();
1521             fjy1             = _mm_setzero_ps();
1522             fjz1             = _mm_setzero_ps();
1523             fjx2             = _mm_setzero_ps();
1524             fjy2             = _mm_setzero_ps();
1525             fjz2             = _mm_setzero_ps();
1526
1527             /**************************
1528              * CALCULATE INTERACTIONS *
1529              **************************/
1530
1531             if (gmx_mm_any_lt(rsq00,rcutoff2))
1532             {
1533
1534             r00              = _mm_mul_ps(rsq00,rinv00);
1535
1536             /* EWALD ELECTROSTATICS */
1537
1538             /* Analytical PME correction */
1539             zeta2            = _mm_mul_ps(beta2,rsq00);
1540             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1541             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1542             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1543             felec            = _mm_mul_ps(qq00,felec);
1544
1545             /* LENNARD-JONES DISPERSION/REPULSION */
1546
1547             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1548             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1549
1550             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1551
1552             fscal            = _mm_add_ps(felec,fvdw);
1553
1554             fscal            = _mm_and_ps(fscal,cutoff_mask);
1555
1556              /* Update vectorial force */
1557             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1558             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1559             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1560
1561             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1562             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1563             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1564
1565             }
1566
1567             /**************************
1568              * CALCULATE INTERACTIONS *
1569              **************************/
1570
1571             if (gmx_mm_any_lt(rsq01,rcutoff2))
1572             {
1573
1574             r01              = _mm_mul_ps(rsq01,rinv01);
1575
1576             /* EWALD ELECTROSTATICS */
1577
1578             /* Analytical PME correction */
1579             zeta2            = _mm_mul_ps(beta2,rsq01);
1580             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1581             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1582             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1583             felec            = _mm_mul_ps(qq01,felec);
1584
1585             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1586
1587             fscal            = felec;
1588
1589             fscal            = _mm_and_ps(fscal,cutoff_mask);
1590
1591              /* Update vectorial force */
1592             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1593             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1594             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1595
1596             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1597             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1598             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1599
1600             }
1601
1602             /**************************
1603              * CALCULATE INTERACTIONS *
1604              **************************/
1605
1606             if (gmx_mm_any_lt(rsq02,rcutoff2))
1607             {
1608
1609             r02              = _mm_mul_ps(rsq02,rinv02);
1610
1611             /* EWALD ELECTROSTATICS */
1612
1613             /* Analytical PME correction */
1614             zeta2            = _mm_mul_ps(beta2,rsq02);
1615             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1616             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1617             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1618             felec            = _mm_mul_ps(qq02,felec);
1619
1620             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1621
1622             fscal            = felec;
1623
1624             fscal            = _mm_and_ps(fscal,cutoff_mask);
1625
1626              /* Update vectorial force */
1627             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1628             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1629             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1630
1631             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1632             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1633             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1634
1635             }
1636
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             if (gmx_mm_any_lt(rsq10,rcutoff2))
1642             {
1643
1644             r10              = _mm_mul_ps(rsq10,rinv10);
1645
1646             /* EWALD ELECTROSTATICS */
1647
1648             /* Analytical PME correction */
1649             zeta2            = _mm_mul_ps(beta2,rsq10);
1650             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1651             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1652             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1653             felec            = _mm_mul_ps(qq10,felec);
1654
1655             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1656
1657             fscal            = felec;
1658
1659             fscal            = _mm_and_ps(fscal,cutoff_mask);
1660
1661              /* Update vectorial force */
1662             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1663             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1664             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1665
1666             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1667             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1668             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1669
1670             }
1671
1672             /**************************
1673              * CALCULATE INTERACTIONS *
1674              **************************/
1675
1676             if (gmx_mm_any_lt(rsq11,rcutoff2))
1677             {
1678
1679             r11              = _mm_mul_ps(rsq11,rinv11);
1680
1681             /* EWALD ELECTROSTATICS */
1682
1683             /* Analytical PME correction */
1684             zeta2            = _mm_mul_ps(beta2,rsq11);
1685             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1686             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1687             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1688             felec            = _mm_mul_ps(qq11,felec);
1689
1690             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1691
1692             fscal            = felec;
1693
1694             fscal            = _mm_and_ps(fscal,cutoff_mask);
1695
1696              /* Update vectorial force */
1697             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1698             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1699             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1700
1701             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1702             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1703             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1704
1705             }
1706
1707             /**************************
1708              * CALCULATE INTERACTIONS *
1709              **************************/
1710
1711             if (gmx_mm_any_lt(rsq12,rcutoff2))
1712             {
1713
1714             r12              = _mm_mul_ps(rsq12,rinv12);
1715
1716             /* EWALD ELECTROSTATICS */
1717
1718             /* Analytical PME correction */
1719             zeta2            = _mm_mul_ps(beta2,rsq12);
1720             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1721             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1722             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1723             felec            = _mm_mul_ps(qq12,felec);
1724
1725             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1726
1727             fscal            = felec;
1728
1729             fscal            = _mm_and_ps(fscal,cutoff_mask);
1730
1731              /* Update vectorial force */
1732             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1733             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1734             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1735
1736             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1737             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1738             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1739
1740             }
1741
1742             /**************************
1743              * CALCULATE INTERACTIONS *
1744              **************************/
1745
1746             if (gmx_mm_any_lt(rsq20,rcutoff2))
1747             {
1748
1749             r20              = _mm_mul_ps(rsq20,rinv20);
1750
1751             /* EWALD ELECTROSTATICS */
1752
1753             /* Analytical PME correction */
1754             zeta2            = _mm_mul_ps(beta2,rsq20);
1755             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1756             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1757             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1758             felec            = _mm_mul_ps(qq20,felec);
1759
1760             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1761
1762             fscal            = felec;
1763
1764             fscal            = _mm_and_ps(fscal,cutoff_mask);
1765
1766              /* Update vectorial force */
1767             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1768             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1769             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1770
1771             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1772             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1773             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1774
1775             }
1776
1777             /**************************
1778              * CALCULATE INTERACTIONS *
1779              **************************/
1780
1781             if (gmx_mm_any_lt(rsq21,rcutoff2))
1782             {
1783
1784             r21              = _mm_mul_ps(rsq21,rinv21);
1785
1786             /* EWALD ELECTROSTATICS */
1787
1788             /* Analytical PME correction */
1789             zeta2            = _mm_mul_ps(beta2,rsq21);
1790             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1791             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1792             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1793             felec            = _mm_mul_ps(qq21,felec);
1794
1795             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1796
1797             fscal            = felec;
1798
1799             fscal            = _mm_and_ps(fscal,cutoff_mask);
1800
1801              /* Update vectorial force */
1802             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1803             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1804             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1805
1806             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1807             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1808             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1809
1810             }
1811
1812             /**************************
1813              * CALCULATE INTERACTIONS *
1814              **************************/
1815
1816             if (gmx_mm_any_lt(rsq22,rcutoff2))
1817             {
1818
1819             r22              = _mm_mul_ps(rsq22,rinv22);
1820
1821             /* EWALD ELECTROSTATICS */
1822
1823             /* Analytical PME correction */
1824             zeta2            = _mm_mul_ps(beta2,rsq22);
1825             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1826             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1827             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1828             felec            = _mm_mul_ps(qq22,felec);
1829
1830             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1831
1832             fscal            = felec;
1833
1834             fscal            = _mm_and_ps(fscal,cutoff_mask);
1835
1836              /* Update vectorial force */
1837             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1838             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1839             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1840
1841             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1842             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1843             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1844
1845             }
1846
1847             fjptrA             = f+j_coord_offsetA;
1848             fjptrB             = f+j_coord_offsetB;
1849             fjptrC             = f+j_coord_offsetC;
1850             fjptrD             = f+j_coord_offsetD;
1851
1852             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1853                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1854
1855             /* Inner loop uses 286 flops */
1856         }
1857
1858         if(jidx<j_index_end)
1859         {
1860
1861             /* Get j neighbor index, and coordinate index */
1862             jnrlistA         = jjnr[jidx];
1863             jnrlistB         = jjnr[jidx+1];
1864             jnrlistC         = jjnr[jidx+2];
1865             jnrlistD         = jjnr[jidx+3];
1866             /* Sign of each element will be negative for non-real atoms.
1867              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1868              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1869              */
1870             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1871             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1872             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1873             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1874             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1875             j_coord_offsetA  = DIM*jnrA;
1876             j_coord_offsetB  = DIM*jnrB;
1877             j_coord_offsetC  = DIM*jnrC;
1878             j_coord_offsetD  = DIM*jnrD;
1879
1880             /* load j atom coordinates */
1881             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1882                                               x+j_coord_offsetC,x+j_coord_offsetD,
1883                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1884
1885             /* Calculate displacement vector */
1886             dx00             = _mm_sub_ps(ix0,jx0);
1887             dy00             = _mm_sub_ps(iy0,jy0);
1888             dz00             = _mm_sub_ps(iz0,jz0);
1889             dx01             = _mm_sub_ps(ix0,jx1);
1890             dy01             = _mm_sub_ps(iy0,jy1);
1891             dz01             = _mm_sub_ps(iz0,jz1);
1892             dx02             = _mm_sub_ps(ix0,jx2);
1893             dy02             = _mm_sub_ps(iy0,jy2);
1894             dz02             = _mm_sub_ps(iz0,jz2);
1895             dx10             = _mm_sub_ps(ix1,jx0);
1896             dy10             = _mm_sub_ps(iy1,jy0);
1897             dz10             = _mm_sub_ps(iz1,jz0);
1898             dx11             = _mm_sub_ps(ix1,jx1);
1899             dy11             = _mm_sub_ps(iy1,jy1);
1900             dz11             = _mm_sub_ps(iz1,jz1);
1901             dx12             = _mm_sub_ps(ix1,jx2);
1902             dy12             = _mm_sub_ps(iy1,jy2);
1903             dz12             = _mm_sub_ps(iz1,jz2);
1904             dx20             = _mm_sub_ps(ix2,jx0);
1905             dy20             = _mm_sub_ps(iy2,jy0);
1906             dz20             = _mm_sub_ps(iz2,jz0);
1907             dx21             = _mm_sub_ps(ix2,jx1);
1908             dy21             = _mm_sub_ps(iy2,jy1);
1909             dz21             = _mm_sub_ps(iz2,jz1);
1910             dx22             = _mm_sub_ps(ix2,jx2);
1911             dy22             = _mm_sub_ps(iy2,jy2);
1912             dz22             = _mm_sub_ps(iz2,jz2);
1913
1914             /* Calculate squared distance and things based on it */
1915             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1916             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1917             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1918             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1919             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1920             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1921             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1922             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1923             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1924
1925             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1926             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1927             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1928             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1929             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1930             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1931             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1932             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1933             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1934
1935             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1936             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1937             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1938             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1939             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1940             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1941             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1942             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1943             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1944
1945             fjx0             = _mm_setzero_ps();
1946             fjy0             = _mm_setzero_ps();
1947             fjz0             = _mm_setzero_ps();
1948             fjx1             = _mm_setzero_ps();
1949             fjy1             = _mm_setzero_ps();
1950             fjz1             = _mm_setzero_ps();
1951             fjx2             = _mm_setzero_ps();
1952             fjy2             = _mm_setzero_ps();
1953             fjz2             = _mm_setzero_ps();
1954
1955             /**************************
1956              * CALCULATE INTERACTIONS *
1957              **************************/
1958
1959             if (gmx_mm_any_lt(rsq00,rcutoff2))
1960             {
1961
1962             r00              = _mm_mul_ps(rsq00,rinv00);
1963             r00              = _mm_andnot_ps(dummy_mask,r00);
1964
1965             /* EWALD ELECTROSTATICS */
1966
1967             /* Analytical PME correction */
1968             zeta2            = _mm_mul_ps(beta2,rsq00);
1969             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1970             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1971             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1972             felec            = _mm_mul_ps(qq00,felec);
1973
1974             /* LENNARD-JONES DISPERSION/REPULSION */
1975
1976             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1977             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1978
1979             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1980
1981             fscal            = _mm_add_ps(felec,fvdw);
1982
1983             fscal            = _mm_and_ps(fscal,cutoff_mask);
1984
1985             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1986
1987              /* Update vectorial force */
1988             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1989             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1990             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1991
1992             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1993             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1994             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1995
1996             }
1997
1998             /**************************
1999              * CALCULATE INTERACTIONS *
2000              **************************/
2001
2002             if (gmx_mm_any_lt(rsq01,rcutoff2))
2003             {
2004
2005             r01              = _mm_mul_ps(rsq01,rinv01);
2006             r01              = _mm_andnot_ps(dummy_mask,r01);
2007
2008             /* EWALD ELECTROSTATICS */
2009
2010             /* Analytical PME correction */
2011             zeta2            = _mm_mul_ps(beta2,rsq01);
2012             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
2013             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2014             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2015             felec            = _mm_mul_ps(qq01,felec);
2016
2017             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2018
2019             fscal            = felec;
2020
2021             fscal            = _mm_and_ps(fscal,cutoff_mask);
2022
2023             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2024
2025              /* Update vectorial force */
2026             fix0             = _mm_macc_ps(dx01,fscal,fix0);
2027             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
2028             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
2029
2030             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
2031             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
2032             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
2033
2034             }
2035
2036             /**************************
2037              * CALCULATE INTERACTIONS *
2038              **************************/
2039
2040             if (gmx_mm_any_lt(rsq02,rcutoff2))
2041             {
2042
2043             r02              = _mm_mul_ps(rsq02,rinv02);
2044             r02              = _mm_andnot_ps(dummy_mask,r02);
2045
2046             /* EWALD ELECTROSTATICS */
2047
2048             /* Analytical PME correction */
2049             zeta2            = _mm_mul_ps(beta2,rsq02);
2050             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
2051             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2052             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2053             felec            = _mm_mul_ps(qq02,felec);
2054
2055             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2056
2057             fscal            = felec;
2058
2059             fscal            = _mm_and_ps(fscal,cutoff_mask);
2060
2061             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2062
2063              /* Update vectorial force */
2064             fix0             = _mm_macc_ps(dx02,fscal,fix0);
2065             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
2066             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
2067
2068             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
2069             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
2070             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
2071
2072             }
2073
2074             /**************************
2075              * CALCULATE INTERACTIONS *
2076              **************************/
2077
2078             if (gmx_mm_any_lt(rsq10,rcutoff2))
2079             {
2080
2081             r10              = _mm_mul_ps(rsq10,rinv10);
2082             r10              = _mm_andnot_ps(dummy_mask,r10);
2083
2084             /* EWALD ELECTROSTATICS */
2085
2086             /* Analytical PME correction */
2087             zeta2            = _mm_mul_ps(beta2,rsq10);
2088             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
2089             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2090             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2091             felec            = _mm_mul_ps(qq10,felec);
2092
2093             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2094
2095             fscal            = felec;
2096
2097             fscal            = _mm_and_ps(fscal,cutoff_mask);
2098
2099             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2100
2101              /* Update vectorial force */
2102             fix1             = _mm_macc_ps(dx10,fscal,fix1);
2103             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
2104             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
2105
2106             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
2107             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
2108             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
2109
2110             }
2111
2112             /**************************
2113              * CALCULATE INTERACTIONS *
2114              **************************/
2115
2116             if (gmx_mm_any_lt(rsq11,rcutoff2))
2117             {
2118
2119             r11              = _mm_mul_ps(rsq11,rinv11);
2120             r11              = _mm_andnot_ps(dummy_mask,r11);
2121
2122             /* EWALD ELECTROSTATICS */
2123
2124             /* Analytical PME correction */
2125             zeta2            = _mm_mul_ps(beta2,rsq11);
2126             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
2127             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2128             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2129             felec            = _mm_mul_ps(qq11,felec);
2130
2131             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2132
2133             fscal            = felec;
2134
2135             fscal            = _mm_and_ps(fscal,cutoff_mask);
2136
2137             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2138
2139              /* Update vectorial force */
2140             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2141             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2142             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2143
2144             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2145             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2146             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2147
2148             }
2149
2150             /**************************
2151              * CALCULATE INTERACTIONS *
2152              **************************/
2153
2154             if (gmx_mm_any_lt(rsq12,rcutoff2))
2155             {
2156
2157             r12              = _mm_mul_ps(rsq12,rinv12);
2158             r12              = _mm_andnot_ps(dummy_mask,r12);
2159
2160             /* EWALD ELECTROSTATICS */
2161
2162             /* Analytical PME correction */
2163             zeta2            = _mm_mul_ps(beta2,rsq12);
2164             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2165             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2166             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2167             felec            = _mm_mul_ps(qq12,felec);
2168
2169             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2170
2171             fscal            = felec;
2172
2173             fscal            = _mm_and_ps(fscal,cutoff_mask);
2174
2175             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2176
2177              /* Update vectorial force */
2178             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2179             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2180             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2181
2182             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2183             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2184             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2185
2186             }
2187
2188             /**************************
2189              * CALCULATE INTERACTIONS *
2190              **************************/
2191
2192             if (gmx_mm_any_lt(rsq20,rcutoff2))
2193             {
2194
2195             r20              = _mm_mul_ps(rsq20,rinv20);
2196             r20              = _mm_andnot_ps(dummy_mask,r20);
2197
2198             /* EWALD ELECTROSTATICS */
2199
2200             /* Analytical PME correction */
2201             zeta2            = _mm_mul_ps(beta2,rsq20);
2202             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2203             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2204             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2205             felec            = _mm_mul_ps(qq20,felec);
2206
2207             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2208
2209             fscal            = felec;
2210
2211             fscal            = _mm_and_ps(fscal,cutoff_mask);
2212
2213             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2214
2215              /* Update vectorial force */
2216             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2217             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2218             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2219
2220             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2221             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2222             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2223
2224             }
2225
2226             /**************************
2227              * CALCULATE INTERACTIONS *
2228              **************************/
2229
2230             if (gmx_mm_any_lt(rsq21,rcutoff2))
2231             {
2232
2233             r21              = _mm_mul_ps(rsq21,rinv21);
2234             r21              = _mm_andnot_ps(dummy_mask,r21);
2235
2236             /* EWALD ELECTROSTATICS */
2237
2238             /* Analytical PME correction */
2239             zeta2            = _mm_mul_ps(beta2,rsq21);
2240             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2241             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2242             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2243             felec            = _mm_mul_ps(qq21,felec);
2244
2245             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2246
2247             fscal            = felec;
2248
2249             fscal            = _mm_and_ps(fscal,cutoff_mask);
2250
2251             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2252
2253              /* Update vectorial force */
2254             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2255             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2256             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2257
2258             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2259             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2260             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2261
2262             }
2263
2264             /**************************
2265              * CALCULATE INTERACTIONS *
2266              **************************/
2267
2268             if (gmx_mm_any_lt(rsq22,rcutoff2))
2269             {
2270
2271             r22              = _mm_mul_ps(rsq22,rinv22);
2272             r22              = _mm_andnot_ps(dummy_mask,r22);
2273
2274             /* EWALD ELECTROSTATICS */
2275
2276             /* Analytical PME correction */
2277             zeta2            = _mm_mul_ps(beta2,rsq22);
2278             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2279             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2280             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2281             felec            = _mm_mul_ps(qq22,felec);
2282
2283             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2284
2285             fscal            = felec;
2286
2287             fscal            = _mm_and_ps(fscal,cutoff_mask);
2288
2289             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2290
2291              /* Update vectorial force */
2292             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2293             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2294             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2295
2296             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2297             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2298             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2299
2300             }
2301
2302             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2303             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2304             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2305             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2306
2307             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2308                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2309
2310             /* Inner loop uses 295 flops */
2311         }
2312
2313         /* End of innermost loop */
2314
2315         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2316                                               f+i_coord_offset,fshift+i_shift_offset);
2317
2318         /* Increment number of inner iterations */
2319         inneriter                  += j_index_end - j_index_start;
2320
2321         /* Outer loop uses 18 flops */
2322     }
2323
2324     /* Increment number of outer iterations */
2325     outeriter        += nri;
2326
2327     /* Update outer/inner flops */
2328
2329     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*295);
2330 }