6e1f63508250d5d2fa2ebcb8caa84fb1bbfa4162
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwLJ_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
116     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
117     __m128i          ewitab;
118     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
120     real             *ewtab;
121     __m128           dummy_mask,cutoff_mask;
122     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
123     __m128           one     = _mm_set1_ps(1.0);
124     __m128           two     = _mm_set1_ps(2.0);
125     x                = xx[0];
126     f                = ff[0];
127
128     nri              = nlist->nri;
129     iinr             = nlist->iinr;
130     jindex           = nlist->jindex;
131     jjnr             = nlist->jjnr;
132     shiftidx         = nlist->shift;
133     gid              = nlist->gid;
134     shiftvec         = fr->shift_vec[0];
135     fshift           = fr->fshift[0];
136     facel            = _mm_set1_ps(fr->epsfac);
137     charge           = mdatoms->chargeA;
138     nvdwtype         = fr->ntype;
139     vdwparam         = fr->nbfp;
140     vdwtype          = mdatoms->typeA;
141
142     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
143     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
144     beta2            = _mm_mul_ps(beta,beta);
145     beta3            = _mm_mul_ps(beta,beta2);
146     ewtab            = fr->ic->tabq_coul_FDV0;
147     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
148     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
149
150     /* Setup water-specific parameters */
151     inr              = nlist->iinr[0];
152     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
153     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
154     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
155     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
156
157     jq1              = _mm_set1_ps(charge[inr+1]);
158     jq2              = _mm_set1_ps(charge[inr+2]);
159     jq3              = _mm_set1_ps(charge[inr+3]);
160     vdwjidx0A        = 2*vdwtype[inr+0];
161     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
162     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
163     qq11             = _mm_mul_ps(iq1,jq1);
164     qq12             = _mm_mul_ps(iq1,jq2);
165     qq13             = _mm_mul_ps(iq1,jq3);
166     qq21             = _mm_mul_ps(iq2,jq1);
167     qq22             = _mm_mul_ps(iq2,jq2);
168     qq23             = _mm_mul_ps(iq2,jq3);
169     qq31             = _mm_mul_ps(iq3,jq1);
170     qq32             = _mm_mul_ps(iq3,jq2);
171     qq33             = _mm_mul_ps(iq3,jq3);
172
173     /* Avoid stupid compiler warnings */
174     jnrA = jnrB = jnrC = jnrD = 0;
175     j_coord_offsetA = 0;
176     j_coord_offsetB = 0;
177     j_coord_offsetC = 0;
178     j_coord_offsetD = 0;
179
180     outeriter        = 0;
181     inneriter        = 0;
182
183     for(iidx=0;iidx<4*DIM;iidx++)
184     {
185         scratch[iidx] = 0.0;
186     }
187
188     /* Start outer loop over neighborlists */
189     for(iidx=0; iidx<nri; iidx++)
190     {
191         /* Load shift vector for this list */
192         i_shift_offset   = DIM*shiftidx[iidx];
193
194         /* Load limits for loop over neighbors */
195         j_index_start    = jindex[iidx];
196         j_index_end      = jindex[iidx+1];
197
198         /* Get outer coordinate index */
199         inr              = iinr[iidx];
200         i_coord_offset   = DIM*inr;
201
202         /* Load i particle coords and add shift vector */
203         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
205
206         fix0             = _mm_setzero_ps();
207         fiy0             = _mm_setzero_ps();
208         fiz0             = _mm_setzero_ps();
209         fix1             = _mm_setzero_ps();
210         fiy1             = _mm_setzero_ps();
211         fiz1             = _mm_setzero_ps();
212         fix2             = _mm_setzero_ps();
213         fiy2             = _mm_setzero_ps();
214         fiz2             = _mm_setzero_ps();
215         fix3             = _mm_setzero_ps();
216         fiy3             = _mm_setzero_ps();
217         fiz3             = _mm_setzero_ps();
218
219         /* Reset potential sums */
220         velecsum         = _mm_setzero_ps();
221         vvdwsum          = _mm_setzero_ps();
222
223         /* Start inner kernel loop */
224         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
225         {
226
227             /* Get j neighbor index, and coordinate index */
228             jnrA             = jjnr[jidx];
229             jnrB             = jjnr[jidx+1];
230             jnrC             = jjnr[jidx+2];
231             jnrD             = jjnr[jidx+3];
232             j_coord_offsetA  = DIM*jnrA;
233             j_coord_offsetB  = DIM*jnrB;
234             j_coord_offsetC  = DIM*jnrC;
235             j_coord_offsetD  = DIM*jnrD;
236
237             /* load j atom coordinates */
238             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
239                                               x+j_coord_offsetC,x+j_coord_offsetD,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241                                               &jy2,&jz2,&jx3,&jy3,&jz3);
242
243             /* Calculate displacement vector */
244             dx00             = _mm_sub_ps(ix0,jx0);
245             dy00             = _mm_sub_ps(iy0,jy0);
246             dz00             = _mm_sub_ps(iz0,jz0);
247             dx11             = _mm_sub_ps(ix1,jx1);
248             dy11             = _mm_sub_ps(iy1,jy1);
249             dz11             = _mm_sub_ps(iz1,jz1);
250             dx12             = _mm_sub_ps(ix1,jx2);
251             dy12             = _mm_sub_ps(iy1,jy2);
252             dz12             = _mm_sub_ps(iz1,jz2);
253             dx13             = _mm_sub_ps(ix1,jx3);
254             dy13             = _mm_sub_ps(iy1,jy3);
255             dz13             = _mm_sub_ps(iz1,jz3);
256             dx21             = _mm_sub_ps(ix2,jx1);
257             dy21             = _mm_sub_ps(iy2,jy1);
258             dz21             = _mm_sub_ps(iz2,jz1);
259             dx22             = _mm_sub_ps(ix2,jx2);
260             dy22             = _mm_sub_ps(iy2,jy2);
261             dz22             = _mm_sub_ps(iz2,jz2);
262             dx23             = _mm_sub_ps(ix2,jx3);
263             dy23             = _mm_sub_ps(iy2,jy3);
264             dz23             = _mm_sub_ps(iz2,jz3);
265             dx31             = _mm_sub_ps(ix3,jx1);
266             dy31             = _mm_sub_ps(iy3,jy1);
267             dz31             = _mm_sub_ps(iz3,jz1);
268             dx32             = _mm_sub_ps(ix3,jx2);
269             dy32             = _mm_sub_ps(iy3,jy2);
270             dz32             = _mm_sub_ps(iz3,jz2);
271             dx33             = _mm_sub_ps(ix3,jx3);
272             dy33             = _mm_sub_ps(iy3,jy3);
273             dz33             = _mm_sub_ps(iz3,jz3);
274
275             /* Calculate squared distance and things based on it */
276             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
277             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
278             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
279             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
280             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
281             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
282             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
283             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
284             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
285             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
286
287             rinv11           = gmx_mm_invsqrt_ps(rsq11);
288             rinv12           = gmx_mm_invsqrt_ps(rsq12);
289             rinv13           = gmx_mm_invsqrt_ps(rsq13);
290             rinv21           = gmx_mm_invsqrt_ps(rsq21);
291             rinv22           = gmx_mm_invsqrt_ps(rsq22);
292             rinv23           = gmx_mm_invsqrt_ps(rsq23);
293             rinv31           = gmx_mm_invsqrt_ps(rsq31);
294             rinv32           = gmx_mm_invsqrt_ps(rsq32);
295             rinv33           = gmx_mm_invsqrt_ps(rsq33);
296
297             rinvsq00         = gmx_mm_inv_ps(rsq00);
298             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
299             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
300             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
301             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
302             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
303             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
304             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
305             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
306             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
307
308             fjx0             = _mm_setzero_ps();
309             fjy0             = _mm_setzero_ps();
310             fjz0             = _mm_setzero_ps();
311             fjx1             = _mm_setzero_ps();
312             fjy1             = _mm_setzero_ps();
313             fjz1             = _mm_setzero_ps();
314             fjx2             = _mm_setzero_ps();
315             fjy2             = _mm_setzero_ps();
316             fjz2             = _mm_setzero_ps();
317             fjx3             = _mm_setzero_ps();
318             fjy3             = _mm_setzero_ps();
319             fjz3             = _mm_setzero_ps();
320
321             /**************************
322              * CALCULATE INTERACTIONS *
323              **************************/
324
325             /* LENNARD-JONES DISPERSION/REPULSION */
326
327             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
328             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
329             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
330             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
331             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
335
336             fscal            = fvdw;
337
338              /* Update vectorial force */
339             fix0             = _mm_macc_ps(dx00,fscal,fix0);
340             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
341             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
342
343             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
344             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
345             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
346
347             /**************************
348              * CALCULATE INTERACTIONS *
349              **************************/
350
351             r11              = _mm_mul_ps(rsq11,rinv11);
352
353             /* EWALD ELECTROSTATICS */
354
355             /* Analytical PME correction */
356             zeta2            = _mm_mul_ps(beta2,rsq11);
357             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
358             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
359             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
360             felec            = _mm_mul_ps(qq11,felec);
361             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
362             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
363             velec            = _mm_mul_ps(qq11,velec);
364
365             /* Update potential sum for this i atom from the interaction with this j atom. */
366             velecsum         = _mm_add_ps(velecsum,velec);
367
368             fscal            = felec;
369
370              /* Update vectorial force */
371             fix1             = _mm_macc_ps(dx11,fscal,fix1);
372             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
373             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
374
375             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
376             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
377             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
378
379             /**************************
380              * CALCULATE INTERACTIONS *
381              **************************/
382
383             r12              = _mm_mul_ps(rsq12,rinv12);
384
385             /* EWALD ELECTROSTATICS */
386
387             /* Analytical PME correction */
388             zeta2            = _mm_mul_ps(beta2,rsq12);
389             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
390             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
391             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
392             felec            = _mm_mul_ps(qq12,felec);
393             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
394             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
395             velec            = _mm_mul_ps(qq12,velec);
396
397             /* Update potential sum for this i atom from the interaction with this j atom. */
398             velecsum         = _mm_add_ps(velecsum,velec);
399
400             fscal            = felec;
401
402              /* Update vectorial force */
403             fix1             = _mm_macc_ps(dx12,fscal,fix1);
404             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
405             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
406
407             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
408             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
409             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
410
411             /**************************
412              * CALCULATE INTERACTIONS *
413              **************************/
414
415             r13              = _mm_mul_ps(rsq13,rinv13);
416
417             /* EWALD ELECTROSTATICS */
418
419             /* Analytical PME correction */
420             zeta2            = _mm_mul_ps(beta2,rsq13);
421             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
422             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
423             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
424             felec            = _mm_mul_ps(qq13,felec);
425             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
426             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
427             velec            = _mm_mul_ps(qq13,velec);
428
429             /* Update potential sum for this i atom from the interaction with this j atom. */
430             velecsum         = _mm_add_ps(velecsum,velec);
431
432             fscal            = felec;
433
434              /* Update vectorial force */
435             fix1             = _mm_macc_ps(dx13,fscal,fix1);
436             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
437             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
438
439             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
440             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
441             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
442
443             /**************************
444              * CALCULATE INTERACTIONS *
445              **************************/
446
447             r21              = _mm_mul_ps(rsq21,rinv21);
448
449             /* EWALD ELECTROSTATICS */
450
451             /* Analytical PME correction */
452             zeta2            = _mm_mul_ps(beta2,rsq21);
453             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
454             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
455             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
456             felec            = _mm_mul_ps(qq21,felec);
457             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
458             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
459             velec            = _mm_mul_ps(qq21,velec);
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velecsum         = _mm_add_ps(velecsum,velec);
463
464             fscal            = felec;
465
466              /* Update vectorial force */
467             fix2             = _mm_macc_ps(dx21,fscal,fix2);
468             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
469             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
470
471             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
472             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
473             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
474
475             /**************************
476              * CALCULATE INTERACTIONS *
477              **************************/
478
479             r22              = _mm_mul_ps(rsq22,rinv22);
480
481             /* EWALD ELECTROSTATICS */
482
483             /* Analytical PME correction */
484             zeta2            = _mm_mul_ps(beta2,rsq22);
485             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
486             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
487             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
488             felec            = _mm_mul_ps(qq22,felec);
489             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
490             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
491             velec            = _mm_mul_ps(qq22,velec);
492
493             /* Update potential sum for this i atom from the interaction with this j atom. */
494             velecsum         = _mm_add_ps(velecsum,velec);
495
496             fscal            = felec;
497
498              /* Update vectorial force */
499             fix2             = _mm_macc_ps(dx22,fscal,fix2);
500             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
501             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
502
503             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
504             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
505             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
506
507             /**************************
508              * CALCULATE INTERACTIONS *
509              **************************/
510
511             r23              = _mm_mul_ps(rsq23,rinv23);
512
513             /* EWALD ELECTROSTATICS */
514
515             /* Analytical PME correction */
516             zeta2            = _mm_mul_ps(beta2,rsq23);
517             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
518             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
519             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
520             felec            = _mm_mul_ps(qq23,felec);
521             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
522             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
523             velec            = _mm_mul_ps(qq23,velec);
524
525             /* Update potential sum for this i atom from the interaction with this j atom. */
526             velecsum         = _mm_add_ps(velecsum,velec);
527
528             fscal            = felec;
529
530              /* Update vectorial force */
531             fix2             = _mm_macc_ps(dx23,fscal,fix2);
532             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
533             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
534
535             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
536             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
537             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
538
539             /**************************
540              * CALCULATE INTERACTIONS *
541              **************************/
542
543             r31              = _mm_mul_ps(rsq31,rinv31);
544
545             /* EWALD ELECTROSTATICS */
546
547             /* Analytical PME correction */
548             zeta2            = _mm_mul_ps(beta2,rsq31);
549             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
550             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
551             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
552             felec            = _mm_mul_ps(qq31,felec);
553             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
554             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
555             velec            = _mm_mul_ps(qq31,velec);
556
557             /* Update potential sum for this i atom from the interaction with this j atom. */
558             velecsum         = _mm_add_ps(velecsum,velec);
559
560             fscal            = felec;
561
562              /* Update vectorial force */
563             fix3             = _mm_macc_ps(dx31,fscal,fix3);
564             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
565             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
566
567             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
568             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
569             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
570
571             /**************************
572              * CALCULATE INTERACTIONS *
573              **************************/
574
575             r32              = _mm_mul_ps(rsq32,rinv32);
576
577             /* EWALD ELECTROSTATICS */
578
579             /* Analytical PME correction */
580             zeta2            = _mm_mul_ps(beta2,rsq32);
581             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
582             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
583             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
584             felec            = _mm_mul_ps(qq32,felec);
585             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
586             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
587             velec            = _mm_mul_ps(qq32,velec);
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velecsum         = _mm_add_ps(velecsum,velec);
591
592             fscal            = felec;
593
594              /* Update vectorial force */
595             fix3             = _mm_macc_ps(dx32,fscal,fix3);
596             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
597             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
598
599             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
600             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
601             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
602
603             /**************************
604              * CALCULATE INTERACTIONS *
605              **************************/
606
607             r33              = _mm_mul_ps(rsq33,rinv33);
608
609             /* EWALD ELECTROSTATICS */
610
611             /* Analytical PME correction */
612             zeta2            = _mm_mul_ps(beta2,rsq33);
613             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
614             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
615             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
616             felec            = _mm_mul_ps(qq33,felec);
617             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
618             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
619             velec            = _mm_mul_ps(qq33,velec);
620
621             /* Update potential sum for this i atom from the interaction with this j atom. */
622             velecsum         = _mm_add_ps(velecsum,velec);
623
624             fscal            = felec;
625
626              /* Update vectorial force */
627             fix3             = _mm_macc_ps(dx33,fscal,fix3);
628             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
629             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
630
631             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
632             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
633             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
634
635             fjptrA             = f+j_coord_offsetA;
636             fjptrB             = f+j_coord_offsetB;
637             fjptrC             = f+j_coord_offsetC;
638             fjptrD             = f+j_coord_offsetD;
639
640             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
641                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
642                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
643
644             /* Inner loop uses 299 flops */
645         }
646
647         if(jidx<j_index_end)
648         {
649
650             /* Get j neighbor index, and coordinate index */
651             jnrlistA         = jjnr[jidx];
652             jnrlistB         = jjnr[jidx+1];
653             jnrlistC         = jjnr[jidx+2];
654             jnrlistD         = jjnr[jidx+3];
655             /* Sign of each element will be negative for non-real atoms.
656              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
657              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
658              */
659             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
660             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
661             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
662             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
663             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
664             j_coord_offsetA  = DIM*jnrA;
665             j_coord_offsetB  = DIM*jnrB;
666             j_coord_offsetC  = DIM*jnrC;
667             j_coord_offsetD  = DIM*jnrD;
668
669             /* load j atom coordinates */
670             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
671                                               x+j_coord_offsetC,x+j_coord_offsetD,
672                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
673                                               &jy2,&jz2,&jx3,&jy3,&jz3);
674
675             /* Calculate displacement vector */
676             dx00             = _mm_sub_ps(ix0,jx0);
677             dy00             = _mm_sub_ps(iy0,jy0);
678             dz00             = _mm_sub_ps(iz0,jz0);
679             dx11             = _mm_sub_ps(ix1,jx1);
680             dy11             = _mm_sub_ps(iy1,jy1);
681             dz11             = _mm_sub_ps(iz1,jz1);
682             dx12             = _mm_sub_ps(ix1,jx2);
683             dy12             = _mm_sub_ps(iy1,jy2);
684             dz12             = _mm_sub_ps(iz1,jz2);
685             dx13             = _mm_sub_ps(ix1,jx3);
686             dy13             = _mm_sub_ps(iy1,jy3);
687             dz13             = _mm_sub_ps(iz1,jz3);
688             dx21             = _mm_sub_ps(ix2,jx1);
689             dy21             = _mm_sub_ps(iy2,jy1);
690             dz21             = _mm_sub_ps(iz2,jz1);
691             dx22             = _mm_sub_ps(ix2,jx2);
692             dy22             = _mm_sub_ps(iy2,jy2);
693             dz22             = _mm_sub_ps(iz2,jz2);
694             dx23             = _mm_sub_ps(ix2,jx3);
695             dy23             = _mm_sub_ps(iy2,jy3);
696             dz23             = _mm_sub_ps(iz2,jz3);
697             dx31             = _mm_sub_ps(ix3,jx1);
698             dy31             = _mm_sub_ps(iy3,jy1);
699             dz31             = _mm_sub_ps(iz3,jz1);
700             dx32             = _mm_sub_ps(ix3,jx2);
701             dy32             = _mm_sub_ps(iy3,jy2);
702             dz32             = _mm_sub_ps(iz3,jz2);
703             dx33             = _mm_sub_ps(ix3,jx3);
704             dy33             = _mm_sub_ps(iy3,jy3);
705             dz33             = _mm_sub_ps(iz3,jz3);
706
707             /* Calculate squared distance and things based on it */
708             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
709             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
710             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
711             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
712             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
713             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
714             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
715             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
716             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
717             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
718
719             rinv11           = gmx_mm_invsqrt_ps(rsq11);
720             rinv12           = gmx_mm_invsqrt_ps(rsq12);
721             rinv13           = gmx_mm_invsqrt_ps(rsq13);
722             rinv21           = gmx_mm_invsqrt_ps(rsq21);
723             rinv22           = gmx_mm_invsqrt_ps(rsq22);
724             rinv23           = gmx_mm_invsqrt_ps(rsq23);
725             rinv31           = gmx_mm_invsqrt_ps(rsq31);
726             rinv32           = gmx_mm_invsqrt_ps(rsq32);
727             rinv33           = gmx_mm_invsqrt_ps(rsq33);
728
729             rinvsq00         = gmx_mm_inv_ps(rsq00);
730             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
731             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
732             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
733             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
734             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
735             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
736             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
737             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
738             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
739
740             fjx0             = _mm_setzero_ps();
741             fjy0             = _mm_setzero_ps();
742             fjz0             = _mm_setzero_ps();
743             fjx1             = _mm_setzero_ps();
744             fjy1             = _mm_setzero_ps();
745             fjz1             = _mm_setzero_ps();
746             fjx2             = _mm_setzero_ps();
747             fjy2             = _mm_setzero_ps();
748             fjz2             = _mm_setzero_ps();
749             fjx3             = _mm_setzero_ps();
750             fjy3             = _mm_setzero_ps();
751             fjz3             = _mm_setzero_ps();
752
753             /**************************
754              * CALCULATE INTERACTIONS *
755              **************************/
756
757             /* LENNARD-JONES DISPERSION/REPULSION */
758
759             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
760             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
761             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
762             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
763             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
764
765             /* Update potential sum for this i atom from the interaction with this j atom. */
766             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
767             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
768
769             fscal            = fvdw;
770
771             fscal            = _mm_andnot_ps(dummy_mask,fscal);
772
773              /* Update vectorial force */
774             fix0             = _mm_macc_ps(dx00,fscal,fix0);
775             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
776             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
777
778             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
779             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
780             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
781
782             /**************************
783              * CALCULATE INTERACTIONS *
784              **************************/
785
786             r11              = _mm_mul_ps(rsq11,rinv11);
787             r11              = _mm_andnot_ps(dummy_mask,r11);
788
789             /* EWALD ELECTROSTATICS */
790
791             /* Analytical PME correction */
792             zeta2            = _mm_mul_ps(beta2,rsq11);
793             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
794             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
795             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
796             felec            = _mm_mul_ps(qq11,felec);
797             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
798             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
799             velec            = _mm_mul_ps(qq11,velec);
800
801             /* Update potential sum for this i atom from the interaction with this j atom. */
802             velec            = _mm_andnot_ps(dummy_mask,velec);
803             velecsum         = _mm_add_ps(velecsum,velec);
804
805             fscal            = felec;
806
807             fscal            = _mm_andnot_ps(dummy_mask,fscal);
808
809              /* Update vectorial force */
810             fix1             = _mm_macc_ps(dx11,fscal,fix1);
811             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
812             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
813
814             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
815             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
816             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
817
818             /**************************
819              * CALCULATE INTERACTIONS *
820              **************************/
821
822             r12              = _mm_mul_ps(rsq12,rinv12);
823             r12              = _mm_andnot_ps(dummy_mask,r12);
824
825             /* EWALD ELECTROSTATICS */
826
827             /* Analytical PME correction */
828             zeta2            = _mm_mul_ps(beta2,rsq12);
829             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
830             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
831             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
832             felec            = _mm_mul_ps(qq12,felec);
833             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
834             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
835             velec            = _mm_mul_ps(qq12,velec);
836
837             /* Update potential sum for this i atom from the interaction with this j atom. */
838             velec            = _mm_andnot_ps(dummy_mask,velec);
839             velecsum         = _mm_add_ps(velecsum,velec);
840
841             fscal            = felec;
842
843             fscal            = _mm_andnot_ps(dummy_mask,fscal);
844
845              /* Update vectorial force */
846             fix1             = _mm_macc_ps(dx12,fscal,fix1);
847             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
848             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
849
850             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
851             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
852             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
853
854             /**************************
855              * CALCULATE INTERACTIONS *
856              **************************/
857
858             r13              = _mm_mul_ps(rsq13,rinv13);
859             r13              = _mm_andnot_ps(dummy_mask,r13);
860
861             /* EWALD ELECTROSTATICS */
862
863             /* Analytical PME correction */
864             zeta2            = _mm_mul_ps(beta2,rsq13);
865             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
866             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
867             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
868             felec            = _mm_mul_ps(qq13,felec);
869             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
870             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
871             velec            = _mm_mul_ps(qq13,velec);
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm_andnot_ps(dummy_mask,velec);
875             velecsum         = _mm_add_ps(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm_andnot_ps(dummy_mask,fscal);
880
881              /* Update vectorial force */
882             fix1             = _mm_macc_ps(dx13,fscal,fix1);
883             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
884             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
885
886             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
887             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
888             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
889
890             /**************************
891              * CALCULATE INTERACTIONS *
892              **************************/
893
894             r21              = _mm_mul_ps(rsq21,rinv21);
895             r21              = _mm_andnot_ps(dummy_mask,r21);
896
897             /* EWALD ELECTROSTATICS */
898
899             /* Analytical PME correction */
900             zeta2            = _mm_mul_ps(beta2,rsq21);
901             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
902             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
903             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
904             felec            = _mm_mul_ps(qq21,felec);
905             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
906             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
907             velec            = _mm_mul_ps(qq21,velec);
908
909             /* Update potential sum for this i atom from the interaction with this j atom. */
910             velec            = _mm_andnot_ps(dummy_mask,velec);
911             velecsum         = _mm_add_ps(velecsum,velec);
912
913             fscal            = felec;
914
915             fscal            = _mm_andnot_ps(dummy_mask,fscal);
916
917              /* Update vectorial force */
918             fix2             = _mm_macc_ps(dx21,fscal,fix2);
919             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
920             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
921
922             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
923             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
924             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
925
926             /**************************
927              * CALCULATE INTERACTIONS *
928              **************************/
929
930             r22              = _mm_mul_ps(rsq22,rinv22);
931             r22              = _mm_andnot_ps(dummy_mask,r22);
932
933             /* EWALD ELECTROSTATICS */
934
935             /* Analytical PME correction */
936             zeta2            = _mm_mul_ps(beta2,rsq22);
937             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
938             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
939             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
940             felec            = _mm_mul_ps(qq22,felec);
941             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
942             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
943             velec            = _mm_mul_ps(qq22,velec);
944
945             /* Update potential sum for this i atom from the interaction with this j atom. */
946             velec            = _mm_andnot_ps(dummy_mask,velec);
947             velecsum         = _mm_add_ps(velecsum,velec);
948
949             fscal            = felec;
950
951             fscal            = _mm_andnot_ps(dummy_mask,fscal);
952
953              /* Update vectorial force */
954             fix2             = _mm_macc_ps(dx22,fscal,fix2);
955             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
956             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
957
958             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
959             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
960             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
961
962             /**************************
963              * CALCULATE INTERACTIONS *
964              **************************/
965
966             r23              = _mm_mul_ps(rsq23,rinv23);
967             r23              = _mm_andnot_ps(dummy_mask,r23);
968
969             /* EWALD ELECTROSTATICS */
970
971             /* Analytical PME correction */
972             zeta2            = _mm_mul_ps(beta2,rsq23);
973             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
974             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
975             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
976             felec            = _mm_mul_ps(qq23,felec);
977             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
978             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
979             velec            = _mm_mul_ps(qq23,velec);
980
981             /* Update potential sum for this i atom from the interaction with this j atom. */
982             velec            = _mm_andnot_ps(dummy_mask,velec);
983             velecsum         = _mm_add_ps(velecsum,velec);
984
985             fscal            = felec;
986
987             fscal            = _mm_andnot_ps(dummy_mask,fscal);
988
989              /* Update vectorial force */
990             fix2             = _mm_macc_ps(dx23,fscal,fix2);
991             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
992             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
993
994             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
995             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
996             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
997
998             /**************************
999              * CALCULATE INTERACTIONS *
1000              **************************/
1001
1002             r31              = _mm_mul_ps(rsq31,rinv31);
1003             r31              = _mm_andnot_ps(dummy_mask,r31);
1004
1005             /* EWALD ELECTROSTATICS */
1006
1007             /* Analytical PME correction */
1008             zeta2            = _mm_mul_ps(beta2,rsq31);
1009             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1010             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1011             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1012             felec            = _mm_mul_ps(qq31,felec);
1013             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1014             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1015             velec            = _mm_mul_ps(qq31,velec);
1016
1017             /* Update potential sum for this i atom from the interaction with this j atom. */
1018             velec            = _mm_andnot_ps(dummy_mask,velec);
1019             velecsum         = _mm_add_ps(velecsum,velec);
1020
1021             fscal            = felec;
1022
1023             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1024
1025              /* Update vectorial force */
1026             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1027             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1028             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1029
1030             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1031             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1032             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             r32              = _mm_mul_ps(rsq32,rinv32);
1039             r32              = _mm_andnot_ps(dummy_mask,r32);
1040
1041             /* EWALD ELECTROSTATICS */
1042
1043             /* Analytical PME correction */
1044             zeta2            = _mm_mul_ps(beta2,rsq32);
1045             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1046             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1047             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1048             felec            = _mm_mul_ps(qq32,felec);
1049             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1050             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1051             velec            = _mm_mul_ps(qq32,velec);
1052
1053             /* Update potential sum for this i atom from the interaction with this j atom. */
1054             velec            = _mm_andnot_ps(dummy_mask,velec);
1055             velecsum         = _mm_add_ps(velecsum,velec);
1056
1057             fscal            = felec;
1058
1059             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1060
1061              /* Update vectorial force */
1062             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1063             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1064             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1065
1066             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1067             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1068             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1069
1070             /**************************
1071              * CALCULATE INTERACTIONS *
1072              **************************/
1073
1074             r33              = _mm_mul_ps(rsq33,rinv33);
1075             r33              = _mm_andnot_ps(dummy_mask,r33);
1076
1077             /* EWALD ELECTROSTATICS */
1078
1079             /* Analytical PME correction */
1080             zeta2            = _mm_mul_ps(beta2,rsq33);
1081             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1082             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1083             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1084             felec            = _mm_mul_ps(qq33,felec);
1085             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1086             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1087             velec            = _mm_mul_ps(qq33,velec);
1088
1089             /* Update potential sum for this i atom from the interaction with this j atom. */
1090             velec            = _mm_andnot_ps(dummy_mask,velec);
1091             velecsum         = _mm_add_ps(velecsum,velec);
1092
1093             fscal            = felec;
1094
1095             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1096
1097              /* Update vectorial force */
1098             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1099             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1100             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1101
1102             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1103             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1104             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1105
1106             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1107             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1108             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1109             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1110
1111             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1112                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1113                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1114
1115             /* Inner loop uses 308 flops */
1116         }
1117
1118         /* End of innermost loop */
1119
1120         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1121                                               f+i_coord_offset,fshift+i_shift_offset);
1122
1123         ggid                        = gid[iidx];
1124         /* Update potential energies */
1125         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1126         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1127
1128         /* Increment number of inner iterations */
1129         inneriter                  += j_index_end - j_index_start;
1130
1131         /* Outer loop uses 26 flops */
1132     }
1133
1134     /* Increment number of outer iterations */
1135     outeriter        += nri;
1136
1137     /* Update outer/inner flops */
1138
1139     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*308);
1140 }
1141 /*
1142  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single
1143  * Electrostatics interaction: Ewald
1144  * VdW interaction:            LennardJones
1145  * Geometry:                   Water4-Water4
1146  * Calculate force/pot:        Force
1147  */
1148 void
1149 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single
1150                     (t_nblist                    * gmx_restrict       nlist,
1151                      rvec                        * gmx_restrict          xx,
1152                      rvec                        * gmx_restrict          ff,
1153                      t_forcerec                  * gmx_restrict          fr,
1154                      t_mdatoms                   * gmx_restrict     mdatoms,
1155                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1156                      t_nrnb                      * gmx_restrict        nrnb)
1157 {
1158     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1159      * just 0 for non-waters.
1160      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1161      * jnr indices corresponding to data put in the four positions in the SIMD register.
1162      */
1163     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1164     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1165     int              jnrA,jnrB,jnrC,jnrD;
1166     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1167     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1168     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1169     real             rcutoff_scalar;
1170     real             *shiftvec,*fshift,*x,*f;
1171     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1172     real             scratch[4*DIM];
1173     __m128           fscal,rcutoff,rcutoff2,jidxall;
1174     int              vdwioffset0;
1175     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1176     int              vdwioffset1;
1177     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1178     int              vdwioffset2;
1179     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1180     int              vdwioffset3;
1181     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1182     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1183     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1184     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1185     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1186     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1187     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1188     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1189     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1190     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1191     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1192     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1193     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1194     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1195     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1196     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1197     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1198     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1199     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1200     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1201     real             *charge;
1202     int              nvdwtype;
1203     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1204     int              *vdwtype;
1205     real             *vdwparam;
1206     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1207     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1208     __m128i          ewitab;
1209     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1210     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1211     real             *ewtab;
1212     __m128           dummy_mask,cutoff_mask;
1213     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1214     __m128           one     = _mm_set1_ps(1.0);
1215     __m128           two     = _mm_set1_ps(2.0);
1216     x                = xx[0];
1217     f                = ff[0];
1218
1219     nri              = nlist->nri;
1220     iinr             = nlist->iinr;
1221     jindex           = nlist->jindex;
1222     jjnr             = nlist->jjnr;
1223     shiftidx         = nlist->shift;
1224     gid              = nlist->gid;
1225     shiftvec         = fr->shift_vec[0];
1226     fshift           = fr->fshift[0];
1227     facel            = _mm_set1_ps(fr->epsfac);
1228     charge           = mdatoms->chargeA;
1229     nvdwtype         = fr->ntype;
1230     vdwparam         = fr->nbfp;
1231     vdwtype          = mdatoms->typeA;
1232
1233     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1234     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1235     beta2            = _mm_mul_ps(beta,beta);
1236     beta3            = _mm_mul_ps(beta,beta2);
1237     ewtab            = fr->ic->tabq_coul_F;
1238     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1239     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1240
1241     /* Setup water-specific parameters */
1242     inr              = nlist->iinr[0];
1243     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1244     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1245     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1246     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1247
1248     jq1              = _mm_set1_ps(charge[inr+1]);
1249     jq2              = _mm_set1_ps(charge[inr+2]);
1250     jq3              = _mm_set1_ps(charge[inr+3]);
1251     vdwjidx0A        = 2*vdwtype[inr+0];
1252     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1253     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1254     qq11             = _mm_mul_ps(iq1,jq1);
1255     qq12             = _mm_mul_ps(iq1,jq2);
1256     qq13             = _mm_mul_ps(iq1,jq3);
1257     qq21             = _mm_mul_ps(iq2,jq1);
1258     qq22             = _mm_mul_ps(iq2,jq2);
1259     qq23             = _mm_mul_ps(iq2,jq3);
1260     qq31             = _mm_mul_ps(iq3,jq1);
1261     qq32             = _mm_mul_ps(iq3,jq2);
1262     qq33             = _mm_mul_ps(iq3,jq3);
1263
1264     /* Avoid stupid compiler warnings */
1265     jnrA = jnrB = jnrC = jnrD = 0;
1266     j_coord_offsetA = 0;
1267     j_coord_offsetB = 0;
1268     j_coord_offsetC = 0;
1269     j_coord_offsetD = 0;
1270
1271     outeriter        = 0;
1272     inneriter        = 0;
1273
1274     for(iidx=0;iidx<4*DIM;iidx++)
1275     {
1276         scratch[iidx] = 0.0;
1277     }
1278
1279     /* Start outer loop over neighborlists */
1280     for(iidx=0; iidx<nri; iidx++)
1281     {
1282         /* Load shift vector for this list */
1283         i_shift_offset   = DIM*shiftidx[iidx];
1284
1285         /* Load limits for loop over neighbors */
1286         j_index_start    = jindex[iidx];
1287         j_index_end      = jindex[iidx+1];
1288
1289         /* Get outer coordinate index */
1290         inr              = iinr[iidx];
1291         i_coord_offset   = DIM*inr;
1292
1293         /* Load i particle coords and add shift vector */
1294         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1295                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1296
1297         fix0             = _mm_setzero_ps();
1298         fiy0             = _mm_setzero_ps();
1299         fiz0             = _mm_setzero_ps();
1300         fix1             = _mm_setzero_ps();
1301         fiy1             = _mm_setzero_ps();
1302         fiz1             = _mm_setzero_ps();
1303         fix2             = _mm_setzero_ps();
1304         fiy2             = _mm_setzero_ps();
1305         fiz2             = _mm_setzero_ps();
1306         fix3             = _mm_setzero_ps();
1307         fiy3             = _mm_setzero_ps();
1308         fiz3             = _mm_setzero_ps();
1309
1310         /* Start inner kernel loop */
1311         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1312         {
1313
1314             /* Get j neighbor index, and coordinate index */
1315             jnrA             = jjnr[jidx];
1316             jnrB             = jjnr[jidx+1];
1317             jnrC             = jjnr[jidx+2];
1318             jnrD             = jjnr[jidx+3];
1319             j_coord_offsetA  = DIM*jnrA;
1320             j_coord_offsetB  = DIM*jnrB;
1321             j_coord_offsetC  = DIM*jnrC;
1322             j_coord_offsetD  = DIM*jnrD;
1323
1324             /* load j atom coordinates */
1325             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1326                                               x+j_coord_offsetC,x+j_coord_offsetD,
1327                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1328                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1329
1330             /* Calculate displacement vector */
1331             dx00             = _mm_sub_ps(ix0,jx0);
1332             dy00             = _mm_sub_ps(iy0,jy0);
1333             dz00             = _mm_sub_ps(iz0,jz0);
1334             dx11             = _mm_sub_ps(ix1,jx1);
1335             dy11             = _mm_sub_ps(iy1,jy1);
1336             dz11             = _mm_sub_ps(iz1,jz1);
1337             dx12             = _mm_sub_ps(ix1,jx2);
1338             dy12             = _mm_sub_ps(iy1,jy2);
1339             dz12             = _mm_sub_ps(iz1,jz2);
1340             dx13             = _mm_sub_ps(ix1,jx3);
1341             dy13             = _mm_sub_ps(iy1,jy3);
1342             dz13             = _mm_sub_ps(iz1,jz3);
1343             dx21             = _mm_sub_ps(ix2,jx1);
1344             dy21             = _mm_sub_ps(iy2,jy1);
1345             dz21             = _mm_sub_ps(iz2,jz1);
1346             dx22             = _mm_sub_ps(ix2,jx2);
1347             dy22             = _mm_sub_ps(iy2,jy2);
1348             dz22             = _mm_sub_ps(iz2,jz2);
1349             dx23             = _mm_sub_ps(ix2,jx3);
1350             dy23             = _mm_sub_ps(iy2,jy3);
1351             dz23             = _mm_sub_ps(iz2,jz3);
1352             dx31             = _mm_sub_ps(ix3,jx1);
1353             dy31             = _mm_sub_ps(iy3,jy1);
1354             dz31             = _mm_sub_ps(iz3,jz1);
1355             dx32             = _mm_sub_ps(ix3,jx2);
1356             dy32             = _mm_sub_ps(iy3,jy2);
1357             dz32             = _mm_sub_ps(iz3,jz2);
1358             dx33             = _mm_sub_ps(ix3,jx3);
1359             dy33             = _mm_sub_ps(iy3,jy3);
1360             dz33             = _mm_sub_ps(iz3,jz3);
1361
1362             /* Calculate squared distance and things based on it */
1363             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1364             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1365             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1366             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1367             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1368             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1369             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1370             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1371             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1372             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1373
1374             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1375             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1376             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1377             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1378             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1379             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1380             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1381             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1382             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1383
1384             rinvsq00         = gmx_mm_inv_ps(rsq00);
1385             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1386             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1387             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1388             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1389             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1390             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1391             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1392             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1393             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1394
1395             fjx0             = _mm_setzero_ps();
1396             fjy0             = _mm_setzero_ps();
1397             fjz0             = _mm_setzero_ps();
1398             fjx1             = _mm_setzero_ps();
1399             fjy1             = _mm_setzero_ps();
1400             fjz1             = _mm_setzero_ps();
1401             fjx2             = _mm_setzero_ps();
1402             fjy2             = _mm_setzero_ps();
1403             fjz2             = _mm_setzero_ps();
1404             fjx3             = _mm_setzero_ps();
1405             fjy3             = _mm_setzero_ps();
1406             fjz3             = _mm_setzero_ps();
1407
1408             /**************************
1409              * CALCULATE INTERACTIONS *
1410              **************************/
1411
1412             /* LENNARD-JONES DISPERSION/REPULSION */
1413
1414             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1415             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1416
1417             fscal            = fvdw;
1418
1419              /* Update vectorial force */
1420             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1421             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1422             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1423
1424             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1425             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1426             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1427
1428             /**************************
1429              * CALCULATE INTERACTIONS *
1430              **************************/
1431
1432             r11              = _mm_mul_ps(rsq11,rinv11);
1433
1434             /* EWALD ELECTROSTATICS */
1435
1436             /* Analytical PME correction */
1437             zeta2            = _mm_mul_ps(beta2,rsq11);
1438             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1439             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1440             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1441             felec            = _mm_mul_ps(qq11,felec);
1442
1443             fscal            = felec;
1444
1445              /* Update vectorial force */
1446             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1447             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1448             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1449
1450             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1451             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1452             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1453
1454             /**************************
1455              * CALCULATE INTERACTIONS *
1456              **************************/
1457
1458             r12              = _mm_mul_ps(rsq12,rinv12);
1459
1460             /* EWALD ELECTROSTATICS */
1461
1462             /* Analytical PME correction */
1463             zeta2            = _mm_mul_ps(beta2,rsq12);
1464             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1465             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1466             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1467             felec            = _mm_mul_ps(qq12,felec);
1468
1469             fscal            = felec;
1470
1471              /* Update vectorial force */
1472             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1473             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1474             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1475
1476             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1477             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1478             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1479
1480             /**************************
1481              * CALCULATE INTERACTIONS *
1482              **************************/
1483
1484             r13              = _mm_mul_ps(rsq13,rinv13);
1485
1486             /* EWALD ELECTROSTATICS */
1487
1488             /* Analytical PME correction */
1489             zeta2            = _mm_mul_ps(beta2,rsq13);
1490             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1491             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1492             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1493             felec            = _mm_mul_ps(qq13,felec);
1494
1495             fscal            = felec;
1496
1497              /* Update vectorial force */
1498             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1499             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1500             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1501
1502             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1503             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1504             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1505
1506             /**************************
1507              * CALCULATE INTERACTIONS *
1508              **************************/
1509
1510             r21              = _mm_mul_ps(rsq21,rinv21);
1511
1512             /* EWALD ELECTROSTATICS */
1513
1514             /* Analytical PME correction */
1515             zeta2            = _mm_mul_ps(beta2,rsq21);
1516             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1517             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1518             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1519             felec            = _mm_mul_ps(qq21,felec);
1520
1521             fscal            = felec;
1522
1523              /* Update vectorial force */
1524             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1525             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1526             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1527
1528             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1529             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1530             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1531
1532             /**************************
1533              * CALCULATE INTERACTIONS *
1534              **************************/
1535
1536             r22              = _mm_mul_ps(rsq22,rinv22);
1537
1538             /* EWALD ELECTROSTATICS */
1539
1540             /* Analytical PME correction */
1541             zeta2            = _mm_mul_ps(beta2,rsq22);
1542             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1543             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1544             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1545             felec            = _mm_mul_ps(qq22,felec);
1546
1547             fscal            = felec;
1548
1549              /* Update vectorial force */
1550             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1551             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1552             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1553
1554             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1555             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1556             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1557
1558             /**************************
1559              * CALCULATE INTERACTIONS *
1560              **************************/
1561
1562             r23              = _mm_mul_ps(rsq23,rinv23);
1563
1564             /* EWALD ELECTROSTATICS */
1565
1566             /* Analytical PME correction */
1567             zeta2            = _mm_mul_ps(beta2,rsq23);
1568             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1569             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1570             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1571             felec            = _mm_mul_ps(qq23,felec);
1572
1573             fscal            = felec;
1574
1575              /* Update vectorial force */
1576             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1577             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1578             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1579
1580             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1581             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1582             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1583
1584             /**************************
1585              * CALCULATE INTERACTIONS *
1586              **************************/
1587
1588             r31              = _mm_mul_ps(rsq31,rinv31);
1589
1590             /* EWALD ELECTROSTATICS */
1591
1592             /* Analytical PME correction */
1593             zeta2            = _mm_mul_ps(beta2,rsq31);
1594             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1595             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1596             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1597             felec            = _mm_mul_ps(qq31,felec);
1598
1599             fscal            = felec;
1600
1601              /* Update vectorial force */
1602             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1603             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1604             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1605
1606             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1607             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1608             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1609
1610             /**************************
1611              * CALCULATE INTERACTIONS *
1612              **************************/
1613
1614             r32              = _mm_mul_ps(rsq32,rinv32);
1615
1616             /* EWALD ELECTROSTATICS */
1617
1618             /* Analytical PME correction */
1619             zeta2            = _mm_mul_ps(beta2,rsq32);
1620             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1621             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1622             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1623             felec            = _mm_mul_ps(qq32,felec);
1624
1625             fscal            = felec;
1626
1627              /* Update vectorial force */
1628             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1629             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1630             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1631
1632             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1633             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1634             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1635
1636             /**************************
1637              * CALCULATE INTERACTIONS *
1638              **************************/
1639
1640             r33              = _mm_mul_ps(rsq33,rinv33);
1641
1642             /* EWALD ELECTROSTATICS */
1643
1644             /* Analytical PME correction */
1645             zeta2            = _mm_mul_ps(beta2,rsq33);
1646             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1647             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1648             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1649             felec            = _mm_mul_ps(qq33,felec);
1650
1651             fscal            = felec;
1652
1653              /* Update vectorial force */
1654             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1655             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1656             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1657
1658             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1659             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1660             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1661
1662             fjptrA             = f+j_coord_offsetA;
1663             fjptrB             = f+j_coord_offsetB;
1664             fjptrC             = f+j_coord_offsetC;
1665             fjptrD             = f+j_coord_offsetD;
1666
1667             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1668                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1669                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1670
1671             /* Inner loop uses 285 flops */
1672         }
1673
1674         if(jidx<j_index_end)
1675         {
1676
1677             /* Get j neighbor index, and coordinate index */
1678             jnrlistA         = jjnr[jidx];
1679             jnrlistB         = jjnr[jidx+1];
1680             jnrlistC         = jjnr[jidx+2];
1681             jnrlistD         = jjnr[jidx+3];
1682             /* Sign of each element will be negative for non-real atoms.
1683              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1684              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1685              */
1686             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1687             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1688             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1689             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1690             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1691             j_coord_offsetA  = DIM*jnrA;
1692             j_coord_offsetB  = DIM*jnrB;
1693             j_coord_offsetC  = DIM*jnrC;
1694             j_coord_offsetD  = DIM*jnrD;
1695
1696             /* load j atom coordinates */
1697             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1698                                               x+j_coord_offsetC,x+j_coord_offsetD,
1699                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1700                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1701
1702             /* Calculate displacement vector */
1703             dx00             = _mm_sub_ps(ix0,jx0);
1704             dy00             = _mm_sub_ps(iy0,jy0);
1705             dz00             = _mm_sub_ps(iz0,jz0);
1706             dx11             = _mm_sub_ps(ix1,jx1);
1707             dy11             = _mm_sub_ps(iy1,jy1);
1708             dz11             = _mm_sub_ps(iz1,jz1);
1709             dx12             = _mm_sub_ps(ix1,jx2);
1710             dy12             = _mm_sub_ps(iy1,jy2);
1711             dz12             = _mm_sub_ps(iz1,jz2);
1712             dx13             = _mm_sub_ps(ix1,jx3);
1713             dy13             = _mm_sub_ps(iy1,jy3);
1714             dz13             = _mm_sub_ps(iz1,jz3);
1715             dx21             = _mm_sub_ps(ix2,jx1);
1716             dy21             = _mm_sub_ps(iy2,jy1);
1717             dz21             = _mm_sub_ps(iz2,jz1);
1718             dx22             = _mm_sub_ps(ix2,jx2);
1719             dy22             = _mm_sub_ps(iy2,jy2);
1720             dz22             = _mm_sub_ps(iz2,jz2);
1721             dx23             = _mm_sub_ps(ix2,jx3);
1722             dy23             = _mm_sub_ps(iy2,jy3);
1723             dz23             = _mm_sub_ps(iz2,jz3);
1724             dx31             = _mm_sub_ps(ix3,jx1);
1725             dy31             = _mm_sub_ps(iy3,jy1);
1726             dz31             = _mm_sub_ps(iz3,jz1);
1727             dx32             = _mm_sub_ps(ix3,jx2);
1728             dy32             = _mm_sub_ps(iy3,jy2);
1729             dz32             = _mm_sub_ps(iz3,jz2);
1730             dx33             = _mm_sub_ps(ix3,jx3);
1731             dy33             = _mm_sub_ps(iy3,jy3);
1732             dz33             = _mm_sub_ps(iz3,jz3);
1733
1734             /* Calculate squared distance and things based on it */
1735             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1736             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1737             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1738             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1739             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1740             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1741             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1742             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1743             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1744             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1745
1746             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1747             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1748             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1749             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1750             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1751             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1752             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1753             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1754             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1755
1756             rinvsq00         = gmx_mm_inv_ps(rsq00);
1757             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1758             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1759             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1760             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1761             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1762             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1763             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1764             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1765             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1766
1767             fjx0             = _mm_setzero_ps();
1768             fjy0             = _mm_setzero_ps();
1769             fjz0             = _mm_setzero_ps();
1770             fjx1             = _mm_setzero_ps();
1771             fjy1             = _mm_setzero_ps();
1772             fjz1             = _mm_setzero_ps();
1773             fjx2             = _mm_setzero_ps();
1774             fjy2             = _mm_setzero_ps();
1775             fjz2             = _mm_setzero_ps();
1776             fjx3             = _mm_setzero_ps();
1777             fjy3             = _mm_setzero_ps();
1778             fjz3             = _mm_setzero_ps();
1779
1780             /**************************
1781              * CALCULATE INTERACTIONS *
1782              **************************/
1783
1784             /* LENNARD-JONES DISPERSION/REPULSION */
1785
1786             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1787             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1788
1789             fscal            = fvdw;
1790
1791             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1792
1793              /* Update vectorial force */
1794             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1795             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1796             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1797
1798             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1799             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1800             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1801
1802             /**************************
1803              * CALCULATE INTERACTIONS *
1804              **************************/
1805
1806             r11              = _mm_mul_ps(rsq11,rinv11);
1807             r11              = _mm_andnot_ps(dummy_mask,r11);
1808
1809             /* EWALD ELECTROSTATICS */
1810
1811             /* Analytical PME correction */
1812             zeta2            = _mm_mul_ps(beta2,rsq11);
1813             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1814             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1815             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1816             felec            = _mm_mul_ps(qq11,felec);
1817
1818             fscal            = felec;
1819
1820             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1821
1822              /* Update vectorial force */
1823             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1824             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1825             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1826
1827             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1828             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1829             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1830
1831             /**************************
1832              * CALCULATE INTERACTIONS *
1833              **************************/
1834
1835             r12              = _mm_mul_ps(rsq12,rinv12);
1836             r12              = _mm_andnot_ps(dummy_mask,r12);
1837
1838             /* EWALD ELECTROSTATICS */
1839
1840             /* Analytical PME correction */
1841             zeta2            = _mm_mul_ps(beta2,rsq12);
1842             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1843             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1844             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1845             felec            = _mm_mul_ps(qq12,felec);
1846
1847             fscal            = felec;
1848
1849             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1850
1851              /* Update vectorial force */
1852             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1853             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1854             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1855
1856             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1857             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1858             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1859
1860             /**************************
1861              * CALCULATE INTERACTIONS *
1862              **************************/
1863
1864             r13              = _mm_mul_ps(rsq13,rinv13);
1865             r13              = _mm_andnot_ps(dummy_mask,r13);
1866
1867             /* EWALD ELECTROSTATICS */
1868
1869             /* Analytical PME correction */
1870             zeta2            = _mm_mul_ps(beta2,rsq13);
1871             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1872             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1873             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1874             felec            = _mm_mul_ps(qq13,felec);
1875
1876             fscal            = felec;
1877
1878             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1879
1880              /* Update vectorial force */
1881             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1882             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1883             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1884
1885             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1886             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1887             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1888
1889             /**************************
1890              * CALCULATE INTERACTIONS *
1891              **************************/
1892
1893             r21              = _mm_mul_ps(rsq21,rinv21);
1894             r21              = _mm_andnot_ps(dummy_mask,r21);
1895
1896             /* EWALD ELECTROSTATICS */
1897
1898             /* Analytical PME correction */
1899             zeta2            = _mm_mul_ps(beta2,rsq21);
1900             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1901             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1902             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1903             felec            = _mm_mul_ps(qq21,felec);
1904
1905             fscal            = felec;
1906
1907             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1908
1909              /* Update vectorial force */
1910             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1911             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1912             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1913
1914             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1915             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1916             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1917
1918             /**************************
1919              * CALCULATE INTERACTIONS *
1920              **************************/
1921
1922             r22              = _mm_mul_ps(rsq22,rinv22);
1923             r22              = _mm_andnot_ps(dummy_mask,r22);
1924
1925             /* EWALD ELECTROSTATICS */
1926
1927             /* Analytical PME correction */
1928             zeta2            = _mm_mul_ps(beta2,rsq22);
1929             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1930             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1931             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1932             felec            = _mm_mul_ps(qq22,felec);
1933
1934             fscal            = felec;
1935
1936             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1937
1938              /* Update vectorial force */
1939             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1940             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1941             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1942
1943             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1944             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1945             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1946
1947             /**************************
1948              * CALCULATE INTERACTIONS *
1949              **************************/
1950
1951             r23              = _mm_mul_ps(rsq23,rinv23);
1952             r23              = _mm_andnot_ps(dummy_mask,r23);
1953
1954             /* EWALD ELECTROSTATICS */
1955
1956             /* Analytical PME correction */
1957             zeta2            = _mm_mul_ps(beta2,rsq23);
1958             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1959             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1960             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1961             felec            = _mm_mul_ps(qq23,felec);
1962
1963             fscal            = felec;
1964
1965             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1966
1967              /* Update vectorial force */
1968             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1969             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1970             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1971
1972             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1973             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1974             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1975
1976             /**************************
1977              * CALCULATE INTERACTIONS *
1978              **************************/
1979
1980             r31              = _mm_mul_ps(rsq31,rinv31);
1981             r31              = _mm_andnot_ps(dummy_mask,r31);
1982
1983             /* EWALD ELECTROSTATICS */
1984
1985             /* Analytical PME correction */
1986             zeta2            = _mm_mul_ps(beta2,rsq31);
1987             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1988             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1989             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1990             felec            = _mm_mul_ps(qq31,felec);
1991
1992             fscal            = felec;
1993
1994             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1995
1996              /* Update vectorial force */
1997             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1998             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1999             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2000
2001             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2002             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2003             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2004
2005             /**************************
2006              * CALCULATE INTERACTIONS *
2007              **************************/
2008
2009             r32              = _mm_mul_ps(rsq32,rinv32);
2010             r32              = _mm_andnot_ps(dummy_mask,r32);
2011
2012             /* EWALD ELECTROSTATICS */
2013
2014             /* Analytical PME correction */
2015             zeta2            = _mm_mul_ps(beta2,rsq32);
2016             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
2017             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2018             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2019             felec            = _mm_mul_ps(qq32,felec);
2020
2021             fscal            = felec;
2022
2023             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2024
2025              /* Update vectorial force */
2026             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2027             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2028             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2029
2030             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2031             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2032             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             r33              = _mm_mul_ps(rsq33,rinv33);
2039             r33              = _mm_andnot_ps(dummy_mask,r33);
2040
2041             /* EWALD ELECTROSTATICS */
2042
2043             /* Analytical PME correction */
2044             zeta2            = _mm_mul_ps(beta2,rsq33);
2045             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
2046             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2047             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2048             felec            = _mm_mul_ps(qq33,felec);
2049
2050             fscal            = felec;
2051
2052             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2053
2054              /* Update vectorial force */
2055             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2056             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2057             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2058
2059             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2060             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2061             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2062
2063             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2064             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2065             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2066             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2067
2068             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2069                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2070                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2071
2072             /* Inner loop uses 294 flops */
2073         }
2074
2075         /* End of innermost loop */
2076
2077         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2078                                               f+i_coord_offset,fshift+i_shift_offset);
2079
2080         /* Increment number of inner iterations */
2081         inneriter                  += j_index_end - j_index_start;
2082
2083         /* Outer loop uses 24 flops */
2084     }
2085
2086     /* Increment number of outer iterations */
2087     outeriter        += nri;
2088
2089     /* Update outer/inner flops */
2090
2091     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*294);
2092 }