94f9653835d0b88bbe7b10e49df70e4744d4451a
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSh_VdwNone_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gmx_math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            None
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
96     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128i          ewitab;
109     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
110     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
111     real             *ewtab;
112     __m128           dummy_mask,cutoff_mask;
113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114     __m128           one     = _mm_set1_ps(1.0);
115     __m128           two     = _mm_set1_ps(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_ps(fr->epsfac);
128     charge           = mdatoms->chargeA;
129
130     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
131     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
132     beta2            = _mm_mul_ps(beta,beta);
133     beta3            = _mm_mul_ps(beta,beta2);
134     ewtab            = fr->ic->tabq_coul_FDV0;
135     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
136     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
137
138     /* Setup water-specific parameters */
139     inr              = nlist->iinr[0];
140     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
141     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
142     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
143
144     jq1              = _mm_set1_ps(charge[inr+1]);
145     jq2              = _mm_set1_ps(charge[inr+2]);
146     jq3              = _mm_set1_ps(charge[inr+3]);
147     qq11             = _mm_mul_ps(iq1,jq1);
148     qq12             = _mm_mul_ps(iq1,jq2);
149     qq13             = _mm_mul_ps(iq1,jq3);
150     qq21             = _mm_mul_ps(iq2,jq1);
151     qq22             = _mm_mul_ps(iq2,jq2);
152     qq23             = _mm_mul_ps(iq2,jq3);
153     qq31             = _mm_mul_ps(iq3,jq1);
154     qq32             = _mm_mul_ps(iq3,jq2);
155     qq33             = _mm_mul_ps(iq3,jq3);
156
157     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
158     rcutoff_scalar   = fr->rcoulomb;
159     rcutoff          = _mm_set1_ps(rcutoff_scalar);
160     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = jnrC = jnrD = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166     j_coord_offsetC = 0;
167     j_coord_offsetD = 0;
168
169     outeriter        = 0;
170     inneriter        = 0;
171
172     for(iidx=0;iidx<4*DIM;iidx++)
173     {
174         scratch[iidx] = 0.0;
175     }
176
177     /* Start outer loop over neighborlists */
178     for(iidx=0; iidx<nri; iidx++)
179     {
180         /* Load shift vector for this list */
181         i_shift_offset   = DIM*shiftidx[iidx];
182
183         /* Load limits for loop over neighbors */
184         j_index_start    = jindex[iidx];
185         j_index_end      = jindex[iidx+1];
186
187         /* Get outer coordinate index */
188         inr              = iinr[iidx];
189         i_coord_offset   = DIM*inr;
190
191         /* Load i particle coords and add shift vector */
192         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
193                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
194
195         fix1             = _mm_setzero_ps();
196         fiy1             = _mm_setzero_ps();
197         fiz1             = _mm_setzero_ps();
198         fix2             = _mm_setzero_ps();
199         fiy2             = _mm_setzero_ps();
200         fiz2             = _mm_setzero_ps();
201         fix3             = _mm_setzero_ps();
202         fiy3             = _mm_setzero_ps();
203         fiz3             = _mm_setzero_ps();
204
205         /* Reset potential sums */
206         velecsum         = _mm_setzero_ps();
207
208         /* Start inner kernel loop */
209         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
210         {
211
212             /* Get j neighbor index, and coordinate index */
213             jnrA             = jjnr[jidx];
214             jnrB             = jjnr[jidx+1];
215             jnrC             = jjnr[jidx+2];
216             jnrD             = jjnr[jidx+3];
217             j_coord_offsetA  = DIM*jnrA;
218             j_coord_offsetB  = DIM*jnrB;
219             j_coord_offsetC  = DIM*jnrC;
220             j_coord_offsetD  = DIM*jnrD;
221
222             /* load j atom coordinates */
223             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
224                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
225                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
226
227             /* Calculate displacement vector */
228             dx11             = _mm_sub_ps(ix1,jx1);
229             dy11             = _mm_sub_ps(iy1,jy1);
230             dz11             = _mm_sub_ps(iz1,jz1);
231             dx12             = _mm_sub_ps(ix1,jx2);
232             dy12             = _mm_sub_ps(iy1,jy2);
233             dz12             = _mm_sub_ps(iz1,jz2);
234             dx13             = _mm_sub_ps(ix1,jx3);
235             dy13             = _mm_sub_ps(iy1,jy3);
236             dz13             = _mm_sub_ps(iz1,jz3);
237             dx21             = _mm_sub_ps(ix2,jx1);
238             dy21             = _mm_sub_ps(iy2,jy1);
239             dz21             = _mm_sub_ps(iz2,jz1);
240             dx22             = _mm_sub_ps(ix2,jx2);
241             dy22             = _mm_sub_ps(iy2,jy2);
242             dz22             = _mm_sub_ps(iz2,jz2);
243             dx23             = _mm_sub_ps(ix2,jx3);
244             dy23             = _mm_sub_ps(iy2,jy3);
245             dz23             = _mm_sub_ps(iz2,jz3);
246             dx31             = _mm_sub_ps(ix3,jx1);
247             dy31             = _mm_sub_ps(iy3,jy1);
248             dz31             = _mm_sub_ps(iz3,jz1);
249             dx32             = _mm_sub_ps(ix3,jx2);
250             dy32             = _mm_sub_ps(iy3,jy2);
251             dz32             = _mm_sub_ps(iz3,jz2);
252             dx33             = _mm_sub_ps(ix3,jx3);
253             dy33             = _mm_sub_ps(iy3,jy3);
254             dz33             = _mm_sub_ps(iz3,jz3);
255
256             /* Calculate squared distance and things based on it */
257             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
258             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
259             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
260             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
261             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
262             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
263             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
264             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
265             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
266
267             rinv11           = gmx_mm_invsqrt_ps(rsq11);
268             rinv12           = gmx_mm_invsqrt_ps(rsq12);
269             rinv13           = gmx_mm_invsqrt_ps(rsq13);
270             rinv21           = gmx_mm_invsqrt_ps(rsq21);
271             rinv22           = gmx_mm_invsqrt_ps(rsq22);
272             rinv23           = gmx_mm_invsqrt_ps(rsq23);
273             rinv31           = gmx_mm_invsqrt_ps(rsq31);
274             rinv32           = gmx_mm_invsqrt_ps(rsq32);
275             rinv33           = gmx_mm_invsqrt_ps(rsq33);
276
277             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
278             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
279             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
280             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
281             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
282             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
283             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
284             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
285             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
286
287             fjx1             = _mm_setzero_ps();
288             fjy1             = _mm_setzero_ps();
289             fjz1             = _mm_setzero_ps();
290             fjx2             = _mm_setzero_ps();
291             fjy2             = _mm_setzero_ps();
292             fjz2             = _mm_setzero_ps();
293             fjx3             = _mm_setzero_ps();
294             fjy3             = _mm_setzero_ps();
295             fjz3             = _mm_setzero_ps();
296
297             /**************************
298              * CALCULATE INTERACTIONS *
299              **************************/
300
301             if (gmx_mm_any_lt(rsq11,rcutoff2))
302             {
303
304             r11              = _mm_mul_ps(rsq11,rinv11);
305
306             /* EWALD ELECTROSTATICS */
307
308             /* Analytical PME correction */
309             zeta2            = _mm_mul_ps(beta2,rsq11);
310             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
311             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
312             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
313             felec            = _mm_mul_ps(qq11,felec);
314             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
315             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
316             velec            = _mm_mul_ps(qq11,velec);
317
318             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
319
320             /* Update potential sum for this i atom from the interaction with this j atom. */
321             velec            = _mm_and_ps(velec,cutoff_mask);
322             velecsum         = _mm_add_ps(velecsum,velec);
323
324             fscal            = felec;
325
326             fscal            = _mm_and_ps(fscal,cutoff_mask);
327
328              /* Update vectorial force */
329             fix1             = _mm_macc_ps(dx11,fscal,fix1);
330             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
331             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
332
333             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
334             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
335             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
336
337             }
338
339             /**************************
340              * CALCULATE INTERACTIONS *
341              **************************/
342
343             if (gmx_mm_any_lt(rsq12,rcutoff2))
344             {
345
346             r12              = _mm_mul_ps(rsq12,rinv12);
347
348             /* EWALD ELECTROSTATICS */
349
350             /* Analytical PME correction */
351             zeta2            = _mm_mul_ps(beta2,rsq12);
352             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
353             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
354             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
355             felec            = _mm_mul_ps(qq12,felec);
356             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
357             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
358             velec            = _mm_mul_ps(qq12,velec);
359
360             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
361
362             /* Update potential sum for this i atom from the interaction with this j atom. */
363             velec            = _mm_and_ps(velec,cutoff_mask);
364             velecsum         = _mm_add_ps(velecsum,velec);
365
366             fscal            = felec;
367
368             fscal            = _mm_and_ps(fscal,cutoff_mask);
369
370              /* Update vectorial force */
371             fix1             = _mm_macc_ps(dx12,fscal,fix1);
372             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
373             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
374
375             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
376             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
377             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
378
379             }
380
381             /**************************
382              * CALCULATE INTERACTIONS *
383              **************************/
384
385             if (gmx_mm_any_lt(rsq13,rcutoff2))
386             {
387
388             r13              = _mm_mul_ps(rsq13,rinv13);
389
390             /* EWALD ELECTROSTATICS */
391
392             /* Analytical PME correction */
393             zeta2            = _mm_mul_ps(beta2,rsq13);
394             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
395             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
396             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
397             felec            = _mm_mul_ps(qq13,felec);
398             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
399             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv13,sh_ewald));
400             velec            = _mm_mul_ps(qq13,velec);
401
402             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velec            = _mm_and_ps(velec,cutoff_mask);
406             velecsum         = _mm_add_ps(velecsum,velec);
407
408             fscal            = felec;
409
410             fscal            = _mm_and_ps(fscal,cutoff_mask);
411
412              /* Update vectorial force */
413             fix1             = _mm_macc_ps(dx13,fscal,fix1);
414             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
415             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
416
417             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
418             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
419             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
420
421             }
422
423             /**************************
424              * CALCULATE INTERACTIONS *
425              **************************/
426
427             if (gmx_mm_any_lt(rsq21,rcutoff2))
428             {
429
430             r21              = _mm_mul_ps(rsq21,rinv21);
431
432             /* EWALD ELECTROSTATICS */
433
434             /* Analytical PME correction */
435             zeta2            = _mm_mul_ps(beta2,rsq21);
436             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
437             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
438             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
439             felec            = _mm_mul_ps(qq21,felec);
440             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
441             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
442             velec            = _mm_mul_ps(qq21,velec);
443
444             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
445
446             /* Update potential sum for this i atom from the interaction with this j atom. */
447             velec            = _mm_and_ps(velec,cutoff_mask);
448             velecsum         = _mm_add_ps(velecsum,velec);
449
450             fscal            = felec;
451
452             fscal            = _mm_and_ps(fscal,cutoff_mask);
453
454              /* Update vectorial force */
455             fix2             = _mm_macc_ps(dx21,fscal,fix2);
456             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
457             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
458
459             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
460             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
461             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
462
463             }
464
465             /**************************
466              * CALCULATE INTERACTIONS *
467              **************************/
468
469             if (gmx_mm_any_lt(rsq22,rcutoff2))
470             {
471
472             r22              = _mm_mul_ps(rsq22,rinv22);
473
474             /* EWALD ELECTROSTATICS */
475
476             /* Analytical PME correction */
477             zeta2            = _mm_mul_ps(beta2,rsq22);
478             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
479             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
480             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
481             felec            = _mm_mul_ps(qq22,felec);
482             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
483             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
484             velec            = _mm_mul_ps(qq22,velec);
485
486             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
487
488             /* Update potential sum for this i atom from the interaction with this j atom. */
489             velec            = _mm_and_ps(velec,cutoff_mask);
490             velecsum         = _mm_add_ps(velecsum,velec);
491
492             fscal            = felec;
493
494             fscal            = _mm_and_ps(fscal,cutoff_mask);
495
496              /* Update vectorial force */
497             fix2             = _mm_macc_ps(dx22,fscal,fix2);
498             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
499             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
500
501             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
502             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
503             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
504
505             }
506
507             /**************************
508              * CALCULATE INTERACTIONS *
509              **************************/
510
511             if (gmx_mm_any_lt(rsq23,rcutoff2))
512             {
513
514             r23              = _mm_mul_ps(rsq23,rinv23);
515
516             /* EWALD ELECTROSTATICS */
517
518             /* Analytical PME correction */
519             zeta2            = _mm_mul_ps(beta2,rsq23);
520             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
521             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
522             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
523             felec            = _mm_mul_ps(qq23,felec);
524             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
525             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv23,sh_ewald));
526             velec            = _mm_mul_ps(qq23,velec);
527
528             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
529
530             /* Update potential sum for this i atom from the interaction with this j atom. */
531             velec            = _mm_and_ps(velec,cutoff_mask);
532             velecsum         = _mm_add_ps(velecsum,velec);
533
534             fscal            = felec;
535
536             fscal            = _mm_and_ps(fscal,cutoff_mask);
537
538              /* Update vectorial force */
539             fix2             = _mm_macc_ps(dx23,fscal,fix2);
540             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
541             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
542
543             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
544             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
545             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
546
547             }
548
549             /**************************
550              * CALCULATE INTERACTIONS *
551              **************************/
552
553             if (gmx_mm_any_lt(rsq31,rcutoff2))
554             {
555
556             r31              = _mm_mul_ps(rsq31,rinv31);
557
558             /* EWALD ELECTROSTATICS */
559
560             /* Analytical PME correction */
561             zeta2            = _mm_mul_ps(beta2,rsq31);
562             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
563             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
564             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
565             felec            = _mm_mul_ps(qq31,felec);
566             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
567             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv31,sh_ewald));
568             velec            = _mm_mul_ps(qq31,velec);
569
570             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
571
572             /* Update potential sum for this i atom from the interaction with this j atom. */
573             velec            = _mm_and_ps(velec,cutoff_mask);
574             velecsum         = _mm_add_ps(velecsum,velec);
575
576             fscal            = felec;
577
578             fscal            = _mm_and_ps(fscal,cutoff_mask);
579
580              /* Update vectorial force */
581             fix3             = _mm_macc_ps(dx31,fscal,fix3);
582             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
583             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
584
585             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
586             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
587             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
588
589             }
590
591             /**************************
592              * CALCULATE INTERACTIONS *
593              **************************/
594
595             if (gmx_mm_any_lt(rsq32,rcutoff2))
596             {
597
598             r32              = _mm_mul_ps(rsq32,rinv32);
599
600             /* EWALD ELECTROSTATICS */
601
602             /* Analytical PME correction */
603             zeta2            = _mm_mul_ps(beta2,rsq32);
604             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
605             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
606             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
607             felec            = _mm_mul_ps(qq32,felec);
608             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
609             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv32,sh_ewald));
610             velec            = _mm_mul_ps(qq32,velec);
611
612             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
613
614             /* Update potential sum for this i atom from the interaction with this j atom. */
615             velec            = _mm_and_ps(velec,cutoff_mask);
616             velecsum         = _mm_add_ps(velecsum,velec);
617
618             fscal            = felec;
619
620             fscal            = _mm_and_ps(fscal,cutoff_mask);
621
622              /* Update vectorial force */
623             fix3             = _mm_macc_ps(dx32,fscal,fix3);
624             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
625             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
626
627             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
628             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
629             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
630
631             }
632
633             /**************************
634              * CALCULATE INTERACTIONS *
635              **************************/
636
637             if (gmx_mm_any_lt(rsq33,rcutoff2))
638             {
639
640             r33              = _mm_mul_ps(rsq33,rinv33);
641
642             /* EWALD ELECTROSTATICS */
643
644             /* Analytical PME correction */
645             zeta2            = _mm_mul_ps(beta2,rsq33);
646             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
647             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
648             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
649             felec            = _mm_mul_ps(qq33,felec);
650             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
651             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv33,sh_ewald));
652             velec            = _mm_mul_ps(qq33,velec);
653
654             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
655
656             /* Update potential sum for this i atom from the interaction with this j atom. */
657             velec            = _mm_and_ps(velec,cutoff_mask);
658             velecsum         = _mm_add_ps(velecsum,velec);
659
660             fscal            = felec;
661
662             fscal            = _mm_and_ps(fscal,cutoff_mask);
663
664              /* Update vectorial force */
665             fix3             = _mm_macc_ps(dx33,fscal,fix3);
666             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
667             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
668
669             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
670             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
671             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
672
673             }
674
675             fjptrA             = f+j_coord_offsetA;
676             fjptrB             = f+j_coord_offsetB;
677             fjptrC             = f+j_coord_offsetC;
678             fjptrD             = f+j_coord_offsetD;
679
680             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
681                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
682
683             /* Inner loop uses 297 flops */
684         }
685
686         if(jidx<j_index_end)
687         {
688
689             /* Get j neighbor index, and coordinate index */
690             jnrlistA         = jjnr[jidx];
691             jnrlistB         = jjnr[jidx+1];
692             jnrlistC         = jjnr[jidx+2];
693             jnrlistD         = jjnr[jidx+3];
694             /* Sign of each element will be negative for non-real atoms.
695              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
696              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
697              */
698             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
699             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
700             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
701             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
702             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
703             j_coord_offsetA  = DIM*jnrA;
704             j_coord_offsetB  = DIM*jnrB;
705             j_coord_offsetC  = DIM*jnrC;
706             j_coord_offsetD  = DIM*jnrD;
707
708             /* load j atom coordinates */
709             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
710                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
711                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
712
713             /* Calculate displacement vector */
714             dx11             = _mm_sub_ps(ix1,jx1);
715             dy11             = _mm_sub_ps(iy1,jy1);
716             dz11             = _mm_sub_ps(iz1,jz1);
717             dx12             = _mm_sub_ps(ix1,jx2);
718             dy12             = _mm_sub_ps(iy1,jy2);
719             dz12             = _mm_sub_ps(iz1,jz2);
720             dx13             = _mm_sub_ps(ix1,jx3);
721             dy13             = _mm_sub_ps(iy1,jy3);
722             dz13             = _mm_sub_ps(iz1,jz3);
723             dx21             = _mm_sub_ps(ix2,jx1);
724             dy21             = _mm_sub_ps(iy2,jy1);
725             dz21             = _mm_sub_ps(iz2,jz1);
726             dx22             = _mm_sub_ps(ix2,jx2);
727             dy22             = _mm_sub_ps(iy2,jy2);
728             dz22             = _mm_sub_ps(iz2,jz2);
729             dx23             = _mm_sub_ps(ix2,jx3);
730             dy23             = _mm_sub_ps(iy2,jy3);
731             dz23             = _mm_sub_ps(iz2,jz3);
732             dx31             = _mm_sub_ps(ix3,jx1);
733             dy31             = _mm_sub_ps(iy3,jy1);
734             dz31             = _mm_sub_ps(iz3,jz1);
735             dx32             = _mm_sub_ps(ix3,jx2);
736             dy32             = _mm_sub_ps(iy3,jy2);
737             dz32             = _mm_sub_ps(iz3,jz2);
738             dx33             = _mm_sub_ps(ix3,jx3);
739             dy33             = _mm_sub_ps(iy3,jy3);
740             dz33             = _mm_sub_ps(iz3,jz3);
741
742             /* Calculate squared distance and things based on it */
743             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
744             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
745             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
746             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
747             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
748             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
749             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
750             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
751             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
752
753             rinv11           = gmx_mm_invsqrt_ps(rsq11);
754             rinv12           = gmx_mm_invsqrt_ps(rsq12);
755             rinv13           = gmx_mm_invsqrt_ps(rsq13);
756             rinv21           = gmx_mm_invsqrt_ps(rsq21);
757             rinv22           = gmx_mm_invsqrt_ps(rsq22);
758             rinv23           = gmx_mm_invsqrt_ps(rsq23);
759             rinv31           = gmx_mm_invsqrt_ps(rsq31);
760             rinv32           = gmx_mm_invsqrt_ps(rsq32);
761             rinv33           = gmx_mm_invsqrt_ps(rsq33);
762
763             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
764             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
765             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
766             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
767             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
768             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
769             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
770             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
771             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
772
773             fjx1             = _mm_setzero_ps();
774             fjy1             = _mm_setzero_ps();
775             fjz1             = _mm_setzero_ps();
776             fjx2             = _mm_setzero_ps();
777             fjy2             = _mm_setzero_ps();
778             fjz2             = _mm_setzero_ps();
779             fjx3             = _mm_setzero_ps();
780             fjy3             = _mm_setzero_ps();
781             fjz3             = _mm_setzero_ps();
782
783             /**************************
784              * CALCULATE INTERACTIONS *
785              **************************/
786
787             if (gmx_mm_any_lt(rsq11,rcutoff2))
788             {
789
790             r11              = _mm_mul_ps(rsq11,rinv11);
791             r11              = _mm_andnot_ps(dummy_mask,r11);
792
793             /* EWALD ELECTROSTATICS */
794
795             /* Analytical PME correction */
796             zeta2            = _mm_mul_ps(beta2,rsq11);
797             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
798             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
799             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
800             felec            = _mm_mul_ps(qq11,felec);
801             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
802             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
803             velec            = _mm_mul_ps(qq11,velec);
804
805             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
806
807             /* Update potential sum for this i atom from the interaction with this j atom. */
808             velec            = _mm_and_ps(velec,cutoff_mask);
809             velec            = _mm_andnot_ps(dummy_mask,velec);
810             velecsum         = _mm_add_ps(velecsum,velec);
811
812             fscal            = felec;
813
814             fscal            = _mm_and_ps(fscal,cutoff_mask);
815
816             fscal            = _mm_andnot_ps(dummy_mask,fscal);
817
818              /* Update vectorial force */
819             fix1             = _mm_macc_ps(dx11,fscal,fix1);
820             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
821             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
822
823             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
824             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
825             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
826
827             }
828
829             /**************************
830              * CALCULATE INTERACTIONS *
831              **************************/
832
833             if (gmx_mm_any_lt(rsq12,rcutoff2))
834             {
835
836             r12              = _mm_mul_ps(rsq12,rinv12);
837             r12              = _mm_andnot_ps(dummy_mask,r12);
838
839             /* EWALD ELECTROSTATICS */
840
841             /* Analytical PME correction */
842             zeta2            = _mm_mul_ps(beta2,rsq12);
843             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
844             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
845             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
846             felec            = _mm_mul_ps(qq12,felec);
847             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
848             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
849             velec            = _mm_mul_ps(qq12,velec);
850
851             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
852
853             /* Update potential sum for this i atom from the interaction with this j atom. */
854             velec            = _mm_and_ps(velec,cutoff_mask);
855             velec            = _mm_andnot_ps(dummy_mask,velec);
856             velecsum         = _mm_add_ps(velecsum,velec);
857
858             fscal            = felec;
859
860             fscal            = _mm_and_ps(fscal,cutoff_mask);
861
862             fscal            = _mm_andnot_ps(dummy_mask,fscal);
863
864              /* Update vectorial force */
865             fix1             = _mm_macc_ps(dx12,fscal,fix1);
866             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
867             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
868
869             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
870             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
871             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
872
873             }
874
875             /**************************
876              * CALCULATE INTERACTIONS *
877              **************************/
878
879             if (gmx_mm_any_lt(rsq13,rcutoff2))
880             {
881
882             r13              = _mm_mul_ps(rsq13,rinv13);
883             r13              = _mm_andnot_ps(dummy_mask,r13);
884
885             /* EWALD ELECTROSTATICS */
886
887             /* Analytical PME correction */
888             zeta2            = _mm_mul_ps(beta2,rsq13);
889             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
890             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
891             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
892             felec            = _mm_mul_ps(qq13,felec);
893             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
894             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv13,sh_ewald));
895             velec            = _mm_mul_ps(qq13,velec);
896
897             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
898
899             /* Update potential sum for this i atom from the interaction with this j atom. */
900             velec            = _mm_and_ps(velec,cutoff_mask);
901             velec            = _mm_andnot_ps(dummy_mask,velec);
902             velecsum         = _mm_add_ps(velecsum,velec);
903
904             fscal            = felec;
905
906             fscal            = _mm_and_ps(fscal,cutoff_mask);
907
908             fscal            = _mm_andnot_ps(dummy_mask,fscal);
909
910              /* Update vectorial force */
911             fix1             = _mm_macc_ps(dx13,fscal,fix1);
912             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
913             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
914
915             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
916             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
917             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
918
919             }
920
921             /**************************
922              * CALCULATE INTERACTIONS *
923              **************************/
924
925             if (gmx_mm_any_lt(rsq21,rcutoff2))
926             {
927
928             r21              = _mm_mul_ps(rsq21,rinv21);
929             r21              = _mm_andnot_ps(dummy_mask,r21);
930
931             /* EWALD ELECTROSTATICS */
932
933             /* Analytical PME correction */
934             zeta2            = _mm_mul_ps(beta2,rsq21);
935             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
936             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
937             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
938             felec            = _mm_mul_ps(qq21,felec);
939             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
940             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
941             velec            = _mm_mul_ps(qq21,velec);
942
943             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
944
945             /* Update potential sum for this i atom from the interaction with this j atom. */
946             velec            = _mm_and_ps(velec,cutoff_mask);
947             velec            = _mm_andnot_ps(dummy_mask,velec);
948             velecsum         = _mm_add_ps(velecsum,velec);
949
950             fscal            = felec;
951
952             fscal            = _mm_and_ps(fscal,cutoff_mask);
953
954             fscal            = _mm_andnot_ps(dummy_mask,fscal);
955
956              /* Update vectorial force */
957             fix2             = _mm_macc_ps(dx21,fscal,fix2);
958             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
959             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
960
961             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
962             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
963             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
964
965             }
966
967             /**************************
968              * CALCULATE INTERACTIONS *
969              **************************/
970
971             if (gmx_mm_any_lt(rsq22,rcutoff2))
972             {
973
974             r22              = _mm_mul_ps(rsq22,rinv22);
975             r22              = _mm_andnot_ps(dummy_mask,r22);
976
977             /* EWALD ELECTROSTATICS */
978
979             /* Analytical PME correction */
980             zeta2            = _mm_mul_ps(beta2,rsq22);
981             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
982             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
983             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
984             felec            = _mm_mul_ps(qq22,felec);
985             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
986             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
987             velec            = _mm_mul_ps(qq22,velec);
988
989             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
990
991             /* Update potential sum for this i atom from the interaction with this j atom. */
992             velec            = _mm_and_ps(velec,cutoff_mask);
993             velec            = _mm_andnot_ps(dummy_mask,velec);
994             velecsum         = _mm_add_ps(velecsum,velec);
995
996             fscal            = felec;
997
998             fscal            = _mm_and_ps(fscal,cutoff_mask);
999
1000             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1001
1002              /* Update vectorial force */
1003             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1004             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1005             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1006
1007             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1008             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1009             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1010
1011             }
1012
1013             /**************************
1014              * CALCULATE INTERACTIONS *
1015              **************************/
1016
1017             if (gmx_mm_any_lt(rsq23,rcutoff2))
1018             {
1019
1020             r23              = _mm_mul_ps(rsq23,rinv23);
1021             r23              = _mm_andnot_ps(dummy_mask,r23);
1022
1023             /* EWALD ELECTROSTATICS */
1024
1025             /* Analytical PME correction */
1026             zeta2            = _mm_mul_ps(beta2,rsq23);
1027             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1028             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1029             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1030             felec            = _mm_mul_ps(qq23,felec);
1031             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1032             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv23,sh_ewald));
1033             velec            = _mm_mul_ps(qq23,velec);
1034
1035             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1036
1037             /* Update potential sum for this i atom from the interaction with this j atom. */
1038             velec            = _mm_and_ps(velec,cutoff_mask);
1039             velec            = _mm_andnot_ps(dummy_mask,velec);
1040             velecsum         = _mm_add_ps(velecsum,velec);
1041
1042             fscal            = felec;
1043
1044             fscal            = _mm_and_ps(fscal,cutoff_mask);
1045
1046             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1047
1048              /* Update vectorial force */
1049             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1050             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1051             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1052
1053             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1054             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1055             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1056
1057             }
1058
1059             /**************************
1060              * CALCULATE INTERACTIONS *
1061              **************************/
1062
1063             if (gmx_mm_any_lt(rsq31,rcutoff2))
1064             {
1065
1066             r31              = _mm_mul_ps(rsq31,rinv31);
1067             r31              = _mm_andnot_ps(dummy_mask,r31);
1068
1069             /* EWALD ELECTROSTATICS */
1070
1071             /* Analytical PME correction */
1072             zeta2            = _mm_mul_ps(beta2,rsq31);
1073             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1074             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1075             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1076             felec            = _mm_mul_ps(qq31,felec);
1077             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1078             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv31,sh_ewald));
1079             velec            = _mm_mul_ps(qq31,velec);
1080
1081             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1082
1083             /* Update potential sum for this i atom from the interaction with this j atom. */
1084             velec            = _mm_and_ps(velec,cutoff_mask);
1085             velec            = _mm_andnot_ps(dummy_mask,velec);
1086             velecsum         = _mm_add_ps(velecsum,velec);
1087
1088             fscal            = felec;
1089
1090             fscal            = _mm_and_ps(fscal,cutoff_mask);
1091
1092             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1093
1094              /* Update vectorial force */
1095             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1096             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1097             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1098
1099             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1100             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1101             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1102
1103             }
1104
1105             /**************************
1106              * CALCULATE INTERACTIONS *
1107              **************************/
1108
1109             if (gmx_mm_any_lt(rsq32,rcutoff2))
1110             {
1111
1112             r32              = _mm_mul_ps(rsq32,rinv32);
1113             r32              = _mm_andnot_ps(dummy_mask,r32);
1114
1115             /* EWALD ELECTROSTATICS */
1116
1117             /* Analytical PME correction */
1118             zeta2            = _mm_mul_ps(beta2,rsq32);
1119             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1120             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1121             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1122             felec            = _mm_mul_ps(qq32,felec);
1123             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1124             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv32,sh_ewald));
1125             velec            = _mm_mul_ps(qq32,velec);
1126
1127             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1128
1129             /* Update potential sum for this i atom from the interaction with this j atom. */
1130             velec            = _mm_and_ps(velec,cutoff_mask);
1131             velec            = _mm_andnot_ps(dummy_mask,velec);
1132             velecsum         = _mm_add_ps(velecsum,velec);
1133
1134             fscal            = felec;
1135
1136             fscal            = _mm_and_ps(fscal,cutoff_mask);
1137
1138             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1139
1140              /* Update vectorial force */
1141             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1142             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1143             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1144
1145             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1146             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1147             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1148
1149             }
1150
1151             /**************************
1152              * CALCULATE INTERACTIONS *
1153              **************************/
1154
1155             if (gmx_mm_any_lt(rsq33,rcutoff2))
1156             {
1157
1158             r33              = _mm_mul_ps(rsq33,rinv33);
1159             r33              = _mm_andnot_ps(dummy_mask,r33);
1160
1161             /* EWALD ELECTROSTATICS */
1162
1163             /* Analytical PME correction */
1164             zeta2            = _mm_mul_ps(beta2,rsq33);
1165             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1166             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1167             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1168             felec            = _mm_mul_ps(qq33,felec);
1169             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1170             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv33,sh_ewald));
1171             velec            = _mm_mul_ps(qq33,velec);
1172
1173             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1174
1175             /* Update potential sum for this i atom from the interaction with this j atom. */
1176             velec            = _mm_and_ps(velec,cutoff_mask);
1177             velec            = _mm_andnot_ps(dummy_mask,velec);
1178             velecsum         = _mm_add_ps(velecsum,velec);
1179
1180             fscal            = felec;
1181
1182             fscal            = _mm_and_ps(fscal,cutoff_mask);
1183
1184             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1185
1186              /* Update vectorial force */
1187             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1188             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1189             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1190
1191             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1192             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1193             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1194
1195             }
1196
1197             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1198             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1199             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1200             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1201
1202             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1203                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1204
1205             /* Inner loop uses 306 flops */
1206         }
1207
1208         /* End of innermost loop */
1209
1210         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1211                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1212
1213         ggid                        = gid[iidx];
1214         /* Update potential energies */
1215         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1216
1217         /* Increment number of inner iterations */
1218         inneriter                  += j_index_end - j_index_start;
1219
1220         /* Outer loop uses 19 flops */
1221     }
1222
1223     /* Increment number of outer iterations */
1224     outeriter        += nri;
1225
1226     /* Update outer/inner flops */
1227
1228     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*306);
1229 }
1230 /*
1231  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_single
1232  * Electrostatics interaction: Ewald
1233  * VdW interaction:            None
1234  * Geometry:                   Water4-Water4
1235  * Calculate force/pot:        Force
1236  */
1237 void
1238 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_avx_128_fma_single
1239                     (t_nblist                    * gmx_restrict       nlist,
1240                      rvec                        * gmx_restrict          xx,
1241                      rvec                        * gmx_restrict          ff,
1242                      t_forcerec                  * gmx_restrict          fr,
1243                      t_mdatoms                   * gmx_restrict     mdatoms,
1244                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1245                      t_nrnb                      * gmx_restrict        nrnb)
1246 {
1247     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1248      * just 0 for non-waters.
1249      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1250      * jnr indices corresponding to data put in the four positions in the SIMD register.
1251      */
1252     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1253     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1254     int              jnrA,jnrB,jnrC,jnrD;
1255     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1256     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1257     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1258     real             rcutoff_scalar;
1259     real             *shiftvec,*fshift,*x,*f;
1260     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1261     real             scratch[4*DIM];
1262     __m128           fscal,rcutoff,rcutoff2,jidxall;
1263     int              vdwioffset1;
1264     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1265     int              vdwioffset2;
1266     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1267     int              vdwioffset3;
1268     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1269     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1270     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1271     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1272     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1273     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1274     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1275     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1276     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1277     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1278     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1279     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1280     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1281     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1282     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1283     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1284     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1285     real             *charge;
1286     __m128i          ewitab;
1287     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1288     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1289     real             *ewtab;
1290     __m128           dummy_mask,cutoff_mask;
1291     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1292     __m128           one     = _mm_set1_ps(1.0);
1293     __m128           two     = _mm_set1_ps(2.0);
1294     x                = xx[0];
1295     f                = ff[0];
1296
1297     nri              = nlist->nri;
1298     iinr             = nlist->iinr;
1299     jindex           = nlist->jindex;
1300     jjnr             = nlist->jjnr;
1301     shiftidx         = nlist->shift;
1302     gid              = nlist->gid;
1303     shiftvec         = fr->shift_vec[0];
1304     fshift           = fr->fshift[0];
1305     facel            = _mm_set1_ps(fr->epsfac);
1306     charge           = mdatoms->chargeA;
1307
1308     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1309     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
1310     beta2            = _mm_mul_ps(beta,beta);
1311     beta3            = _mm_mul_ps(beta,beta2);
1312     ewtab            = fr->ic->tabq_coul_F;
1313     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1314     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1315
1316     /* Setup water-specific parameters */
1317     inr              = nlist->iinr[0];
1318     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1319     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1320     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1321
1322     jq1              = _mm_set1_ps(charge[inr+1]);
1323     jq2              = _mm_set1_ps(charge[inr+2]);
1324     jq3              = _mm_set1_ps(charge[inr+3]);
1325     qq11             = _mm_mul_ps(iq1,jq1);
1326     qq12             = _mm_mul_ps(iq1,jq2);
1327     qq13             = _mm_mul_ps(iq1,jq3);
1328     qq21             = _mm_mul_ps(iq2,jq1);
1329     qq22             = _mm_mul_ps(iq2,jq2);
1330     qq23             = _mm_mul_ps(iq2,jq3);
1331     qq31             = _mm_mul_ps(iq3,jq1);
1332     qq32             = _mm_mul_ps(iq3,jq2);
1333     qq33             = _mm_mul_ps(iq3,jq3);
1334
1335     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1336     rcutoff_scalar   = fr->rcoulomb;
1337     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1338     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1339
1340     /* Avoid stupid compiler warnings */
1341     jnrA = jnrB = jnrC = jnrD = 0;
1342     j_coord_offsetA = 0;
1343     j_coord_offsetB = 0;
1344     j_coord_offsetC = 0;
1345     j_coord_offsetD = 0;
1346
1347     outeriter        = 0;
1348     inneriter        = 0;
1349
1350     for(iidx=0;iidx<4*DIM;iidx++)
1351     {
1352         scratch[iidx] = 0.0;
1353     }
1354
1355     /* Start outer loop over neighborlists */
1356     for(iidx=0; iidx<nri; iidx++)
1357     {
1358         /* Load shift vector for this list */
1359         i_shift_offset   = DIM*shiftidx[iidx];
1360
1361         /* Load limits for loop over neighbors */
1362         j_index_start    = jindex[iidx];
1363         j_index_end      = jindex[iidx+1];
1364
1365         /* Get outer coordinate index */
1366         inr              = iinr[iidx];
1367         i_coord_offset   = DIM*inr;
1368
1369         /* Load i particle coords and add shift vector */
1370         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1371                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1372
1373         fix1             = _mm_setzero_ps();
1374         fiy1             = _mm_setzero_ps();
1375         fiz1             = _mm_setzero_ps();
1376         fix2             = _mm_setzero_ps();
1377         fiy2             = _mm_setzero_ps();
1378         fiz2             = _mm_setzero_ps();
1379         fix3             = _mm_setzero_ps();
1380         fiy3             = _mm_setzero_ps();
1381         fiz3             = _mm_setzero_ps();
1382
1383         /* Start inner kernel loop */
1384         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1385         {
1386
1387             /* Get j neighbor index, and coordinate index */
1388             jnrA             = jjnr[jidx];
1389             jnrB             = jjnr[jidx+1];
1390             jnrC             = jjnr[jidx+2];
1391             jnrD             = jjnr[jidx+3];
1392             j_coord_offsetA  = DIM*jnrA;
1393             j_coord_offsetB  = DIM*jnrB;
1394             j_coord_offsetC  = DIM*jnrC;
1395             j_coord_offsetD  = DIM*jnrD;
1396
1397             /* load j atom coordinates */
1398             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1399                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1400                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1401
1402             /* Calculate displacement vector */
1403             dx11             = _mm_sub_ps(ix1,jx1);
1404             dy11             = _mm_sub_ps(iy1,jy1);
1405             dz11             = _mm_sub_ps(iz1,jz1);
1406             dx12             = _mm_sub_ps(ix1,jx2);
1407             dy12             = _mm_sub_ps(iy1,jy2);
1408             dz12             = _mm_sub_ps(iz1,jz2);
1409             dx13             = _mm_sub_ps(ix1,jx3);
1410             dy13             = _mm_sub_ps(iy1,jy3);
1411             dz13             = _mm_sub_ps(iz1,jz3);
1412             dx21             = _mm_sub_ps(ix2,jx1);
1413             dy21             = _mm_sub_ps(iy2,jy1);
1414             dz21             = _mm_sub_ps(iz2,jz1);
1415             dx22             = _mm_sub_ps(ix2,jx2);
1416             dy22             = _mm_sub_ps(iy2,jy2);
1417             dz22             = _mm_sub_ps(iz2,jz2);
1418             dx23             = _mm_sub_ps(ix2,jx3);
1419             dy23             = _mm_sub_ps(iy2,jy3);
1420             dz23             = _mm_sub_ps(iz2,jz3);
1421             dx31             = _mm_sub_ps(ix3,jx1);
1422             dy31             = _mm_sub_ps(iy3,jy1);
1423             dz31             = _mm_sub_ps(iz3,jz1);
1424             dx32             = _mm_sub_ps(ix3,jx2);
1425             dy32             = _mm_sub_ps(iy3,jy2);
1426             dz32             = _mm_sub_ps(iz3,jz2);
1427             dx33             = _mm_sub_ps(ix3,jx3);
1428             dy33             = _mm_sub_ps(iy3,jy3);
1429             dz33             = _mm_sub_ps(iz3,jz3);
1430
1431             /* Calculate squared distance and things based on it */
1432             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1433             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1434             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1435             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1436             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1437             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1438             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1439             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1440             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1441
1442             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1443             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1444             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1445             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1446             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1447             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1448             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1449             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1450             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1451
1452             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1453             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1454             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1455             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1456             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1457             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1458             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1459             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1460             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1461
1462             fjx1             = _mm_setzero_ps();
1463             fjy1             = _mm_setzero_ps();
1464             fjz1             = _mm_setzero_ps();
1465             fjx2             = _mm_setzero_ps();
1466             fjy2             = _mm_setzero_ps();
1467             fjz2             = _mm_setzero_ps();
1468             fjx3             = _mm_setzero_ps();
1469             fjy3             = _mm_setzero_ps();
1470             fjz3             = _mm_setzero_ps();
1471
1472             /**************************
1473              * CALCULATE INTERACTIONS *
1474              **************************/
1475
1476             if (gmx_mm_any_lt(rsq11,rcutoff2))
1477             {
1478
1479             r11              = _mm_mul_ps(rsq11,rinv11);
1480
1481             /* EWALD ELECTROSTATICS */
1482
1483             /* Analytical PME correction */
1484             zeta2            = _mm_mul_ps(beta2,rsq11);
1485             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1486             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1487             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1488             felec            = _mm_mul_ps(qq11,felec);
1489
1490             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1491
1492             fscal            = felec;
1493
1494             fscal            = _mm_and_ps(fscal,cutoff_mask);
1495
1496              /* Update vectorial force */
1497             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1498             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1499             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1500
1501             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1502             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1503             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1504
1505             }
1506
1507             /**************************
1508              * CALCULATE INTERACTIONS *
1509              **************************/
1510
1511             if (gmx_mm_any_lt(rsq12,rcutoff2))
1512             {
1513
1514             r12              = _mm_mul_ps(rsq12,rinv12);
1515
1516             /* EWALD ELECTROSTATICS */
1517
1518             /* Analytical PME correction */
1519             zeta2            = _mm_mul_ps(beta2,rsq12);
1520             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1521             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1522             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1523             felec            = _mm_mul_ps(qq12,felec);
1524
1525             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1526
1527             fscal            = felec;
1528
1529             fscal            = _mm_and_ps(fscal,cutoff_mask);
1530
1531              /* Update vectorial force */
1532             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1533             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1534             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1535
1536             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1537             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1538             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1539
1540             }
1541
1542             /**************************
1543              * CALCULATE INTERACTIONS *
1544              **************************/
1545
1546             if (gmx_mm_any_lt(rsq13,rcutoff2))
1547             {
1548
1549             r13              = _mm_mul_ps(rsq13,rinv13);
1550
1551             /* EWALD ELECTROSTATICS */
1552
1553             /* Analytical PME correction */
1554             zeta2            = _mm_mul_ps(beta2,rsq13);
1555             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1556             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1557             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1558             felec            = _mm_mul_ps(qq13,felec);
1559
1560             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1561
1562             fscal            = felec;
1563
1564             fscal            = _mm_and_ps(fscal,cutoff_mask);
1565
1566              /* Update vectorial force */
1567             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1568             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1569             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1570
1571             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1572             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1573             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1574
1575             }
1576
1577             /**************************
1578              * CALCULATE INTERACTIONS *
1579              **************************/
1580
1581             if (gmx_mm_any_lt(rsq21,rcutoff2))
1582             {
1583
1584             r21              = _mm_mul_ps(rsq21,rinv21);
1585
1586             /* EWALD ELECTROSTATICS */
1587
1588             /* Analytical PME correction */
1589             zeta2            = _mm_mul_ps(beta2,rsq21);
1590             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1591             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1592             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1593             felec            = _mm_mul_ps(qq21,felec);
1594
1595             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1596
1597             fscal            = felec;
1598
1599             fscal            = _mm_and_ps(fscal,cutoff_mask);
1600
1601              /* Update vectorial force */
1602             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1603             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1604             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1605
1606             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1607             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1608             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1609
1610             }
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             if (gmx_mm_any_lt(rsq22,rcutoff2))
1617             {
1618
1619             r22              = _mm_mul_ps(rsq22,rinv22);
1620
1621             /* EWALD ELECTROSTATICS */
1622
1623             /* Analytical PME correction */
1624             zeta2            = _mm_mul_ps(beta2,rsq22);
1625             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1626             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1627             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1628             felec            = _mm_mul_ps(qq22,felec);
1629
1630             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1631
1632             fscal            = felec;
1633
1634             fscal            = _mm_and_ps(fscal,cutoff_mask);
1635
1636              /* Update vectorial force */
1637             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1638             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1639             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1640
1641             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1642             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1643             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1644
1645             }
1646
1647             /**************************
1648              * CALCULATE INTERACTIONS *
1649              **************************/
1650
1651             if (gmx_mm_any_lt(rsq23,rcutoff2))
1652             {
1653
1654             r23              = _mm_mul_ps(rsq23,rinv23);
1655
1656             /* EWALD ELECTROSTATICS */
1657
1658             /* Analytical PME correction */
1659             zeta2            = _mm_mul_ps(beta2,rsq23);
1660             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1661             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1662             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1663             felec            = _mm_mul_ps(qq23,felec);
1664
1665             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1666
1667             fscal            = felec;
1668
1669             fscal            = _mm_and_ps(fscal,cutoff_mask);
1670
1671              /* Update vectorial force */
1672             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1673             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1674             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1675
1676             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1677             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1678             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1679
1680             }
1681
1682             /**************************
1683              * CALCULATE INTERACTIONS *
1684              **************************/
1685
1686             if (gmx_mm_any_lt(rsq31,rcutoff2))
1687             {
1688
1689             r31              = _mm_mul_ps(rsq31,rinv31);
1690
1691             /* EWALD ELECTROSTATICS */
1692
1693             /* Analytical PME correction */
1694             zeta2            = _mm_mul_ps(beta2,rsq31);
1695             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1696             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1697             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1698             felec            = _mm_mul_ps(qq31,felec);
1699
1700             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1701
1702             fscal            = felec;
1703
1704             fscal            = _mm_and_ps(fscal,cutoff_mask);
1705
1706              /* Update vectorial force */
1707             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1708             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1709             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1710
1711             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1712             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1713             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1714
1715             }
1716
1717             /**************************
1718              * CALCULATE INTERACTIONS *
1719              **************************/
1720
1721             if (gmx_mm_any_lt(rsq32,rcutoff2))
1722             {
1723
1724             r32              = _mm_mul_ps(rsq32,rinv32);
1725
1726             /* EWALD ELECTROSTATICS */
1727
1728             /* Analytical PME correction */
1729             zeta2            = _mm_mul_ps(beta2,rsq32);
1730             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1731             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1732             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1733             felec            = _mm_mul_ps(qq32,felec);
1734
1735             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1736
1737             fscal            = felec;
1738
1739             fscal            = _mm_and_ps(fscal,cutoff_mask);
1740
1741              /* Update vectorial force */
1742             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1743             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1744             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1745
1746             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1747             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1748             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1749
1750             }
1751
1752             /**************************
1753              * CALCULATE INTERACTIONS *
1754              **************************/
1755
1756             if (gmx_mm_any_lt(rsq33,rcutoff2))
1757             {
1758
1759             r33              = _mm_mul_ps(rsq33,rinv33);
1760
1761             /* EWALD ELECTROSTATICS */
1762
1763             /* Analytical PME correction */
1764             zeta2            = _mm_mul_ps(beta2,rsq33);
1765             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1766             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1767             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1768             felec            = _mm_mul_ps(qq33,felec);
1769
1770             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1771
1772             fscal            = felec;
1773
1774             fscal            = _mm_and_ps(fscal,cutoff_mask);
1775
1776              /* Update vectorial force */
1777             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1778             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1779             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1780
1781             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1782             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1783             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1784
1785             }
1786
1787             fjptrA             = f+j_coord_offsetA;
1788             fjptrB             = f+j_coord_offsetB;
1789             fjptrC             = f+j_coord_offsetC;
1790             fjptrD             = f+j_coord_offsetD;
1791
1792             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1793                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1794
1795             /* Inner loop uses 279 flops */
1796         }
1797
1798         if(jidx<j_index_end)
1799         {
1800
1801             /* Get j neighbor index, and coordinate index */
1802             jnrlistA         = jjnr[jidx];
1803             jnrlistB         = jjnr[jidx+1];
1804             jnrlistC         = jjnr[jidx+2];
1805             jnrlistD         = jjnr[jidx+3];
1806             /* Sign of each element will be negative for non-real atoms.
1807              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1808              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1809              */
1810             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1811             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1812             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1813             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1814             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1815             j_coord_offsetA  = DIM*jnrA;
1816             j_coord_offsetB  = DIM*jnrB;
1817             j_coord_offsetC  = DIM*jnrC;
1818             j_coord_offsetD  = DIM*jnrD;
1819
1820             /* load j atom coordinates */
1821             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1822                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1823                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1824
1825             /* Calculate displacement vector */
1826             dx11             = _mm_sub_ps(ix1,jx1);
1827             dy11             = _mm_sub_ps(iy1,jy1);
1828             dz11             = _mm_sub_ps(iz1,jz1);
1829             dx12             = _mm_sub_ps(ix1,jx2);
1830             dy12             = _mm_sub_ps(iy1,jy2);
1831             dz12             = _mm_sub_ps(iz1,jz2);
1832             dx13             = _mm_sub_ps(ix1,jx3);
1833             dy13             = _mm_sub_ps(iy1,jy3);
1834             dz13             = _mm_sub_ps(iz1,jz3);
1835             dx21             = _mm_sub_ps(ix2,jx1);
1836             dy21             = _mm_sub_ps(iy2,jy1);
1837             dz21             = _mm_sub_ps(iz2,jz1);
1838             dx22             = _mm_sub_ps(ix2,jx2);
1839             dy22             = _mm_sub_ps(iy2,jy2);
1840             dz22             = _mm_sub_ps(iz2,jz2);
1841             dx23             = _mm_sub_ps(ix2,jx3);
1842             dy23             = _mm_sub_ps(iy2,jy3);
1843             dz23             = _mm_sub_ps(iz2,jz3);
1844             dx31             = _mm_sub_ps(ix3,jx1);
1845             dy31             = _mm_sub_ps(iy3,jy1);
1846             dz31             = _mm_sub_ps(iz3,jz1);
1847             dx32             = _mm_sub_ps(ix3,jx2);
1848             dy32             = _mm_sub_ps(iy3,jy2);
1849             dz32             = _mm_sub_ps(iz3,jz2);
1850             dx33             = _mm_sub_ps(ix3,jx3);
1851             dy33             = _mm_sub_ps(iy3,jy3);
1852             dz33             = _mm_sub_ps(iz3,jz3);
1853
1854             /* Calculate squared distance and things based on it */
1855             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1856             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1857             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1858             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1859             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1860             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1861             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1862             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1863             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1864
1865             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1866             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1867             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1868             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1869             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1870             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1871             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1872             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1873             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1874
1875             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1876             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1877             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1878             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1879             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1880             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1881             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1882             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1883             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1884
1885             fjx1             = _mm_setzero_ps();
1886             fjy1             = _mm_setzero_ps();
1887             fjz1             = _mm_setzero_ps();
1888             fjx2             = _mm_setzero_ps();
1889             fjy2             = _mm_setzero_ps();
1890             fjz2             = _mm_setzero_ps();
1891             fjx3             = _mm_setzero_ps();
1892             fjy3             = _mm_setzero_ps();
1893             fjz3             = _mm_setzero_ps();
1894
1895             /**************************
1896              * CALCULATE INTERACTIONS *
1897              **************************/
1898
1899             if (gmx_mm_any_lt(rsq11,rcutoff2))
1900             {
1901
1902             r11              = _mm_mul_ps(rsq11,rinv11);
1903             r11              = _mm_andnot_ps(dummy_mask,r11);
1904
1905             /* EWALD ELECTROSTATICS */
1906
1907             /* Analytical PME correction */
1908             zeta2            = _mm_mul_ps(beta2,rsq11);
1909             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1910             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1911             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1912             felec            = _mm_mul_ps(qq11,felec);
1913
1914             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1915
1916             fscal            = felec;
1917
1918             fscal            = _mm_and_ps(fscal,cutoff_mask);
1919
1920             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1921
1922              /* Update vectorial force */
1923             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1924             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1925             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1926
1927             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1928             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1929             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1930
1931             }
1932
1933             /**************************
1934              * CALCULATE INTERACTIONS *
1935              **************************/
1936
1937             if (gmx_mm_any_lt(rsq12,rcutoff2))
1938             {
1939
1940             r12              = _mm_mul_ps(rsq12,rinv12);
1941             r12              = _mm_andnot_ps(dummy_mask,r12);
1942
1943             /* EWALD ELECTROSTATICS */
1944
1945             /* Analytical PME correction */
1946             zeta2            = _mm_mul_ps(beta2,rsq12);
1947             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1948             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1949             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1950             felec            = _mm_mul_ps(qq12,felec);
1951
1952             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1953
1954             fscal            = felec;
1955
1956             fscal            = _mm_and_ps(fscal,cutoff_mask);
1957
1958             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1959
1960              /* Update vectorial force */
1961             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1962             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1963             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1964
1965             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1966             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1967             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1968
1969             }
1970
1971             /**************************
1972              * CALCULATE INTERACTIONS *
1973              **************************/
1974
1975             if (gmx_mm_any_lt(rsq13,rcutoff2))
1976             {
1977
1978             r13              = _mm_mul_ps(rsq13,rinv13);
1979             r13              = _mm_andnot_ps(dummy_mask,r13);
1980
1981             /* EWALD ELECTROSTATICS */
1982
1983             /* Analytical PME correction */
1984             zeta2            = _mm_mul_ps(beta2,rsq13);
1985             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1986             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1987             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1988             felec            = _mm_mul_ps(qq13,felec);
1989
1990             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1991
1992             fscal            = felec;
1993
1994             fscal            = _mm_and_ps(fscal,cutoff_mask);
1995
1996             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1997
1998              /* Update vectorial force */
1999             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2000             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2001             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2002
2003             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2004             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2005             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2006
2007             }
2008
2009             /**************************
2010              * CALCULATE INTERACTIONS *
2011              **************************/
2012
2013             if (gmx_mm_any_lt(rsq21,rcutoff2))
2014             {
2015
2016             r21              = _mm_mul_ps(rsq21,rinv21);
2017             r21              = _mm_andnot_ps(dummy_mask,r21);
2018
2019             /* EWALD ELECTROSTATICS */
2020
2021             /* Analytical PME correction */
2022             zeta2            = _mm_mul_ps(beta2,rsq21);
2023             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2024             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2025             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2026             felec            = _mm_mul_ps(qq21,felec);
2027
2028             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2029
2030             fscal            = felec;
2031
2032             fscal            = _mm_and_ps(fscal,cutoff_mask);
2033
2034             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2035
2036              /* Update vectorial force */
2037             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2038             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2039             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2040
2041             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2042             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2043             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2044
2045             }
2046
2047             /**************************
2048              * CALCULATE INTERACTIONS *
2049              **************************/
2050
2051             if (gmx_mm_any_lt(rsq22,rcutoff2))
2052             {
2053
2054             r22              = _mm_mul_ps(rsq22,rinv22);
2055             r22              = _mm_andnot_ps(dummy_mask,r22);
2056
2057             /* EWALD ELECTROSTATICS */
2058
2059             /* Analytical PME correction */
2060             zeta2            = _mm_mul_ps(beta2,rsq22);
2061             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2062             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2063             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2064             felec            = _mm_mul_ps(qq22,felec);
2065
2066             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2067
2068             fscal            = felec;
2069
2070             fscal            = _mm_and_ps(fscal,cutoff_mask);
2071
2072             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2073
2074              /* Update vectorial force */
2075             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2076             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2077             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2078
2079             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2080             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2081             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2082
2083             }
2084
2085             /**************************
2086              * CALCULATE INTERACTIONS *
2087              **************************/
2088
2089             if (gmx_mm_any_lt(rsq23,rcutoff2))
2090             {
2091
2092             r23              = _mm_mul_ps(rsq23,rinv23);
2093             r23              = _mm_andnot_ps(dummy_mask,r23);
2094
2095             /* EWALD ELECTROSTATICS */
2096
2097             /* Analytical PME correction */
2098             zeta2            = _mm_mul_ps(beta2,rsq23);
2099             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
2100             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2101             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2102             felec            = _mm_mul_ps(qq23,felec);
2103
2104             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2105
2106             fscal            = felec;
2107
2108             fscal            = _mm_and_ps(fscal,cutoff_mask);
2109
2110             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2111
2112              /* Update vectorial force */
2113             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2114             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2115             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2116
2117             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2118             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2119             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2120
2121             }
2122
2123             /**************************
2124              * CALCULATE INTERACTIONS *
2125              **************************/
2126
2127             if (gmx_mm_any_lt(rsq31,rcutoff2))
2128             {
2129
2130             r31              = _mm_mul_ps(rsq31,rinv31);
2131             r31              = _mm_andnot_ps(dummy_mask,r31);
2132
2133             /* EWALD ELECTROSTATICS */
2134
2135             /* Analytical PME correction */
2136             zeta2            = _mm_mul_ps(beta2,rsq31);
2137             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
2138             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2139             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2140             felec            = _mm_mul_ps(qq31,felec);
2141
2142             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2143
2144             fscal            = felec;
2145
2146             fscal            = _mm_and_ps(fscal,cutoff_mask);
2147
2148             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2149
2150              /* Update vectorial force */
2151             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2152             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2153             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2154
2155             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2156             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2157             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2158
2159             }
2160
2161             /**************************
2162              * CALCULATE INTERACTIONS *
2163              **************************/
2164
2165             if (gmx_mm_any_lt(rsq32,rcutoff2))
2166             {
2167
2168             r32              = _mm_mul_ps(rsq32,rinv32);
2169             r32              = _mm_andnot_ps(dummy_mask,r32);
2170
2171             /* EWALD ELECTROSTATICS */
2172
2173             /* Analytical PME correction */
2174             zeta2            = _mm_mul_ps(beta2,rsq32);
2175             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
2176             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2177             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2178             felec            = _mm_mul_ps(qq32,felec);
2179
2180             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2181
2182             fscal            = felec;
2183
2184             fscal            = _mm_and_ps(fscal,cutoff_mask);
2185
2186             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2187
2188              /* Update vectorial force */
2189             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2190             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2191             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2192
2193             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2194             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2195             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2196
2197             }
2198
2199             /**************************
2200              * CALCULATE INTERACTIONS *
2201              **************************/
2202
2203             if (gmx_mm_any_lt(rsq33,rcutoff2))
2204             {
2205
2206             r33              = _mm_mul_ps(rsq33,rinv33);
2207             r33              = _mm_andnot_ps(dummy_mask,r33);
2208
2209             /* EWALD ELECTROSTATICS */
2210
2211             /* Analytical PME correction */
2212             zeta2            = _mm_mul_ps(beta2,rsq33);
2213             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
2214             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2215             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2216             felec            = _mm_mul_ps(qq33,felec);
2217
2218             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2219
2220             fscal            = felec;
2221
2222             fscal            = _mm_and_ps(fscal,cutoff_mask);
2223
2224             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2225
2226              /* Update vectorial force */
2227             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2228             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2229             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2230
2231             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2232             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2233             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2234
2235             }
2236
2237             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2238             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2239             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2240             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2241
2242             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2243                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2244
2245             /* Inner loop uses 288 flops */
2246         }
2247
2248         /* End of innermost loop */
2249
2250         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2251                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2252
2253         /* Increment number of inner iterations */
2254         inneriter                  += j_index_end - j_index_start;
2255
2256         /* Outer loop uses 18 flops */
2257     }
2258
2259     /* Increment number of outer iterations */
2260     outeriter        += nri;
2261
2262     /* Update outer/inner flops */
2263
2264     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*288);
2265 }