39ee685c841252201cdf4dd4e0e5e2dec8815264
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128i          ewitab;
113     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114     real             *ewtab;
115     __m128           dummy_mask,cutoff_mask;
116     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117     __m128           one     = _mm_set1_ps(1.0);
118     __m128           two     = _mm_set1_ps(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm_set1_ps(fr->epsfac);
131     charge           = mdatoms->chargeA;
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
137     ewtab            = fr->ic->tabq_coul_FDV0;
138     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
139     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
140
141     /* Setup water-specific parameters */
142     inr              = nlist->iinr[0];
143     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
144     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
145     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
146     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
147
148     jq0              = _mm_set1_ps(charge[inr+0]);
149     jq1              = _mm_set1_ps(charge[inr+1]);
150     jq2              = _mm_set1_ps(charge[inr+2]);
151     vdwjidx0A        = 2*vdwtype[inr+0];
152     qq00             = _mm_mul_ps(iq0,jq0);
153     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
154     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
155     qq01             = _mm_mul_ps(iq0,jq1);
156     qq02             = _mm_mul_ps(iq0,jq2);
157     qq10             = _mm_mul_ps(iq1,jq0);
158     qq11             = _mm_mul_ps(iq1,jq1);
159     qq12             = _mm_mul_ps(iq1,jq2);
160     qq20             = _mm_mul_ps(iq2,jq0);
161     qq21             = _mm_mul_ps(iq2,jq1);
162     qq22             = _mm_mul_ps(iq2,jq2);
163
164     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165     rcutoff_scalar   = fr->rcoulomb;
166     rcutoff          = _mm_set1_ps(rcutoff_scalar);
167     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
168
169     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
170     rvdw             = _mm_set1_ps(fr->rvdw);
171
172     /* Avoid stupid compiler warnings */
173     jnrA = jnrB = jnrC = jnrD = 0;
174     j_coord_offsetA = 0;
175     j_coord_offsetB = 0;
176     j_coord_offsetC = 0;
177     j_coord_offsetD = 0;
178
179     outeriter        = 0;
180     inneriter        = 0;
181
182     for(iidx=0;iidx<4*DIM;iidx++)
183     {
184         scratch[iidx] = 0.0;
185     }
186
187     /* Start outer loop over neighborlists */
188     for(iidx=0; iidx<nri; iidx++)
189     {
190         /* Load shift vector for this list */
191         i_shift_offset   = DIM*shiftidx[iidx];
192
193         /* Load limits for loop over neighbors */
194         j_index_start    = jindex[iidx];
195         j_index_end      = jindex[iidx+1];
196
197         /* Get outer coordinate index */
198         inr              = iinr[iidx];
199         i_coord_offset   = DIM*inr;
200
201         /* Load i particle coords and add shift vector */
202         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
203                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
204
205         fix0             = _mm_setzero_ps();
206         fiy0             = _mm_setzero_ps();
207         fiz0             = _mm_setzero_ps();
208         fix1             = _mm_setzero_ps();
209         fiy1             = _mm_setzero_ps();
210         fiz1             = _mm_setzero_ps();
211         fix2             = _mm_setzero_ps();
212         fiy2             = _mm_setzero_ps();
213         fiz2             = _mm_setzero_ps();
214
215         /* Reset potential sums */
216         velecsum         = _mm_setzero_ps();
217         vvdwsum          = _mm_setzero_ps();
218
219         /* Start inner kernel loop */
220         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
221         {
222
223             /* Get j neighbor index, and coordinate index */
224             jnrA             = jjnr[jidx];
225             jnrB             = jjnr[jidx+1];
226             jnrC             = jjnr[jidx+2];
227             jnrD             = jjnr[jidx+3];
228             j_coord_offsetA  = DIM*jnrA;
229             j_coord_offsetB  = DIM*jnrB;
230             j_coord_offsetC  = DIM*jnrC;
231             j_coord_offsetD  = DIM*jnrD;
232
233             /* load j atom coordinates */
234             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
235                                               x+j_coord_offsetC,x+j_coord_offsetD,
236                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
237
238             /* Calculate displacement vector */
239             dx00             = _mm_sub_ps(ix0,jx0);
240             dy00             = _mm_sub_ps(iy0,jy0);
241             dz00             = _mm_sub_ps(iz0,jz0);
242             dx01             = _mm_sub_ps(ix0,jx1);
243             dy01             = _mm_sub_ps(iy0,jy1);
244             dz01             = _mm_sub_ps(iz0,jz1);
245             dx02             = _mm_sub_ps(ix0,jx2);
246             dy02             = _mm_sub_ps(iy0,jy2);
247             dz02             = _mm_sub_ps(iz0,jz2);
248             dx10             = _mm_sub_ps(ix1,jx0);
249             dy10             = _mm_sub_ps(iy1,jy0);
250             dz10             = _mm_sub_ps(iz1,jz0);
251             dx11             = _mm_sub_ps(ix1,jx1);
252             dy11             = _mm_sub_ps(iy1,jy1);
253             dz11             = _mm_sub_ps(iz1,jz1);
254             dx12             = _mm_sub_ps(ix1,jx2);
255             dy12             = _mm_sub_ps(iy1,jy2);
256             dz12             = _mm_sub_ps(iz1,jz2);
257             dx20             = _mm_sub_ps(ix2,jx0);
258             dy20             = _mm_sub_ps(iy2,jy0);
259             dz20             = _mm_sub_ps(iz2,jz0);
260             dx21             = _mm_sub_ps(ix2,jx1);
261             dy21             = _mm_sub_ps(iy2,jy1);
262             dz21             = _mm_sub_ps(iz2,jz1);
263             dx22             = _mm_sub_ps(ix2,jx2);
264             dy22             = _mm_sub_ps(iy2,jy2);
265             dz22             = _mm_sub_ps(iz2,jz2);
266
267             /* Calculate squared distance and things based on it */
268             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
269             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
270             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
271             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
272             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
275             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
277
278             rinv00           = gmx_mm_invsqrt_ps(rsq00);
279             rinv01           = gmx_mm_invsqrt_ps(rsq01);
280             rinv02           = gmx_mm_invsqrt_ps(rsq02);
281             rinv10           = gmx_mm_invsqrt_ps(rsq10);
282             rinv11           = gmx_mm_invsqrt_ps(rsq11);
283             rinv12           = gmx_mm_invsqrt_ps(rsq12);
284             rinv20           = gmx_mm_invsqrt_ps(rsq20);
285             rinv21           = gmx_mm_invsqrt_ps(rsq21);
286             rinv22           = gmx_mm_invsqrt_ps(rsq22);
287
288             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
289             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
290             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
291             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
292             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
293             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
294             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
295             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
296             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
297
298             fjx0             = _mm_setzero_ps();
299             fjy0             = _mm_setzero_ps();
300             fjz0             = _mm_setzero_ps();
301             fjx1             = _mm_setzero_ps();
302             fjy1             = _mm_setzero_ps();
303             fjz1             = _mm_setzero_ps();
304             fjx2             = _mm_setzero_ps();
305             fjy2             = _mm_setzero_ps();
306             fjz2             = _mm_setzero_ps();
307
308             /**************************
309              * CALCULATE INTERACTIONS *
310              **************************/
311
312             if (gmx_mm_any_lt(rsq00,rcutoff2))
313             {
314
315             r00              = _mm_mul_ps(rsq00,rinv00);
316
317             /* EWALD ELECTROSTATICS */
318
319             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
320             ewrt             = _mm_mul_ps(r00,ewtabscale);
321             ewitab           = _mm_cvttps_epi32(ewrt);
322             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
323             ewitab           = _mm_slli_epi32(ewitab,2);
324             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
325             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
326             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
327             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
328             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
329             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
330             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
331             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
332             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
333
334             /* LENNARD-JONES DISPERSION/REPULSION */
335
336             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
337             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
338             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
339             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
340                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
341             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
342
343             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
344
345             /* Update potential sum for this i atom from the interaction with this j atom. */
346             velec            = _mm_and_ps(velec,cutoff_mask);
347             velecsum         = _mm_add_ps(velecsum,velec);
348             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
349             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
350
351             fscal            = _mm_add_ps(felec,fvdw);
352
353             fscal            = _mm_and_ps(fscal,cutoff_mask);
354
355             /* Calculate temporary vectorial force */
356             tx               = _mm_mul_ps(fscal,dx00);
357             ty               = _mm_mul_ps(fscal,dy00);
358             tz               = _mm_mul_ps(fscal,dz00);
359
360             /* Update vectorial force */
361             fix0             = _mm_add_ps(fix0,tx);
362             fiy0             = _mm_add_ps(fiy0,ty);
363             fiz0             = _mm_add_ps(fiz0,tz);
364
365             fjx0             = _mm_add_ps(fjx0,tx);
366             fjy0             = _mm_add_ps(fjy0,ty);
367             fjz0             = _mm_add_ps(fjz0,tz);
368
369             }
370
371             /**************************
372              * CALCULATE INTERACTIONS *
373              **************************/
374
375             if (gmx_mm_any_lt(rsq01,rcutoff2))
376             {
377
378             r01              = _mm_mul_ps(rsq01,rinv01);
379
380             /* EWALD ELECTROSTATICS */
381
382             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
383             ewrt             = _mm_mul_ps(r01,ewtabscale);
384             ewitab           = _mm_cvttps_epi32(ewrt);
385             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
386             ewitab           = _mm_slli_epi32(ewitab,2);
387             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
388             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
389             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
390             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
391             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
392             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
393             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
394             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
395             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
396
397             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
398
399             /* Update potential sum for this i atom from the interaction with this j atom. */
400             velec            = _mm_and_ps(velec,cutoff_mask);
401             velecsum         = _mm_add_ps(velecsum,velec);
402
403             fscal            = felec;
404
405             fscal            = _mm_and_ps(fscal,cutoff_mask);
406
407             /* Calculate temporary vectorial force */
408             tx               = _mm_mul_ps(fscal,dx01);
409             ty               = _mm_mul_ps(fscal,dy01);
410             tz               = _mm_mul_ps(fscal,dz01);
411
412             /* Update vectorial force */
413             fix0             = _mm_add_ps(fix0,tx);
414             fiy0             = _mm_add_ps(fiy0,ty);
415             fiz0             = _mm_add_ps(fiz0,tz);
416
417             fjx1             = _mm_add_ps(fjx1,tx);
418             fjy1             = _mm_add_ps(fjy1,ty);
419             fjz1             = _mm_add_ps(fjz1,tz);
420
421             }
422
423             /**************************
424              * CALCULATE INTERACTIONS *
425              **************************/
426
427             if (gmx_mm_any_lt(rsq02,rcutoff2))
428             {
429
430             r02              = _mm_mul_ps(rsq02,rinv02);
431
432             /* EWALD ELECTROSTATICS */
433
434             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
435             ewrt             = _mm_mul_ps(r02,ewtabscale);
436             ewitab           = _mm_cvttps_epi32(ewrt);
437             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
438             ewitab           = _mm_slli_epi32(ewitab,2);
439             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
440             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
441             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
442             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
443             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
444             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
445             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
446             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
447             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
448
449             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
450
451             /* Update potential sum for this i atom from the interaction with this j atom. */
452             velec            = _mm_and_ps(velec,cutoff_mask);
453             velecsum         = _mm_add_ps(velecsum,velec);
454
455             fscal            = felec;
456
457             fscal            = _mm_and_ps(fscal,cutoff_mask);
458
459             /* Calculate temporary vectorial force */
460             tx               = _mm_mul_ps(fscal,dx02);
461             ty               = _mm_mul_ps(fscal,dy02);
462             tz               = _mm_mul_ps(fscal,dz02);
463
464             /* Update vectorial force */
465             fix0             = _mm_add_ps(fix0,tx);
466             fiy0             = _mm_add_ps(fiy0,ty);
467             fiz0             = _mm_add_ps(fiz0,tz);
468
469             fjx2             = _mm_add_ps(fjx2,tx);
470             fjy2             = _mm_add_ps(fjy2,ty);
471             fjz2             = _mm_add_ps(fjz2,tz);
472
473             }
474
475             /**************************
476              * CALCULATE INTERACTIONS *
477              **************************/
478
479             if (gmx_mm_any_lt(rsq10,rcutoff2))
480             {
481
482             r10              = _mm_mul_ps(rsq10,rinv10);
483
484             /* EWALD ELECTROSTATICS */
485
486             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
487             ewrt             = _mm_mul_ps(r10,ewtabscale);
488             ewitab           = _mm_cvttps_epi32(ewrt);
489             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
490             ewitab           = _mm_slli_epi32(ewitab,2);
491             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
492             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
493             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
494             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
495             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
496             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
497             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
498             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
499             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
500
501             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
502
503             /* Update potential sum for this i atom from the interaction with this j atom. */
504             velec            = _mm_and_ps(velec,cutoff_mask);
505             velecsum         = _mm_add_ps(velecsum,velec);
506
507             fscal            = felec;
508
509             fscal            = _mm_and_ps(fscal,cutoff_mask);
510
511             /* Calculate temporary vectorial force */
512             tx               = _mm_mul_ps(fscal,dx10);
513             ty               = _mm_mul_ps(fscal,dy10);
514             tz               = _mm_mul_ps(fscal,dz10);
515
516             /* Update vectorial force */
517             fix1             = _mm_add_ps(fix1,tx);
518             fiy1             = _mm_add_ps(fiy1,ty);
519             fiz1             = _mm_add_ps(fiz1,tz);
520
521             fjx0             = _mm_add_ps(fjx0,tx);
522             fjy0             = _mm_add_ps(fjy0,ty);
523             fjz0             = _mm_add_ps(fjz0,tz);
524
525             }
526
527             /**************************
528              * CALCULATE INTERACTIONS *
529              **************************/
530
531             if (gmx_mm_any_lt(rsq11,rcutoff2))
532             {
533
534             r11              = _mm_mul_ps(rsq11,rinv11);
535
536             /* EWALD ELECTROSTATICS */
537
538             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
539             ewrt             = _mm_mul_ps(r11,ewtabscale);
540             ewitab           = _mm_cvttps_epi32(ewrt);
541             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
542             ewitab           = _mm_slli_epi32(ewitab,2);
543             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
544             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
545             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
546             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
547             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
548             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
549             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
550             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
551             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
552
553             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
554
555             /* Update potential sum for this i atom from the interaction with this j atom. */
556             velec            = _mm_and_ps(velec,cutoff_mask);
557             velecsum         = _mm_add_ps(velecsum,velec);
558
559             fscal            = felec;
560
561             fscal            = _mm_and_ps(fscal,cutoff_mask);
562
563             /* Calculate temporary vectorial force */
564             tx               = _mm_mul_ps(fscal,dx11);
565             ty               = _mm_mul_ps(fscal,dy11);
566             tz               = _mm_mul_ps(fscal,dz11);
567
568             /* Update vectorial force */
569             fix1             = _mm_add_ps(fix1,tx);
570             fiy1             = _mm_add_ps(fiy1,ty);
571             fiz1             = _mm_add_ps(fiz1,tz);
572
573             fjx1             = _mm_add_ps(fjx1,tx);
574             fjy1             = _mm_add_ps(fjy1,ty);
575             fjz1             = _mm_add_ps(fjz1,tz);
576
577             }
578
579             /**************************
580              * CALCULATE INTERACTIONS *
581              **************************/
582
583             if (gmx_mm_any_lt(rsq12,rcutoff2))
584             {
585
586             r12              = _mm_mul_ps(rsq12,rinv12);
587
588             /* EWALD ELECTROSTATICS */
589
590             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
591             ewrt             = _mm_mul_ps(r12,ewtabscale);
592             ewitab           = _mm_cvttps_epi32(ewrt);
593             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
594             ewitab           = _mm_slli_epi32(ewitab,2);
595             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
596             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
597             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
598             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
599             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
600             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
601             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
602             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
603             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
604
605             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
606
607             /* Update potential sum for this i atom from the interaction with this j atom. */
608             velec            = _mm_and_ps(velec,cutoff_mask);
609             velecsum         = _mm_add_ps(velecsum,velec);
610
611             fscal            = felec;
612
613             fscal            = _mm_and_ps(fscal,cutoff_mask);
614
615             /* Calculate temporary vectorial force */
616             tx               = _mm_mul_ps(fscal,dx12);
617             ty               = _mm_mul_ps(fscal,dy12);
618             tz               = _mm_mul_ps(fscal,dz12);
619
620             /* Update vectorial force */
621             fix1             = _mm_add_ps(fix1,tx);
622             fiy1             = _mm_add_ps(fiy1,ty);
623             fiz1             = _mm_add_ps(fiz1,tz);
624
625             fjx2             = _mm_add_ps(fjx2,tx);
626             fjy2             = _mm_add_ps(fjy2,ty);
627             fjz2             = _mm_add_ps(fjz2,tz);
628
629             }
630
631             /**************************
632              * CALCULATE INTERACTIONS *
633              **************************/
634
635             if (gmx_mm_any_lt(rsq20,rcutoff2))
636             {
637
638             r20              = _mm_mul_ps(rsq20,rinv20);
639
640             /* EWALD ELECTROSTATICS */
641
642             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
643             ewrt             = _mm_mul_ps(r20,ewtabscale);
644             ewitab           = _mm_cvttps_epi32(ewrt);
645             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
646             ewitab           = _mm_slli_epi32(ewitab,2);
647             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
648             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
649             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
650             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
651             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
652             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
653             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
654             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
655             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
656
657             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
658
659             /* Update potential sum for this i atom from the interaction with this j atom. */
660             velec            = _mm_and_ps(velec,cutoff_mask);
661             velecsum         = _mm_add_ps(velecsum,velec);
662
663             fscal            = felec;
664
665             fscal            = _mm_and_ps(fscal,cutoff_mask);
666
667             /* Calculate temporary vectorial force */
668             tx               = _mm_mul_ps(fscal,dx20);
669             ty               = _mm_mul_ps(fscal,dy20);
670             tz               = _mm_mul_ps(fscal,dz20);
671
672             /* Update vectorial force */
673             fix2             = _mm_add_ps(fix2,tx);
674             fiy2             = _mm_add_ps(fiy2,ty);
675             fiz2             = _mm_add_ps(fiz2,tz);
676
677             fjx0             = _mm_add_ps(fjx0,tx);
678             fjy0             = _mm_add_ps(fjy0,ty);
679             fjz0             = _mm_add_ps(fjz0,tz);
680
681             }
682
683             /**************************
684              * CALCULATE INTERACTIONS *
685              **************************/
686
687             if (gmx_mm_any_lt(rsq21,rcutoff2))
688             {
689
690             r21              = _mm_mul_ps(rsq21,rinv21);
691
692             /* EWALD ELECTROSTATICS */
693
694             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
695             ewrt             = _mm_mul_ps(r21,ewtabscale);
696             ewitab           = _mm_cvttps_epi32(ewrt);
697             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
698             ewitab           = _mm_slli_epi32(ewitab,2);
699             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
700             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
701             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
702             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
703             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
704             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
705             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
706             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
707             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
708
709             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
710
711             /* Update potential sum for this i atom from the interaction with this j atom. */
712             velec            = _mm_and_ps(velec,cutoff_mask);
713             velecsum         = _mm_add_ps(velecsum,velec);
714
715             fscal            = felec;
716
717             fscal            = _mm_and_ps(fscal,cutoff_mask);
718
719             /* Calculate temporary vectorial force */
720             tx               = _mm_mul_ps(fscal,dx21);
721             ty               = _mm_mul_ps(fscal,dy21);
722             tz               = _mm_mul_ps(fscal,dz21);
723
724             /* Update vectorial force */
725             fix2             = _mm_add_ps(fix2,tx);
726             fiy2             = _mm_add_ps(fiy2,ty);
727             fiz2             = _mm_add_ps(fiz2,tz);
728
729             fjx1             = _mm_add_ps(fjx1,tx);
730             fjy1             = _mm_add_ps(fjy1,ty);
731             fjz1             = _mm_add_ps(fjz1,tz);
732
733             }
734
735             /**************************
736              * CALCULATE INTERACTIONS *
737              **************************/
738
739             if (gmx_mm_any_lt(rsq22,rcutoff2))
740             {
741
742             r22              = _mm_mul_ps(rsq22,rinv22);
743
744             /* EWALD ELECTROSTATICS */
745
746             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
747             ewrt             = _mm_mul_ps(r22,ewtabscale);
748             ewitab           = _mm_cvttps_epi32(ewrt);
749             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
750             ewitab           = _mm_slli_epi32(ewitab,2);
751             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
752             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
753             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
754             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
755             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
756             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
757             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
758             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
759             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
760
761             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
762
763             /* Update potential sum for this i atom from the interaction with this j atom. */
764             velec            = _mm_and_ps(velec,cutoff_mask);
765             velecsum         = _mm_add_ps(velecsum,velec);
766
767             fscal            = felec;
768
769             fscal            = _mm_and_ps(fscal,cutoff_mask);
770
771             /* Calculate temporary vectorial force */
772             tx               = _mm_mul_ps(fscal,dx22);
773             ty               = _mm_mul_ps(fscal,dy22);
774             tz               = _mm_mul_ps(fscal,dz22);
775
776             /* Update vectorial force */
777             fix2             = _mm_add_ps(fix2,tx);
778             fiy2             = _mm_add_ps(fiy2,ty);
779             fiz2             = _mm_add_ps(fiz2,tz);
780
781             fjx2             = _mm_add_ps(fjx2,tx);
782             fjy2             = _mm_add_ps(fjy2,ty);
783             fjz2             = _mm_add_ps(fjz2,tz);
784
785             }
786
787             fjptrA             = f+j_coord_offsetA;
788             fjptrB             = f+j_coord_offsetB;
789             fjptrC             = f+j_coord_offsetC;
790             fjptrD             = f+j_coord_offsetD;
791
792             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
793                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
794
795             /* Inner loop uses 432 flops */
796         }
797
798         if(jidx<j_index_end)
799         {
800
801             /* Get j neighbor index, and coordinate index */
802             jnrlistA         = jjnr[jidx];
803             jnrlistB         = jjnr[jidx+1];
804             jnrlistC         = jjnr[jidx+2];
805             jnrlistD         = jjnr[jidx+3];
806             /* Sign of each element will be negative for non-real atoms.
807              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
808              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
809              */
810             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
811             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
812             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
813             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
814             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
815             j_coord_offsetA  = DIM*jnrA;
816             j_coord_offsetB  = DIM*jnrB;
817             j_coord_offsetC  = DIM*jnrC;
818             j_coord_offsetD  = DIM*jnrD;
819
820             /* load j atom coordinates */
821             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
822                                               x+j_coord_offsetC,x+j_coord_offsetD,
823                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
824
825             /* Calculate displacement vector */
826             dx00             = _mm_sub_ps(ix0,jx0);
827             dy00             = _mm_sub_ps(iy0,jy0);
828             dz00             = _mm_sub_ps(iz0,jz0);
829             dx01             = _mm_sub_ps(ix0,jx1);
830             dy01             = _mm_sub_ps(iy0,jy1);
831             dz01             = _mm_sub_ps(iz0,jz1);
832             dx02             = _mm_sub_ps(ix0,jx2);
833             dy02             = _mm_sub_ps(iy0,jy2);
834             dz02             = _mm_sub_ps(iz0,jz2);
835             dx10             = _mm_sub_ps(ix1,jx0);
836             dy10             = _mm_sub_ps(iy1,jy0);
837             dz10             = _mm_sub_ps(iz1,jz0);
838             dx11             = _mm_sub_ps(ix1,jx1);
839             dy11             = _mm_sub_ps(iy1,jy1);
840             dz11             = _mm_sub_ps(iz1,jz1);
841             dx12             = _mm_sub_ps(ix1,jx2);
842             dy12             = _mm_sub_ps(iy1,jy2);
843             dz12             = _mm_sub_ps(iz1,jz2);
844             dx20             = _mm_sub_ps(ix2,jx0);
845             dy20             = _mm_sub_ps(iy2,jy0);
846             dz20             = _mm_sub_ps(iz2,jz0);
847             dx21             = _mm_sub_ps(ix2,jx1);
848             dy21             = _mm_sub_ps(iy2,jy1);
849             dz21             = _mm_sub_ps(iz2,jz1);
850             dx22             = _mm_sub_ps(ix2,jx2);
851             dy22             = _mm_sub_ps(iy2,jy2);
852             dz22             = _mm_sub_ps(iz2,jz2);
853
854             /* Calculate squared distance and things based on it */
855             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
856             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
857             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
858             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
859             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
860             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
861             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
862             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
863             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
864
865             rinv00           = gmx_mm_invsqrt_ps(rsq00);
866             rinv01           = gmx_mm_invsqrt_ps(rsq01);
867             rinv02           = gmx_mm_invsqrt_ps(rsq02);
868             rinv10           = gmx_mm_invsqrt_ps(rsq10);
869             rinv11           = gmx_mm_invsqrt_ps(rsq11);
870             rinv12           = gmx_mm_invsqrt_ps(rsq12);
871             rinv20           = gmx_mm_invsqrt_ps(rsq20);
872             rinv21           = gmx_mm_invsqrt_ps(rsq21);
873             rinv22           = gmx_mm_invsqrt_ps(rsq22);
874
875             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
876             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
877             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
878             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
879             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
880             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
881             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
882             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
883             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
884
885             fjx0             = _mm_setzero_ps();
886             fjy0             = _mm_setzero_ps();
887             fjz0             = _mm_setzero_ps();
888             fjx1             = _mm_setzero_ps();
889             fjy1             = _mm_setzero_ps();
890             fjz1             = _mm_setzero_ps();
891             fjx2             = _mm_setzero_ps();
892             fjy2             = _mm_setzero_ps();
893             fjz2             = _mm_setzero_ps();
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             if (gmx_mm_any_lt(rsq00,rcutoff2))
900             {
901
902             r00              = _mm_mul_ps(rsq00,rinv00);
903             r00              = _mm_andnot_ps(dummy_mask,r00);
904
905             /* EWALD ELECTROSTATICS */
906
907             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
908             ewrt             = _mm_mul_ps(r00,ewtabscale);
909             ewitab           = _mm_cvttps_epi32(ewrt);
910             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
911             ewitab           = _mm_slli_epi32(ewitab,2);
912             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
913             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
914             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
915             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
916             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
917             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
918             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
919             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
920             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
921
922             /* LENNARD-JONES DISPERSION/REPULSION */
923
924             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
925             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
926             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
927             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
928                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
929             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
930
931             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
932
933             /* Update potential sum for this i atom from the interaction with this j atom. */
934             velec            = _mm_and_ps(velec,cutoff_mask);
935             velec            = _mm_andnot_ps(dummy_mask,velec);
936             velecsum         = _mm_add_ps(velecsum,velec);
937             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
938             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
939             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
940
941             fscal            = _mm_add_ps(felec,fvdw);
942
943             fscal            = _mm_and_ps(fscal,cutoff_mask);
944
945             fscal            = _mm_andnot_ps(dummy_mask,fscal);
946
947             /* Calculate temporary vectorial force */
948             tx               = _mm_mul_ps(fscal,dx00);
949             ty               = _mm_mul_ps(fscal,dy00);
950             tz               = _mm_mul_ps(fscal,dz00);
951
952             /* Update vectorial force */
953             fix0             = _mm_add_ps(fix0,tx);
954             fiy0             = _mm_add_ps(fiy0,ty);
955             fiz0             = _mm_add_ps(fiz0,tz);
956
957             fjx0             = _mm_add_ps(fjx0,tx);
958             fjy0             = _mm_add_ps(fjy0,ty);
959             fjz0             = _mm_add_ps(fjz0,tz);
960
961             }
962
963             /**************************
964              * CALCULATE INTERACTIONS *
965              **************************/
966
967             if (gmx_mm_any_lt(rsq01,rcutoff2))
968             {
969
970             r01              = _mm_mul_ps(rsq01,rinv01);
971             r01              = _mm_andnot_ps(dummy_mask,r01);
972
973             /* EWALD ELECTROSTATICS */
974
975             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
976             ewrt             = _mm_mul_ps(r01,ewtabscale);
977             ewitab           = _mm_cvttps_epi32(ewrt);
978             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
979             ewitab           = _mm_slli_epi32(ewitab,2);
980             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
981             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
982             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
983             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
984             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
985             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
986             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
987             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
988             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
989
990             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
991
992             /* Update potential sum for this i atom from the interaction with this j atom. */
993             velec            = _mm_and_ps(velec,cutoff_mask);
994             velec            = _mm_andnot_ps(dummy_mask,velec);
995             velecsum         = _mm_add_ps(velecsum,velec);
996
997             fscal            = felec;
998
999             fscal            = _mm_and_ps(fscal,cutoff_mask);
1000
1001             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1002
1003             /* Calculate temporary vectorial force */
1004             tx               = _mm_mul_ps(fscal,dx01);
1005             ty               = _mm_mul_ps(fscal,dy01);
1006             tz               = _mm_mul_ps(fscal,dz01);
1007
1008             /* Update vectorial force */
1009             fix0             = _mm_add_ps(fix0,tx);
1010             fiy0             = _mm_add_ps(fiy0,ty);
1011             fiz0             = _mm_add_ps(fiz0,tz);
1012
1013             fjx1             = _mm_add_ps(fjx1,tx);
1014             fjy1             = _mm_add_ps(fjy1,ty);
1015             fjz1             = _mm_add_ps(fjz1,tz);
1016
1017             }
1018
1019             /**************************
1020              * CALCULATE INTERACTIONS *
1021              **************************/
1022
1023             if (gmx_mm_any_lt(rsq02,rcutoff2))
1024             {
1025
1026             r02              = _mm_mul_ps(rsq02,rinv02);
1027             r02              = _mm_andnot_ps(dummy_mask,r02);
1028
1029             /* EWALD ELECTROSTATICS */
1030
1031             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1032             ewrt             = _mm_mul_ps(r02,ewtabscale);
1033             ewitab           = _mm_cvttps_epi32(ewrt);
1034             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1035             ewitab           = _mm_slli_epi32(ewitab,2);
1036             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1037             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1038             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1039             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1040             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1041             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1042             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1043             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1044             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1045
1046             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1047
1048             /* Update potential sum for this i atom from the interaction with this j atom. */
1049             velec            = _mm_and_ps(velec,cutoff_mask);
1050             velec            = _mm_andnot_ps(dummy_mask,velec);
1051             velecsum         = _mm_add_ps(velecsum,velec);
1052
1053             fscal            = felec;
1054
1055             fscal            = _mm_and_ps(fscal,cutoff_mask);
1056
1057             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1058
1059             /* Calculate temporary vectorial force */
1060             tx               = _mm_mul_ps(fscal,dx02);
1061             ty               = _mm_mul_ps(fscal,dy02);
1062             tz               = _mm_mul_ps(fscal,dz02);
1063
1064             /* Update vectorial force */
1065             fix0             = _mm_add_ps(fix0,tx);
1066             fiy0             = _mm_add_ps(fiy0,ty);
1067             fiz0             = _mm_add_ps(fiz0,tz);
1068
1069             fjx2             = _mm_add_ps(fjx2,tx);
1070             fjy2             = _mm_add_ps(fjy2,ty);
1071             fjz2             = _mm_add_ps(fjz2,tz);
1072
1073             }
1074
1075             /**************************
1076              * CALCULATE INTERACTIONS *
1077              **************************/
1078
1079             if (gmx_mm_any_lt(rsq10,rcutoff2))
1080             {
1081
1082             r10              = _mm_mul_ps(rsq10,rinv10);
1083             r10              = _mm_andnot_ps(dummy_mask,r10);
1084
1085             /* EWALD ELECTROSTATICS */
1086
1087             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1088             ewrt             = _mm_mul_ps(r10,ewtabscale);
1089             ewitab           = _mm_cvttps_epi32(ewrt);
1090             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1091             ewitab           = _mm_slli_epi32(ewitab,2);
1092             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1093             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1094             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1095             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1096             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1097             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1098             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1099             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1100             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1101
1102             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1103
1104             /* Update potential sum for this i atom from the interaction with this j atom. */
1105             velec            = _mm_and_ps(velec,cutoff_mask);
1106             velec            = _mm_andnot_ps(dummy_mask,velec);
1107             velecsum         = _mm_add_ps(velecsum,velec);
1108
1109             fscal            = felec;
1110
1111             fscal            = _mm_and_ps(fscal,cutoff_mask);
1112
1113             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1114
1115             /* Calculate temporary vectorial force */
1116             tx               = _mm_mul_ps(fscal,dx10);
1117             ty               = _mm_mul_ps(fscal,dy10);
1118             tz               = _mm_mul_ps(fscal,dz10);
1119
1120             /* Update vectorial force */
1121             fix1             = _mm_add_ps(fix1,tx);
1122             fiy1             = _mm_add_ps(fiy1,ty);
1123             fiz1             = _mm_add_ps(fiz1,tz);
1124
1125             fjx0             = _mm_add_ps(fjx0,tx);
1126             fjy0             = _mm_add_ps(fjy0,ty);
1127             fjz0             = _mm_add_ps(fjz0,tz);
1128
1129             }
1130
1131             /**************************
1132              * CALCULATE INTERACTIONS *
1133              **************************/
1134
1135             if (gmx_mm_any_lt(rsq11,rcutoff2))
1136             {
1137
1138             r11              = _mm_mul_ps(rsq11,rinv11);
1139             r11              = _mm_andnot_ps(dummy_mask,r11);
1140
1141             /* EWALD ELECTROSTATICS */
1142
1143             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1144             ewrt             = _mm_mul_ps(r11,ewtabscale);
1145             ewitab           = _mm_cvttps_epi32(ewrt);
1146             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1147             ewitab           = _mm_slli_epi32(ewitab,2);
1148             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1149             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1150             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1151             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1152             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1153             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1154             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1155             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1156             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1157
1158             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm_and_ps(velec,cutoff_mask);
1162             velec            = _mm_andnot_ps(dummy_mask,velec);
1163             velecsum         = _mm_add_ps(velecsum,velec);
1164
1165             fscal            = felec;
1166
1167             fscal            = _mm_and_ps(fscal,cutoff_mask);
1168
1169             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1170
1171             /* Calculate temporary vectorial force */
1172             tx               = _mm_mul_ps(fscal,dx11);
1173             ty               = _mm_mul_ps(fscal,dy11);
1174             tz               = _mm_mul_ps(fscal,dz11);
1175
1176             /* Update vectorial force */
1177             fix1             = _mm_add_ps(fix1,tx);
1178             fiy1             = _mm_add_ps(fiy1,ty);
1179             fiz1             = _mm_add_ps(fiz1,tz);
1180
1181             fjx1             = _mm_add_ps(fjx1,tx);
1182             fjy1             = _mm_add_ps(fjy1,ty);
1183             fjz1             = _mm_add_ps(fjz1,tz);
1184
1185             }
1186
1187             /**************************
1188              * CALCULATE INTERACTIONS *
1189              **************************/
1190
1191             if (gmx_mm_any_lt(rsq12,rcutoff2))
1192             {
1193
1194             r12              = _mm_mul_ps(rsq12,rinv12);
1195             r12              = _mm_andnot_ps(dummy_mask,r12);
1196
1197             /* EWALD ELECTROSTATICS */
1198
1199             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1200             ewrt             = _mm_mul_ps(r12,ewtabscale);
1201             ewitab           = _mm_cvttps_epi32(ewrt);
1202             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1203             ewitab           = _mm_slli_epi32(ewitab,2);
1204             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1205             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1206             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1207             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1208             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1209             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1210             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1211             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1212             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1213
1214             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1215
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _mm_and_ps(velec,cutoff_mask);
1218             velec            = _mm_andnot_ps(dummy_mask,velec);
1219             velecsum         = _mm_add_ps(velecsum,velec);
1220
1221             fscal            = felec;
1222
1223             fscal            = _mm_and_ps(fscal,cutoff_mask);
1224
1225             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1226
1227             /* Calculate temporary vectorial force */
1228             tx               = _mm_mul_ps(fscal,dx12);
1229             ty               = _mm_mul_ps(fscal,dy12);
1230             tz               = _mm_mul_ps(fscal,dz12);
1231
1232             /* Update vectorial force */
1233             fix1             = _mm_add_ps(fix1,tx);
1234             fiy1             = _mm_add_ps(fiy1,ty);
1235             fiz1             = _mm_add_ps(fiz1,tz);
1236
1237             fjx2             = _mm_add_ps(fjx2,tx);
1238             fjy2             = _mm_add_ps(fjy2,ty);
1239             fjz2             = _mm_add_ps(fjz2,tz);
1240
1241             }
1242
1243             /**************************
1244              * CALCULATE INTERACTIONS *
1245              **************************/
1246
1247             if (gmx_mm_any_lt(rsq20,rcutoff2))
1248             {
1249
1250             r20              = _mm_mul_ps(rsq20,rinv20);
1251             r20              = _mm_andnot_ps(dummy_mask,r20);
1252
1253             /* EWALD ELECTROSTATICS */
1254
1255             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1256             ewrt             = _mm_mul_ps(r20,ewtabscale);
1257             ewitab           = _mm_cvttps_epi32(ewrt);
1258             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1259             ewitab           = _mm_slli_epi32(ewitab,2);
1260             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1261             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1262             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1263             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1264             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1265             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1266             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1267             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1268             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1269
1270             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1271
1272             /* Update potential sum for this i atom from the interaction with this j atom. */
1273             velec            = _mm_and_ps(velec,cutoff_mask);
1274             velec            = _mm_andnot_ps(dummy_mask,velec);
1275             velecsum         = _mm_add_ps(velecsum,velec);
1276
1277             fscal            = felec;
1278
1279             fscal            = _mm_and_ps(fscal,cutoff_mask);
1280
1281             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1282
1283             /* Calculate temporary vectorial force */
1284             tx               = _mm_mul_ps(fscal,dx20);
1285             ty               = _mm_mul_ps(fscal,dy20);
1286             tz               = _mm_mul_ps(fscal,dz20);
1287
1288             /* Update vectorial force */
1289             fix2             = _mm_add_ps(fix2,tx);
1290             fiy2             = _mm_add_ps(fiy2,ty);
1291             fiz2             = _mm_add_ps(fiz2,tz);
1292
1293             fjx0             = _mm_add_ps(fjx0,tx);
1294             fjy0             = _mm_add_ps(fjy0,ty);
1295             fjz0             = _mm_add_ps(fjz0,tz);
1296
1297             }
1298
1299             /**************************
1300              * CALCULATE INTERACTIONS *
1301              **************************/
1302
1303             if (gmx_mm_any_lt(rsq21,rcutoff2))
1304             {
1305
1306             r21              = _mm_mul_ps(rsq21,rinv21);
1307             r21              = _mm_andnot_ps(dummy_mask,r21);
1308
1309             /* EWALD ELECTROSTATICS */
1310
1311             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1312             ewrt             = _mm_mul_ps(r21,ewtabscale);
1313             ewitab           = _mm_cvttps_epi32(ewrt);
1314             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1315             ewitab           = _mm_slli_epi32(ewitab,2);
1316             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1317             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1318             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1319             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1320             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1321             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1322             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1323             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1324             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1325
1326             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1327
1328             /* Update potential sum for this i atom from the interaction with this j atom. */
1329             velec            = _mm_and_ps(velec,cutoff_mask);
1330             velec            = _mm_andnot_ps(dummy_mask,velec);
1331             velecsum         = _mm_add_ps(velecsum,velec);
1332
1333             fscal            = felec;
1334
1335             fscal            = _mm_and_ps(fscal,cutoff_mask);
1336
1337             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1338
1339             /* Calculate temporary vectorial force */
1340             tx               = _mm_mul_ps(fscal,dx21);
1341             ty               = _mm_mul_ps(fscal,dy21);
1342             tz               = _mm_mul_ps(fscal,dz21);
1343
1344             /* Update vectorial force */
1345             fix2             = _mm_add_ps(fix2,tx);
1346             fiy2             = _mm_add_ps(fiy2,ty);
1347             fiz2             = _mm_add_ps(fiz2,tz);
1348
1349             fjx1             = _mm_add_ps(fjx1,tx);
1350             fjy1             = _mm_add_ps(fjy1,ty);
1351             fjz1             = _mm_add_ps(fjz1,tz);
1352
1353             }
1354
1355             /**************************
1356              * CALCULATE INTERACTIONS *
1357              **************************/
1358
1359             if (gmx_mm_any_lt(rsq22,rcutoff2))
1360             {
1361
1362             r22              = _mm_mul_ps(rsq22,rinv22);
1363             r22              = _mm_andnot_ps(dummy_mask,r22);
1364
1365             /* EWALD ELECTROSTATICS */
1366
1367             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1368             ewrt             = _mm_mul_ps(r22,ewtabscale);
1369             ewitab           = _mm_cvttps_epi32(ewrt);
1370             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1371             ewitab           = _mm_slli_epi32(ewitab,2);
1372             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1373             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1374             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1375             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1376             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1377             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1378             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1379             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1380             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1381
1382             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1383
1384             /* Update potential sum for this i atom from the interaction with this j atom. */
1385             velec            = _mm_and_ps(velec,cutoff_mask);
1386             velec            = _mm_andnot_ps(dummy_mask,velec);
1387             velecsum         = _mm_add_ps(velecsum,velec);
1388
1389             fscal            = felec;
1390
1391             fscal            = _mm_and_ps(fscal,cutoff_mask);
1392
1393             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1394
1395             /* Calculate temporary vectorial force */
1396             tx               = _mm_mul_ps(fscal,dx22);
1397             ty               = _mm_mul_ps(fscal,dy22);
1398             tz               = _mm_mul_ps(fscal,dz22);
1399
1400             /* Update vectorial force */
1401             fix2             = _mm_add_ps(fix2,tx);
1402             fiy2             = _mm_add_ps(fiy2,ty);
1403             fiz2             = _mm_add_ps(fiz2,tz);
1404
1405             fjx2             = _mm_add_ps(fjx2,tx);
1406             fjy2             = _mm_add_ps(fjy2,ty);
1407             fjz2             = _mm_add_ps(fjz2,tz);
1408
1409             }
1410
1411             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1412             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1413             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1414             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1415
1416             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1417                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1418
1419             /* Inner loop uses 441 flops */
1420         }
1421
1422         /* End of innermost loop */
1423
1424         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1425                                               f+i_coord_offset,fshift+i_shift_offset);
1426
1427         ggid                        = gid[iidx];
1428         /* Update potential energies */
1429         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1430         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1431
1432         /* Increment number of inner iterations */
1433         inneriter                  += j_index_end - j_index_start;
1434
1435         /* Outer loop uses 20 flops */
1436     }
1437
1438     /* Increment number of outer iterations */
1439     outeriter        += nri;
1440
1441     /* Update outer/inner flops */
1442
1443     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1444 }
1445 /*
1446  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1447  * Electrostatics interaction: Ewald
1448  * VdW interaction:            LennardJones
1449  * Geometry:                   Water3-Water3
1450  * Calculate force/pot:        Force
1451  */
1452 void
1453 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1454                     (t_nblist                    * gmx_restrict       nlist,
1455                      rvec                        * gmx_restrict          xx,
1456                      rvec                        * gmx_restrict          ff,
1457                      t_forcerec                  * gmx_restrict          fr,
1458                      t_mdatoms                   * gmx_restrict     mdatoms,
1459                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1460                      t_nrnb                      * gmx_restrict        nrnb)
1461 {
1462     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1463      * just 0 for non-waters.
1464      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1465      * jnr indices corresponding to data put in the four positions in the SIMD register.
1466      */
1467     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1468     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1469     int              jnrA,jnrB,jnrC,jnrD;
1470     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1471     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1472     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1473     real             rcutoff_scalar;
1474     real             *shiftvec,*fshift,*x,*f;
1475     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1476     real             scratch[4*DIM];
1477     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1478     int              vdwioffset0;
1479     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1480     int              vdwioffset1;
1481     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1482     int              vdwioffset2;
1483     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1484     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1485     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1486     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1487     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1488     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1489     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1490     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1491     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1492     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1493     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1494     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1495     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1496     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1497     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1498     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1499     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1500     real             *charge;
1501     int              nvdwtype;
1502     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1503     int              *vdwtype;
1504     real             *vdwparam;
1505     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1506     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1507     __m128i          ewitab;
1508     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1509     real             *ewtab;
1510     __m128           dummy_mask,cutoff_mask;
1511     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1512     __m128           one     = _mm_set1_ps(1.0);
1513     __m128           two     = _mm_set1_ps(2.0);
1514     x                = xx[0];
1515     f                = ff[0];
1516
1517     nri              = nlist->nri;
1518     iinr             = nlist->iinr;
1519     jindex           = nlist->jindex;
1520     jjnr             = nlist->jjnr;
1521     shiftidx         = nlist->shift;
1522     gid              = nlist->gid;
1523     shiftvec         = fr->shift_vec[0];
1524     fshift           = fr->fshift[0];
1525     facel            = _mm_set1_ps(fr->epsfac);
1526     charge           = mdatoms->chargeA;
1527     nvdwtype         = fr->ntype;
1528     vdwparam         = fr->nbfp;
1529     vdwtype          = mdatoms->typeA;
1530
1531     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1532     ewtab            = fr->ic->tabq_coul_F;
1533     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1534     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1535
1536     /* Setup water-specific parameters */
1537     inr              = nlist->iinr[0];
1538     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1539     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1540     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1541     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1542
1543     jq0              = _mm_set1_ps(charge[inr+0]);
1544     jq1              = _mm_set1_ps(charge[inr+1]);
1545     jq2              = _mm_set1_ps(charge[inr+2]);
1546     vdwjidx0A        = 2*vdwtype[inr+0];
1547     qq00             = _mm_mul_ps(iq0,jq0);
1548     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1549     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1550     qq01             = _mm_mul_ps(iq0,jq1);
1551     qq02             = _mm_mul_ps(iq0,jq2);
1552     qq10             = _mm_mul_ps(iq1,jq0);
1553     qq11             = _mm_mul_ps(iq1,jq1);
1554     qq12             = _mm_mul_ps(iq1,jq2);
1555     qq20             = _mm_mul_ps(iq2,jq0);
1556     qq21             = _mm_mul_ps(iq2,jq1);
1557     qq22             = _mm_mul_ps(iq2,jq2);
1558
1559     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1560     rcutoff_scalar   = fr->rcoulomb;
1561     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1562     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1563
1564     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1565     rvdw             = _mm_set1_ps(fr->rvdw);
1566
1567     /* Avoid stupid compiler warnings */
1568     jnrA = jnrB = jnrC = jnrD = 0;
1569     j_coord_offsetA = 0;
1570     j_coord_offsetB = 0;
1571     j_coord_offsetC = 0;
1572     j_coord_offsetD = 0;
1573
1574     outeriter        = 0;
1575     inneriter        = 0;
1576
1577     for(iidx=0;iidx<4*DIM;iidx++)
1578     {
1579         scratch[iidx] = 0.0;
1580     }
1581
1582     /* Start outer loop over neighborlists */
1583     for(iidx=0; iidx<nri; iidx++)
1584     {
1585         /* Load shift vector for this list */
1586         i_shift_offset   = DIM*shiftidx[iidx];
1587
1588         /* Load limits for loop over neighbors */
1589         j_index_start    = jindex[iidx];
1590         j_index_end      = jindex[iidx+1];
1591
1592         /* Get outer coordinate index */
1593         inr              = iinr[iidx];
1594         i_coord_offset   = DIM*inr;
1595
1596         /* Load i particle coords and add shift vector */
1597         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1598                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1599
1600         fix0             = _mm_setzero_ps();
1601         fiy0             = _mm_setzero_ps();
1602         fiz0             = _mm_setzero_ps();
1603         fix1             = _mm_setzero_ps();
1604         fiy1             = _mm_setzero_ps();
1605         fiz1             = _mm_setzero_ps();
1606         fix2             = _mm_setzero_ps();
1607         fiy2             = _mm_setzero_ps();
1608         fiz2             = _mm_setzero_ps();
1609
1610         /* Start inner kernel loop */
1611         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1612         {
1613
1614             /* Get j neighbor index, and coordinate index */
1615             jnrA             = jjnr[jidx];
1616             jnrB             = jjnr[jidx+1];
1617             jnrC             = jjnr[jidx+2];
1618             jnrD             = jjnr[jidx+3];
1619             j_coord_offsetA  = DIM*jnrA;
1620             j_coord_offsetB  = DIM*jnrB;
1621             j_coord_offsetC  = DIM*jnrC;
1622             j_coord_offsetD  = DIM*jnrD;
1623
1624             /* load j atom coordinates */
1625             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1626                                               x+j_coord_offsetC,x+j_coord_offsetD,
1627                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1628
1629             /* Calculate displacement vector */
1630             dx00             = _mm_sub_ps(ix0,jx0);
1631             dy00             = _mm_sub_ps(iy0,jy0);
1632             dz00             = _mm_sub_ps(iz0,jz0);
1633             dx01             = _mm_sub_ps(ix0,jx1);
1634             dy01             = _mm_sub_ps(iy0,jy1);
1635             dz01             = _mm_sub_ps(iz0,jz1);
1636             dx02             = _mm_sub_ps(ix0,jx2);
1637             dy02             = _mm_sub_ps(iy0,jy2);
1638             dz02             = _mm_sub_ps(iz0,jz2);
1639             dx10             = _mm_sub_ps(ix1,jx0);
1640             dy10             = _mm_sub_ps(iy1,jy0);
1641             dz10             = _mm_sub_ps(iz1,jz0);
1642             dx11             = _mm_sub_ps(ix1,jx1);
1643             dy11             = _mm_sub_ps(iy1,jy1);
1644             dz11             = _mm_sub_ps(iz1,jz1);
1645             dx12             = _mm_sub_ps(ix1,jx2);
1646             dy12             = _mm_sub_ps(iy1,jy2);
1647             dz12             = _mm_sub_ps(iz1,jz2);
1648             dx20             = _mm_sub_ps(ix2,jx0);
1649             dy20             = _mm_sub_ps(iy2,jy0);
1650             dz20             = _mm_sub_ps(iz2,jz0);
1651             dx21             = _mm_sub_ps(ix2,jx1);
1652             dy21             = _mm_sub_ps(iy2,jy1);
1653             dz21             = _mm_sub_ps(iz2,jz1);
1654             dx22             = _mm_sub_ps(ix2,jx2);
1655             dy22             = _mm_sub_ps(iy2,jy2);
1656             dz22             = _mm_sub_ps(iz2,jz2);
1657
1658             /* Calculate squared distance and things based on it */
1659             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1660             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1661             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1662             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1663             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1664             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1665             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1666             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1667             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1668
1669             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1670             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1671             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1672             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1673             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1674             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1675             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1676             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1677             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1678
1679             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1680             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1681             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1682             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1683             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1684             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1685             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1686             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1687             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1688
1689             fjx0             = _mm_setzero_ps();
1690             fjy0             = _mm_setzero_ps();
1691             fjz0             = _mm_setzero_ps();
1692             fjx1             = _mm_setzero_ps();
1693             fjy1             = _mm_setzero_ps();
1694             fjz1             = _mm_setzero_ps();
1695             fjx2             = _mm_setzero_ps();
1696             fjy2             = _mm_setzero_ps();
1697             fjz2             = _mm_setzero_ps();
1698
1699             /**************************
1700              * CALCULATE INTERACTIONS *
1701              **************************/
1702
1703             if (gmx_mm_any_lt(rsq00,rcutoff2))
1704             {
1705
1706             r00              = _mm_mul_ps(rsq00,rinv00);
1707
1708             /* EWALD ELECTROSTATICS */
1709
1710             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1711             ewrt             = _mm_mul_ps(r00,ewtabscale);
1712             ewitab           = _mm_cvttps_epi32(ewrt);
1713             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1714             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1715                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1716                                          &ewtabF,&ewtabFn);
1717             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1718             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1719
1720             /* LENNARD-JONES DISPERSION/REPULSION */
1721
1722             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1723             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1724
1725             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1726
1727             fscal            = _mm_add_ps(felec,fvdw);
1728
1729             fscal            = _mm_and_ps(fscal,cutoff_mask);
1730
1731             /* Calculate temporary vectorial force */
1732             tx               = _mm_mul_ps(fscal,dx00);
1733             ty               = _mm_mul_ps(fscal,dy00);
1734             tz               = _mm_mul_ps(fscal,dz00);
1735
1736             /* Update vectorial force */
1737             fix0             = _mm_add_ps(fix0,tx);
1738             fiy0             = _mm_add_ps(fiy0,ty);
1739             fiz0             = _mm_add_ps(fiz0,tz);
1740
1741             fjx0             = _mm_add_ps(fjx0,tx);
1742             fjy0             = _mm_add_ps(fjy0,ty);
1743             fjz0             = _mm_add_ps(fjz0,tz);
1744
1745             }
1746
1747             /**************************
1748              * CALCULATE INTERACTIONS *
1749              **************************/
1750
1751             if (gmx_mm_any_lt(rsq01,rcutoff2))
1752             {
1753
1754             r01              = _mm_mul_ps(rsq01,rinv01);
1755
1756             /* EWALD ELECTROSTATICS */
1757
1758             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1759             ewrt             = _mm_mul_ps(r01,ewtabscale);
1760             ewitab           = _mm_cvttps_epi32(ewrt);
1761             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1762             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1763                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1764                                          &ewtabF,&ewtabFn);
1765             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1766             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1767
1768             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1769
1770             fscal            = felec;
1771
1772             fscal            = _mm_and_ps(fscal,cutoff_mask);
1773
1774             /* Calculate temporary vectorial force */
1775             tx               = _mm_mul_ps(fscal,dx01);
1776             ty               = _mm_mul_ps(fscal,dy01);
1777             tz               = _mm_mul_ps(fscal,dz01);
1778
1779             /* Update vectorial force */
1780             fix0             = _mm_add_ps(fix0,tx);
1781             fiy0             = _mm_add_ps(fiy0,ty);
1782             fiz0             = _mm_add_ps(fiz0,tz);
1783
1784             fjx1             = _mm_add_ps(fjx1,tx);
1785             fjy1             = _mm_add_ps(fjy1,ty);
1786             fjz1             = _mm_add_ps(fjz1,tz);
1787
1788             }
1789
1790             /**************************
1791              * CALCULATE INTERACTIONS *
1792              **************************/
1793
1794             if (gmx_mm_any_lt(rsq02,rcutoff2))
1795             {
1796
1797             r02              = _mm_mul_ps(rsq02,rinv02);
1798
1799             /* EWALD ELECTROSTATICS */
1800
1801             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1802             ewrt             = _mm_mul_ps(r02,ewtabscale);
1803             ewitab           = _mm_cvttps_epi32(ewrt);
1804             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1805             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1806                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1807                                          &ewtabF,&ewtabFn);
1808             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1809             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1810
1811             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1812
1813             fscal            = felec;
1814
1815             fscal            = _mm_and_ps(fscal,cutoff_mask);
1816
1817             /* Calculate temporary vectorial force */
1818             tx               = _mm_mul_ps(fscal,dx02);
1819             ty               = _mm_mul_ps(fscal,dy02);
1820             tz               = _mm_mul_ps(fscal,dz02);
1821
1822             /* Update vectorial force */
1823             fix0             = _mm_add_ps(fix0,tx);
1824             fiy0             = _mm_add_ps(fiy0,ty);
1825             fiz0             = _mm_add_ps(fiz0,tz);
1826
1827             fjx2             = _mm_add_ps(fjx2,tx);
1828             fjy2             = _mm_add_ps(fjy2,ty);
1829             fjz2             = _mm_add_ps(fjz2,tz);
1830
1831             }
1832
1833             /**************************
1834              * CALCULATE INTERACTIONS *
1835              **************************/
1836
1837             if (gmx_mm_any_lt(rsq10,rcutoff2))
1838             {
1839
1840             r10              = _mm_mul_ps(rsq10,rinv10);
1841
1842             /* EWALD ELECTROSTATICS */
1843
1844             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1845             ewrt             = _mm_mul_ps(r10,ewtabscale);
1846             ewitab           = _mm_cvttps_epi32(ewrt);
1847             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1848             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1849                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1850                                          &ewtabF,&ewtabFn);
1851             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1852             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1853
1854             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1855
1856             fscal            = felec;
1857
1858             fscal            = _mm_and_ps(fscal,cutoff_mask);
1859
1860             /* Calculate temporary vectorial force */
1861             tx               = _mm_mul_ps(fscal,dx10);
1862             ty               = _mm_mul_ps(fscal,dy10);
1863             tz               = _mm_mul_ps(fscal,dz10);
1864
1865             /* Update vectorial force */
1866             fix1             = _mm_add_ps(fix1,tx);
1867             fiy1             = _mm_add_ps(fiy1,ty);
1868             fiz1             = _mm_add_ps(fiz1,tz);
1869
1870             fjx0             = _mm_add_ps(fjx0,tx);
1871             fjy0             = _mm_add_ps(fjy0,ty);
1872             fjz0             = _mm_add_ps(fjz0,tz);
1873
1874             }
1875
1876             /**************************
1877              * CALCULATE INTERACTIONS *
1878              **************************/
1879
1880             if (gmx_mm_any_lt(rsq11,rcutoff2))
1881             {
1882
1883             r11              = _mm_mul_ps(rsq11,rinv11);
1884
1885             /* EWALD ELECTROSTATICS */
1886
1887             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888             ewrt             = _mm_mul_ps(r11,ewtabscale);
1889             ewitab           = _mm_cvttps_epi32(ewrt);
1890             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1891             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1892                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1893                                          &ewtabF,&ewtabFn);
1894             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1895             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1896
1897             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1898
1899             fscal            = felec;
1900
1901             fscal            = _mm_and_ps(fscal,cutoff_mask);
1902
1903             /* Calculate temporary vectorial force */
1904             tx               = _mm_mul_ps(fscal,dx11);
1905             ty               = _mm_mul_ps(fscal,dy11);
1906             tz               = _mm_mul_ps(fscal,dz11);
1907
1908             /* Update vectorial force */
1909             fix1             = _mm_add_ps(fix1,tx);
1910             fiy1             = _mm_add_ps(fiy1,ty);
1911             fiz1             = _mm_add_ps(fiz1,tz);
1912
1913             fjx1             = _mm_add_ps(fjx1,tx);
1914             fjy1             = _mm_add_ps(fjy1,ty);
1915             fjz1             = _mm_add_ps(fjz1,tz);
1916
1917             }
1918
1919             /**************************
1920              * CALCULATE INTERACTIONS *
1921              **************************/
1922
1923             if (gmx_mm_any_lt(rsq12,rcutoff2))
1924             {
1925
1926             r12              = _mm_mul_ps(rsq12,rinv12);
1927
1928             /* EWALD ELECTROSTATICS */
1929
1930             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1931             ewrt             = _mm_mul_ps(r12,ewtabscale);
1932             ewitab           = _mm_cvttps_epi32(ewrt);
1933             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1934             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1935                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1936                                          &ewtabF,&ewtabFn);
1937             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1938             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1939
1940             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1941
1942             fscal            = felec;
1943
1944             fscal            = _mm_and_ps(fscal,cutoff_mask);
1945
1946             /* Calculate temporary vectorial force */
1947             tx               = _mm_mul_ps(fscal,dx12);
1948             ty               = _mm_mul_ps(fscal,dy12);
1949             tz               = _mm_mul_ps(fscal,dz12);
1950
1951             /* Update vectorial force */
1952             fix1             = _mm_add_ps(fix1,tx);
1953             fiy1             = _mm_add_ps(fiy1,ty);
1954             fiz1             = _mm_add_ps(fiz1,tz);
1955
1956             fjx2             = _mm_add_ps(fjx2,tx);
1957             fjy2             = _mm_add_ps(fjy2,ty);
1958             fjz2             = _mm_add_ps(fjz2,tz);
1959
1960             }
1961
1962             /**************************
1963              * CALCULATE INTERACTIONS *
1964              **************************/
1965
1966             if (gmx_mm_any_lt(rsq20,rcutoff2))
1967             {
1968
1969             r20              = _mm_mul_ps(rsq20,rinv20);
1970
1971             /* EWALD ELECTROSTATICS */
1972
1973             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1974             ewrt             = _mm_mul_ps(r20,ewtabscale);
1975             ewitab           = _mm_cvttps_epi32(ewrt);
1976             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1977             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1978                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1979                                          &ewtabF,&ewtabFn);
1980             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1981             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1982
1983             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1984
1985             fscal            = felec;
1986
1987             fscal            = _mm_and_ps(fscal,cutoff_mask);
1988
1989             /* Calculate temporary vectorial force */
1990             tx               = _mm_mul_ps(fscal,dx20);
1991             ty               = _mm_mul_ps(fscal,dy20);
1992             tz               = _mm_mul_ps(fscal,dz20);
1993
1994             /* Update vectorial force */
1995             fix2             = _mm_add_ps(fix2,tx);
1996             fiy2             = _mm_add_ps(fiy2,ty);
1997             fiz2             = _mm_add_ps(fiz2,tz);
1998
1999             fjx0             = _mm_add_ps(fjx0,tx);
2000             fjy0             = _mm_add_ps(fjy0,ty);
2001             fjz0             = _mm_add_ps(fjz0,tz);
2002
2003             }
2004
2005             /**************************
2006              * CALCULATE INTERACTIONS *
2007              **************************/
2008
2009             if (gmx_mm_any_lt(rsq21,rcutoff2))
2010             {
2011
2012             r21              = _mm_mul_ps(rsq21,rinv21);
2013
2014             /* EWALD ELECTROSTATICS */
2015
2016             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2017             ewrt             = _mm_mul_ps(r21,ewtabscale);
2018             ewitab           = _mm_cvttps_epi32(ewrt);
2019             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2020             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2021                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2022                                          &ewtabF,&ewtabFn);
2023             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2024             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2025
2026             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2027
2028             fscal            = felec;
2029
2030             fscal            = _mm_and_ps(fscal,cutoff_mask);
2031
2032             /* Calculate temporary vectorial force */
2033             tx               = _mm_mul_ps(fscal,dx21);
2034             ty               = _mm_mul_ps(fscal,dy21);
2035             tz               = _mm_mul_ps(fscal,dz21);
2036
2037             /* Update vectorial force */
2038             fix2             = _mm_add_ps(fix2,tx);
2039             fiy2             = _mm_add_ps(fiy2,ty);
2040             fiz2             = _mm_add_ps(fiz2,tz);
2041
2042             fjx1             = _mm_add_ps(fjx1,tx);
2043             fjy1             = _mm_add_ps(fjy1,ty);
2044             fjz1             = _mm_add_ps(fjz1,tz);
2045
2046             }
2047
2048             /**************************
2049              * CALCULATE INTERACTIONS *
2050              **************************/
2051
2052             if (gmx_mm_any_lt(rsq22,rcutoff2))
2053             {
2054
2055             r22              = _mm_mul_ps(rsq22,rinv22);
2056
2057             /* EWALD ELECTROSTATICS */
2058
2059             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2060             ewrt             = _mm_mul_ps(r22,ewtabscale);
2061             ewitab           = _mm_cvttps_epi32(ewrt);
2062             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2063             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2064                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2065                                          &ewtabF,&ewtabFn);
2066             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2067             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2068
2069             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2070
2071             fscal            = felec;
2072
2073             fscal            = _mm_and_ps(fscal,cutoff_mask);
2074
2075             /* Calculate temporary vectorial force */
2076             tx               = _mm_mul_ps(fscal,dx22);
2077             ty               = _mm_mul_ps(fscal,dy22);
2078             tz               = _mm_mul_ps(fscal,dz22);
2079
2080             /* Update vectorial force */
2081             fix2             = _mm_add_ps(fix2,tx);
2082             fiy2             = _mm_add_ps(fiy2,ty);
2083             fiz2             = _mm_add_ps(fiz2,tz);
2084
2085             fjx2             = _mm_add_ps(fjx2,tx);
2086             fjy2             = _mm_add_ps(fjy2,ty);
2087             fjz2             = _mm_add_ps(fjz2,tz);
2088
2089             }
2090
2091             fjptrA             = f+j_coord_offsetA;
2092             fjptrB             = f+j_coord_offsetB;
2093             fjptrC             = f+j_coord_offsetC;
2094             fjptrD             = f+j_coord_offsetD;
2095
2096             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2097                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2098
2099             /* Inner loop uses 358 flops */
2100         }
2101
2102         if(jidx<j_index_end)
2103         {
2104
2105             /* Get j neighbor index, and coordinate index */
2106             jnrlistA         = jjnr[jidx];
2107             jnrlistB         = jjnr[jidx+1];
2108             jnrlistC         = jjnr[jidx+2];
2109             jnrlistD         = jjnr[jidx+3];
2110             /* Sign of each element will be negative for non-real atoms.
2111              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2112              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2113              */
2114             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2115             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2116             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2117             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2118             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2119             j_coord_offsetA  = DIM*jnrA;
2120             j_coord_offsetB  = DIM*jnrB;
2121             j_coord_offsetC  = DIM*jnrC;
2122             j_coord_offsetD  = DIM*jnrD;
2123
2124             /* load j atom coordinates */
2125             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2126                                               x+j_coord_offsetC,x+j_coord_offsetD,
2127                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2128
2129             /* Calculate displacement vector */
2130             dx00             = _mm_sub_ps(ix0,jx0);
2131             dy00             = _mm_sub_ps(iy0,jy0);
2132             dz00             = _mm_sub_ps(iz0,jz0);
2133             dx01             = _mm_sub_ps(ix0,jx1);
2134             dy01             = _mm_sub_ps(iy0,jy1);
2135             dz01             = _mm_sub_ps(iz0,jz1);
2136             dx02             = _mm_sub_ps(ix0,jx2);
2137             dy02             = _mm_sub_ps(iy0,jy2);
2138             dz02             = _mm_sub_ps(iz0,jz2);
2139             dx10             = _mm_sub_ps(ix1,jx0);
2140             dy10             = _mm_sub_ps(iy1,jy0);
2141             dz10             = _mm_sub_ps(iz1,jz0);
2142             dx11             = _mm_sub_ps(ix1,jx1);
2143             dy11             = _mm_sub_ps(iy1,jy1);
2144             dz11             = _mm_sub_ps(iz1,jz1);
2145             dx12             = _mm_sub_ps(ix1,jx2);
2146             dy12             = _mm_sub_ps(iy1,jy2);
2147             dz12             = _mm_sub_ps(iz1,jz2);
2148             dx20             = _mm_sub_ps(ix2,jx0);
2149             dy20             = _mm_sub_ps(iy2,jy0);
2150             dz20             = _mm_sub_ps(iz2,jz0);
2151             dx21             = _mm_sub_ps(ix2,jx1);
2152             dy21             = _mm_sub_ps(iy2,jy1);
2153             dz21             = _mm_sub_ps(iz2,jz1);
2154             dx22             = _mm_sub_ps(ix2,jx2);
2155             dy22             = _mm_sub_ps(iy2,jy2);
2156             dz22             = _mm_sub_ps(iz2,jz2);
2157
2158             /* Calculate squared distance and things based on it */
2159             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2160             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2161             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2162             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2163             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2164             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2165             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2166             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2167             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2168
2169             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2170             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2171             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2172             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2173             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2174             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2175             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2176             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2177             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2178
2179             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2180             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2181             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2182             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2183             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2184             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2185             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2186             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2187             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2188
2189             fjx0             = _mm_setzero_ps();
2190             fjy0             = _mm_setzero_ps();
2191             fjz0             = _mm_setzero_ps();
2192             fjx1             = _mm_setzero_ps();
2193             fjy1             = _mm_setzero_ps();
2194             fjz1             = _mm_setzero_ps();
2195             fjx2             = _mm_setzero_ps();
2196             fjy2             = _mm_setzero_ps();
2197             fjz2             = _mm_setzero_ps();
2198
2199             /**************************
2200              * CALCULATE INTERACTIONS *
2201              **************************/
2202
2203             if (gmx_mm_any_lt(rsq00,rcutoff2))
2204             {
2205
2206             r00              = _mm_mul_ps(rsq00,rinv00);
2207             r00              = _mm_andnot_ps(dummy_mask,r00);
2208
2209             /* EWALD ELECTROSTATICS */
2210
2211             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2212             ewrt             = _mm_mul_ps(r00,ewtabscale);
2213             ewitab           = _mm_cvttps_epi32(ewrt);
2214             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2215             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2216                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2217                                          &ewtabF,&ewtabFn);
2218             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2219             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2220
2221             /* LENNARD-JONES DISPERSION/REPULSION */
2222
2223             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2224             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2225
2226             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2227
2228             fscal            = _mm_add_ps(felec,fvdw);
2229
2230             fscal            = _mm_and_ps(fscal,cutoff_mask);
2231
2232             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2233
2234             /* Calculate temporary vectorial force */
2235             tx               = _mm_mul_ps(fscal,dx00);
2236             ty               = _mm_mul_ps(fscal,dy00);
2237             tz               = _mm_mul_ps(fscal,dz00);
2238
2239             /* Update vectorial force */
2240             fix0             = _mm_add_ps(fix0,tx);
2241             fiy0             = _mm_add_ps(fiy0,ty);
2242             fiz0             = _mm_add_ps(fiz0,tz);
2243
2244             fjx0             = _mm_add_ps(fjx0,tx);
2245             fjy0             = _mm_add_ps(fjy0,ty);
2246             fjz0             = _mm_add_ps(fjz0,tz);
2247
2248             }
2249
2250             /**************************
2251              * CALCULATE INTERACTIONS *
2252              **************************/
2253
2254             if (gmx_mm_any_lt(rsq01,rcutoff2))
2255             {
2256
2257             r01              = _mm_mul_ps(rsq01,rinv01);
2258             r01              = _mm_andnot_ps(dummy_mask,r01);
2259
2260             /* EWALD ELECTROSTATICS */
2261
2262             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2263             ewrt             = _mm_mul_ps(r01,ewtabscale);
2264             ewitab           = _mm_cvttps_epi32(ewrt);
2265             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2266             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2267                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2268                                          &ewtabF,&ewtabFn);
2269             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2270             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2271
2272             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2273
2274             fscal            = felec;
2275
2276             fscal            = _mm_and_ps(fscal,cutoff_mask);
2277
2278             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2279
2280             /* Calculate temporary vectorial force */
2281             tx               = _mm_mul_ps(fscal,dx01);
2282             ty               = _mm_mul_ps(fscal,dy01);
2283             tz               = _mm_mul_ps(fscal,dz01);
2284
2285             /* Update vectorial force */
2286             fix0             = _mm_add_ps(fix0,tx);
2287             fiy0             = _mm_add_ps(fiy0,ty);
2288             fiz0             = _mm_add_ps(fiz0,tz);
2289
2290             fjx1             = _mm_add_ps(fjx1,tx);
2291             fjy1             = _mm_add_ps(fjy1,ty);
2292             fjz1             = _mm_add_ps(fjz1,tz);
2293
2294             }
2295
2296             /**************************
2297              * CALCULATE INTERACTIONS *
2298              **************************/
2299
2300             if (gmx_mm_any_lt(rsq02,rcutoff2))
2301             {
2302
2303             r02              = _mm_mul_ps(rsq02,rinv02);
2304             r02              = _mm_andnot_ps(dummy_mask,r02);
2305
2306             /* EWALD ELECTROSTATICS */
2307
2308             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2309             ewrt             = _mm_mul_ps(r02,ewtabscale);
2310             ewitab           = _mm_cvttps_epi32(ewrt);
2311             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2312             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2313                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2314                                          &ewtabF,&ewtabFn);
2315             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2316             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2317
2318             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2319
2320             fscal            = felec;
2321
2322             fscal            = _mm_and_ps(fscal,cutoff_mask);
2323
2324             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2325
2326             /* Calculate temporary vectorial force */
2327             tx               = _mm_mul_ps(fscal,dx02);
2328             ty               = _mm_mul_ps(fscal,dy02);
2329             tz               = _mm_mul_ps(fscal,dz02);
2330
2331             /* Update vectorial force */
2332             fix0             = _mm_add_ps(fix0,tx);
2333             fiy0             = _mm_add_ps(fiy0,ty);
2334             fiz0             = _mm_add_ps(fiz0,tz);
2335
2336             fjx2             = _mm_add_ps(fjx2,tx);
2337             fjy2             = _mm_add_ps(fjy2,ty);
2338             fjz2             = _mm_add_ps(fjz2,tz);
2339
2340             }
2341
2342             /**************************
2343              * CALCULATE INTERACTIONS *
2344              **************************/
2345
2346             if (gmx_mm_any_lt(rsq10,rcutoff2))
2347             {
2348
2349             r10              = _mm_mul_ps(rsq10,rinv10);
2350             r10              = _mm_andnot_ps(dummy_mask,r10);
2351
2352             /* EWALD ELECTROSTATICS */
2353
2354             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2355             ewrt             = _mm_mul_ps(r10,ewtabscale);
2356             ewitab           = _mm_cvttps_epi32(ewrt);
2357             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2358             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2359                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2360                                          &ewtabF,&ewtabFn);
2361             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2362             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2363
2364             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2365
2366             fscal            = felec;
2367
2368             fscal            = _mm_and_ps(fscal,cutoff_mask);
2369
2370             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2371
2372             /* Calculate temporary vectorial force */
2373             tx               = _mm_mul_ps(fscal,dx10);
2374             ty               = _mm_mul_ps(fscal,dy10);
2375             tz               = _mm_mul_ps(fscal,dz10);
2376
2377             /* Update vectorial force */
2378             fix1             = _mm_add_ps(fix1,tx);
2379             fiy1             = _mm_add_ps(fiy1,ty);
2380             fiz1             = _mm_add_ps(fiz1,tz);
2381
2382             fjx0             = _mm_add_ps(fjx0,tx);
2383             fjy0             = _mm_add_ps(fjy0,ty);
2384             fjz0             = _mm_add_ps(fjz0,tz);
2385
2386             }
2387
2388             /**************************
2389              * CALCULATE INTERACTIONS *
2390              **************************/
2391
2392             if (gmx_mm_any_lt(rsq11,rcutoff2))
2393             {
2394
2395             r11              = _mm_mul_ps(rsq11,rinv11);
2396             r11              = _mm_andnot_ps(dummy_mask,r11);
2397
2398             /* EWALD ELECTROSTATICS */
2399
2400             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2401             ewrt             = _mm_mul_ps(r11,ewtabscale);
2402             ewitab           = _mm_cvttps_epi32(ewrt);
2403             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2404             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2405                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2406                                          &ewtabF,&ewtabFn);
2407             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2408             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2409
2410             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2411
2412             fscal            = felec;
2413
2414             fscal            = _mm_and_ps(fscal,cutoff_mask);
2415
2416             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2417
2418             /* Calculate temporary vectorial force */
2419             tx               = _mm_mul_ps(fscal,dx11);
2420             ty               = _mm_mul_ps(fscal,dy11);
2421             tz               = _mm_mul_ps(fscal,dz11);
2422
2423             /* Update vectorial force */
2424             fix1             = _mm_add_ps(fix1,tx);
2425             fiy1             = _mm_add_ps(fiy1,ty);
2426             fiz1             = _mm_add_ps(fiz1,tz);
2427
2428             fjx1             = _mm_add_ps(fjx1,tx);
2429             fjy1             = _mm_add_ps(fjy1,ty);
2430             fjz1             = _mm_add_ps(fjz1,tz);
2431
2432             }
2433
2434             /**************************
2435              * CALCULATE INTERACTIONS *
2436              **************************/
2437
2438             if (gmx_mm_any_lt(rsq12,rcutoff2))
2439             {
2440
2441             r12              = _mm_mul_ps(rsq12,rinv12);
2442             r12              = _mm_andnot_ps(dummy_mask,r12);
2443
2444             /* EWALD ELECTROSTATICS */
2445
2446             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2447             ewrt             = _mm_mul_ps(r12,ewtabscale);
2448             ewitab           = _mm_cvttps_epi32(ewrt);
2449             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2450             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2451                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2452                                          &ewtabF,&ewtabFn);
2453             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2454             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2455
2456             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2457
2458             fscal            = felec;
2459
2460             fscal            = _mm_and_ps(fscal,cutoff_mask);
2461
2462             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2463
2464             /* Calculate temporary vectorial force */
2465             tx               = _mm_mul_ps(fscal,dx12);
2466             ty               = _mm_mul_ps(fscal,dy12);
2467             tz               = _mm_mul_ps(fscal,dz12);
2468
2469             /* Update vectorial force */
2470             fix1             = _mm_add_ps(fix1,tx);
2471             fiy1             = _mm_add_ps(fiy1,ty);
2472             fiz1             = _mm_add_ps(fiz1,tz);
2473
2474             fjx2             = _mm_add_ps(fjx2,tx);
2475             fjy2             = _mm_add_ps(fjy2,ty);
2476             fjz2             = _mm_add_ps(fjz2,tz);
2477
2478             }
2479
2480             /**************************
2481              * CALCULATE INTERACTIONS *
2482              **************************/
2483
2484             if (gmx_mm_any_lt(rsq20,rcutoff2))
2485             {
2486
2487             r20              = _mm_mul_ps(rsq20,rinv20);
2488             r20              = _mm_andnot_ps(dummy_mask,r20);
2489
2490             /* EWALD ELECTROSTATICS */
2491
2492             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2493             ewrt             = _mm_mul_ps(r20,ewtabscale);
2494             ewitab           = _mm_cvttps_epi32(ewrt);
2495             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2496             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2497                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2498                                          &ewtabF,&ewtabFn);
2499             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2500             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2501
2502             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2503
2504             fscal            = felec;
2505
2506             fscal            = _mm_and_ps(fscal,cutoff_mask);
2507
2508             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2509
2510             /* Calculate temporary vectorial force */
2511             tx               = _mm_mul_ps(fscal,dx20);
2512             ty               = _mm_mul_ps(fscal,dy20);
2513             tz               = _mm_mul_ps(fscal,dz20);
2514
2515             /* Update vectorial force */
2516             fix2             = _mm_add_ps(fix2,tx);
2517             fiy2             = _mm_add_ps(fiy2,ty);
2518             fiz2             = _mm_add_ps(fiz2,tz);
2519
2520             fjx0             = _mm_add_ps(fjx0,tx);
2521             fjy0             = _mm_add_ps(fjy0,ty);
2522             fjz0             = _mm_add_ps(fjz0,tz);
2523
2524             }
2525
2526             /**************************
2527              * CALCULATE INTERACTIONS *
2528              **************************/
2529
2530             if (gmx_mm_any_lt(rsq21,rcutoff2))
2531             {
2532
2533             r21              = _mm_mul_ps(rsq21,rinv21);
2534             r21              = _mm_andnot_ps(dummy_mask,r21);
2535
2536             /* EWALD ELECTROSTATICS */
2537
2538             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2539             ewrt             = _mm_mul_ps(r21,ewtabscale);
2540             ewitab           = _mm_cvttps_epi32(ewrt);
2541             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2542             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2543                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2544                                          &ewtabF,&ewtabFn);
2545             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2546             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2547
2548             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2549
2550             fscal            = felec;
2551
2552             fscal            = _mm_and_ps(fscal,cutoff_mask);
2553
2554             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2555
2556             /* Calculate temporary vectorial force */
2557             tx               = _mm_mul_ps(fscal,dx21);
2558             ty               = _mm_mul_ps(fscal,dy21);
2559             tz               = _mm_mul_ps(fscal,dz21);
2560
2561             /* Update vectorial force */
2562             fix2             = _mm_add_ps(fix2,tx);
2563             fiy2             = _mm_add_ps(fiy2,ty);
2564             fiz2             = _mm_add_ps(fiz2,tz);
2565
2566             fjx1             = _mm_add_ps(fjx1,tx);
2567             fjy1             = _mm_add_ps(fjy1,ty);
2568             fjz1             = _mm_add_ps(fjz1,tz);
2569
2570             }
2571
2572             /**************************
2573              * CALCULATE INTERACTIONS *
2574              **************************/
2575
2576             if (gmx_mm_any_lt(rsq22,rcutoff2))
2577             {
2578
2579             r22              = _mm_mul_ps(rsq22,rinv22);
2580             r22              = _mm_andnot_ps(dummy_mask,r22);
2581
2582             /* EWALD ELECTROSTATICS */
2583
2584             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2585             ewrt             = _mm_mul_ps(r22,ewtabscale);
2586             ewitab           = _mm_cvttps_epi32(ewrt);
2587             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2588             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2589                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2590                                          &ewtabF,&ewtabFn);
2591             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2592             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2593
2594             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2595
2596             fscal            = felec;
2597
2598             fscal            = _mm_and_ps(fscal,cutoff_mask);
2599
2600             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2601
2602             /* Calculate temporary vectorial force */
2603             tx               = _mm_mul_ps(fscal,dx22);
2604             ty               = _mm_mul_ps(fscal,dy22);
2605             tz               = _mm_mul_ps(fscal,dz22);
2606
2607             /* Update vectorial force */
2608             fix2             = _mm_add_ps(fix2,tx);
2609             fiy2             = _mm_add_ps(fiy2,ty);
2610             fiz2             = _mm_add_ps(fiz2,tz);
2611
2612             fjx2             = _mm_add_ps(fjx2,tx);
2613             fjy2             = _mm_add_ps(fjy2,ty);
2614             fjz2             = _mm_add_ps(fjz2,tz);
2615
2616             }
2617
2618             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2619             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2620             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2621             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2622
2623             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2624                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2625
2626             /* Inner loop uses 367 flops */
2627         }
2628
2629         /* End of innermost loop */
2630
2631         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2632                                               f+i_coord_offset,fshift+i_shift_offset);
2633
2634         /* Increment number of inner iterations */
2635         inneriter                  += j_index_end - j_index_start;
2636
2637         /* Outer loop uses 18 flops */
2638     }
2639
2640     /* Increment number of outer iterations */
2641     outeriter        += nri;
2642
2643     /* Update outer/inner flops */
2644
2645     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);
2646 }