Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128i          ewitab;
113     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114     real             *ewtab;
115     __m128           dummy_mask,cutoff_mask;
116     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117     __m128           one     = _mm_set1_ps(1.0);
118     __m128           two     = _mm_set1_ps(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm_set1_ps(fr->epsfac);
131     charge           = mdatoms->chargeA;
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
137     ewtab            = fr->ic->tabq_coul_FDV0;
138     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
139     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
140
141     /* Setup water-specific parameters */
142     inr              = nlist->iinr[0];
143     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
144     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
145     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
146     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
147
148     jq0              = _mm_set1_ps(charge[inr+0]);
149     jq1              = _mm_set1_ps(charge[inr+1]);
150     jq2              = _mm_set1_ps(charge[inr+2]);
151     vdwjidx0A        = 2*vdwtype[inr+0];
152     qq00             = _mm_mul_ps(iq0,jq0);
153     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
154     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
155     qq01             = _mm_mul_ps(iq0,jq1);
156     qq02             = _mm_mul_ps(iq0,jq2);
157     qq10             = _mm_mul_ps(iq1,jq0);
158     qq11             = _mm_mul_ps(iq1,jq1);
159     qq12             = _mm_mul_ps(iq1,jq2);
160     qq20             = _mm_mul_ps(iq2,jq0);
161     qq21             = _mm_mul_ps(iq2,jq1);
162     qq22             = _mm_mul_ps(iq2,jq2);
163
164     /* Avoid stupid compiler warnings */
165     jnrA = jnrB = jnrC = jnrD = 0;
166     j_coord_offsetA = 0;
167     j_coord_offsetB = 0;
168     j_coord_offsetC = 0;
169     j_coord_offsetD = 0;
170
171     outeriter        = 0;
172     inneriter        = 0;
173
174     for(iidx=0;iidx<4*DIM;iidx++)
175     {
176         scratch[iidx] = 0.0;
177     }
178
179     /* Start outer loop over neighborlists */
180     for(iidx=0; iidx<nri; iidx++)
181     {
182         /* Load shift vector for this list */
183         i_shift_offset   = DIM*shiftidx[iidx];
184
185         /* Load limits for loop over neighbors */
186         j_index_start    = jindex[iidx];
187         j_index_end      = jindex[iidx+1];
188
189         /* Get outer coordinate index */
190         inr              = iinr[iidx];
191         i_coord_offset   = DIM*inr;
192
193         /* Load i particle coords and add shift vector */
194         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
195                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196
197         fix0             = _mm_setzero_ps();
198         fiy0             = _mm_setzero_ps();
199         fiz0             = _mm_setzero_ps();
200         fix1             = _mm_setzero_ps();
201         fiy1             = _mm_setzero_ps();
202         fiz1             = _mm_setzero_ps();
203         fix2             = _mm_setzero_ps();
204         fiy2             = _mm_setzero_ps();
205         fiz2             = _mm_setzero_ps();
206
207         /* Reset potential sums */
208         velecsum         = _mm_setzero_ps();
209         vvdwsum          = _mm_setzero_ps();
210
211         /* Start inner kernel loop */
212         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213         {
214
215             /* Get j neighbor index, and coordinate index */
216             jnrA             = jjnr[jidx];
217             jnrB             = jjnr[jidx+1];
218             jnrC             = jjnr[jidx+2];
219             jnrD             = jjnr[jidx+3];
220             j_coord_offsetA  = DIM*jnrA;
221             j_coord_offsetB  = DIM*jnrB;
222             j_coord_offsetC  = DIM*jnrC;
223             j_coord_offsetD  = DIM*jnrD;
224
225             /* load j atom coordinates */
226             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
227                                               x+j_coord_offsetC,x+j_coord_offsetD,
228                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
229
230             /* Calculate displacement vector */
231             dx00             = _mm_sub_ps(ix0,jx0);
232             dy00             = _mm_sub_ps(iy0,jy0);
233             dz00             = _mm_sub_ps(iz0,jz0);
234             dx01             = _mm_sub_ps(ix0,jx1);
235             dy01             = _mm_sub_ps(iy0,jy1);
236             dz01             = _mm_sub_ps(iz0,jz1);
237             dx02             = _mm_sub_ps(ix0,jx2);
238             dy02             = _mm_sub_ps(iy0,jy2);
239             dz02             = _mm_sub_ps(iz0,jz2);
240             dx10             = _mm_sub_ps(ix1,jx0);
241             dy10             = _mm_sub_ps(iy1,jy0);
242             dz10             = _mm_sub_ps(iz1,jz0);
243             dx11             = _mm_sub_ps(ix1,jx1);
244             dy11             = _mm_sub_ps(iy1,jy1);
245             dz11             = _mm_sub_ps(iz1,jz1);
246             dx12             = _mm_sub_ps(ix1,jx2);
247             dy12             = _mm_sub_ps(iy1,jy2);
248             dz12             = _mm_sub_ps(iz1,jz2);
249             dx20             = _mm_sub_ps(ix2,jx0);
250             dy20             = _mm_sub_ps(iy2,jy0);
251             dz20             = _mm_sub_ps(iz2,jz0);
252             dx21             = _mm_sub_ps(ix2,jx1);
253             dy21             = _mm_sub_ps(iy2,jy1);
254             dz21             = _mm_sub_ps(iz2,jz1);
255             dx22             = _mm_sub_ps(ix2,jx2);
256             dy22             = _mm_sub_ps(iy2,jy2);
257             dz22             = _mm_sub_ps(iz2,jz2);
258
259             /* Calculate squared distance and things based on it */
260             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
261             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
262             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
263             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
264             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
267             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269
270             rinv00           = gmx_mm_invsqrt_ps(rsq00);
271             rinv01           = gmx_mm_invsqrt_ps(rsq01);
272             rinv02           = gmx_mm_invsqrt_ps(rsq02);
273             rinv10           = gmx_mm_invsqrt_ps(rsq10);
274             rinv11           = gmx_mm_invsqrt_ps(rsq11);
275             rinv12           = gmx_mm_invsqrt_ps(rsq12);
276             rinv20           = gmx_mm_invsqrt_ps(rsq20);
277             rinv21           = gmx_mm_invsqrt_ps(rsq21);
278             rinv22           = gmx_mm_invsqrt_ps(rsq22);
279
280             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
281             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
282             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
283             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
284             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
285             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
286             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
287             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
288             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
289
290             fjx0             = _mm_setzero_ps();
291             fjy0             = _mm_setzero_ps();
292             fjz0             = _mm_setzero_ps();
293             fjx1             = _mm_setzero_ps();
294             fjy1             = _mm_setzero_ps();
295             fjz1             = _mm_setzero_ps();
296             fjx2             = _mm_setzero_ps();
297             fjy2             = _mm_setzero_ps();
298             fjz2             = _mm_setzero_ps();
299
300             /**************************
301              * CALCULATE INTERACTIONS *
302              **************************/
303
304             r00              = _mm_mul_ps(rsq00,rinv00);
305
306             /* EWALD ELECTROSTATICS */
307
308             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
309             ewrt             = _mm_mul_ps(r00,ewtabscale);
310             ewitab           = _mm_cvttps_epi32(ewrt);
311             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
312             ewitab           = _mm_slli_epi32(ewitab,2);
313             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
314             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
315             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
316             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
317             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
318             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
319             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
320             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
321             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
322
323             /* LENNARD-JONES DISPERSION/REPULSION */
324
325             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
326             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
327             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
328             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
329             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
330
331             /* Update potential sum for this i atom from the interaction with this j atom. */
332             velecsum         = _mm_add_ps(velecsum,velec);
333             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
334
335             fscal            = _mm_add_ps(felec,fvdw);
336
337             /* Calculate temporary vectorial force */
338             tx               = _mm_mul_ps(fscal,dx00);
339             ty               = _mm_mul_ps(fscal,dy00);
340             tz               = _mm_mul_ps(fscal,dz00);
341
342             /* Update vectorial force */
343             fix0             = _mm_add_ps(fix0,tx);
344             fiy0             = _mm_add_ps(fiy0,ty);
345             fiz0             = _mm_add_ps(fiz0,tz);
346
347             fjx0             = _mm_add_ps(fjx0,tx);
348             fjy0             = _mm_add_ps(fjy0,ty);
349             fjz0             = _mm_add_ps(fjz0,tz);
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             r01              = _mm_mul_ps(rsq01,rinv01);
356
357             /* EWALD ELECTROSTATICS */
358
359             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
360             ewrt             = _mm_mul_ps(r01,ewtabscale);
361             ewitab           = _mm_cvttps_epi32(ewrt);
362             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
363             ewitab           = _mm_slli_epi32(ewitab,2);
364             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
365             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
366             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
367             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
368             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
369             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
370             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
371             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
372             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
373
374             /* Update potential sum for this i atom from the interaction with this j atom. */
375             velecsum         = _mm_add_ps(velecsum,velec);
376
377             fscal            = felec;
378
379             /* Calculate temporary vectorial force */
380             tx               = _mm_mul_ps(fscal,dx01);
381             ty               = _mm_mul_ps(fscal,dy01);
382             tz               = _mm_mul_ps(fscal,dz01);
383
384             /* Update vectorial force */
385             fix0             = _mm_add_ps(fix0,tx);
386             fiy0             = _mm_add_ps(fiy0,ty);
387             fiz0             = _mm_add_ps(fiz0,tz);
388
389             fjx1             = _mm_add_ps(fjx1,tx);
390             fjy1             = _mm_add_ps(fjy1,ty);
391             fjz1             = _mm_add_ps(fjz1,tz);
392
393             /**************************
394              * CALCULATE INTERACTIONS *
395              **************************/
396
397             r02              = _mm_mul_ps(rsq02,rinv02);
398
399             /* EWALD ELECTROSTATICS */
400
401             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
402             ewrt             = _mm_mul_ps(r02,ewtabscale);
403             ewitab           = _mm_cvttps_epi32(ewrt);
404             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
405             ewitab           = _mm_slli_epi32(ewitab,2);
406             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
407             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
408             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
409             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
410             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
411             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
412             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
413             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
414             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velecsum         = _mm_add_ps(velecsum,velec);
418
419             fscal            = felec;
420
421             /* Calculate temporary vectorial force */
422             tx               = _mm_mul_ps(fscal,dx02);
423             ty               = _mm_mul_ps(fscal,dy02);
424             tz               = _mm_mul_ps(fscal,dz02);
425
426             /* Update vectorial force */
427             fix0             = _mm_add_ps(fix0,tx);
428             fiy0             = _mm_add_ps(fiy0,ty);
429             fiz0             = _mm_add_ps(fiz0,tz);
430
431             fjx2             = _mm_add_ps(fjx2,tx);
432             fjy2             = _mm_add_ps(fjy2,ty);
433             fjz2             = _mm_add_ps(fjz2,tz);
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             r10              = _mm_mul_ps(rsq10,rinv10);
440
441             /* EWALD ELECTROSTATICS */
442
443             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
444             ewrt             = _mm_mul_ps(r10,ewtabscale);
445             ewitab           = _mm_cvttps_epi32(ewrt);
446             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
447             ewitab           = _mm_slli_epi32(ewitab,2);
448             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
449             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
450             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
451             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
452             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
453             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
454             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
455             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
456             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
457
458             /* Update potential sum for this i atom from the interaction with this j atom. */
459             velecsum         = _mm_add_ps(velecsum,velec);
460
461             fscal            = felec;
462
463             /* Calculate temporary vectorial force */
464             tx               = _mm_mul_ps(fscal,dx10);
465             ty               = _mm_mul_ps(fscal,dy10);
466             tz               = _mm_mul_ps(fscal,dz10);
467
468             /* Update vectorial force */
469             fix1             = _mm_add_ps(fix1,tx);
470             fiy1             = _mm_add_ps(fiy1,ty);
471             fiz1             = _mm_add_ps(fiz1,tz);
472
473             fjx0             = _mm_add_ps(fjx0,tx);
474             fjy0             = _mm_add_ps(fjy0,ty);
475             fjz0             = _mm_add_ps(fjz0,tz);
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             r11              = _mm_mul_ps(rsq11,rinv11);
482
483             /* EWALD ELECTROSTATICS */
484
485             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
486             ewrt             = _mm_mul_ps(r11,ewtabscale);
487             ewitab           = _mm_cvttps_epi32(ewrt);
488             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
489             ewitab           = _mm_slli_epi32(ewitab,2);
490             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
491             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
492             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
493             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
494             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
495             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
496             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
497             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
498             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
499
500             /* Update potential sum for this i atom from the interaction with this j atom. */
501             velecsum         = _mm_add_ps(velecsum,velec);
502
503             fscal            = felec;
504
505             /* Calculate temporary vectorial force */
506             tx               = _mm_mul_ps(fscal,dx11);
507             ty               = _mm_mul_ps(fscal,dy11);
508             tz               = _mm_mul_ps(fscal,dz11);
509
510             /* Update vectorial force */
511             fix1             = _mm_add_ps(fix1,tx);
512             fiy1             = _mm_add_ps(fiy1,ty);
513             fiz1             = _mm_add_ps(fiz1,tz);
514
515             fjx1             = _mm_add_ps(fjx1,tx);
516             fjy1             = _mm_add_ps(fjy1,ty);
517             fjz1             = _mm_add_ps(fjz1,tz);
518
519             /**************************
520              * CALCULATE INTERACTIONS *
521              **************************/
522
523             r12              = _mm_mul_ps(rsq12,rinv12);
524
525             /* EWALD ELECTROSTATICS */
526
527             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
528             ewrt             = _mm_mul_ps(r12,ewtabscale);
529             ewitab           = _mm_cvttps_epi32(ewrt);
530             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
531             ewitab           = _mm_slli_epi32(ewitab,2);
532             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
533             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
534             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
535             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
536             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
537             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
538             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
539             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
540             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
541
542             /* Update potential sum for this i atom from the interaction with this j atom. */
543             velecsum         = _mm_add_ps(velecsum,velec);
544
545             fscal            = felec;
546
547             /* Calculate temporary vectorial force */
548             tx               = _mm_mul_ps(fscal,dx12);
549             ty               = _mm_mul_ps(fscal,dy12);
550             tz               = _mm_mul_ps(fscal,dz12);
551
552             /* Update vectorial force */
553             fix1             = _mm_add_ps(fix1,tx);
554             fiy1             = _mm_add_ps(fiy1,ty);
555             fiz1             = _mm_add_ps(fiz1,tz);
556
557             fjx2             = _mm_add_ps(fjx2,tx);
558             fjy2             = _mm_add_ps(fjy2,ty);
559             fjz2             = _mm_add_ps(fjz2,tz);
560
561             /**************************
562              * CALCULATE INTERACTIONS *
563              **************************/
564
565             r20              = _mm_mul_ps(rsq20,rinv20);
566
567             /* EWALD ELECTROSTATICS */
568
569             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570             ewrt             = _mm_mul_ps(r20,ewtabscale);
571             ewitab           = _mm_cvttps_epi32(ewrt);
572             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
573             ewitab           = _mm_slli_epi32(ewitab,2);
574             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
575             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
576             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
577             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
578             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
579             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
580             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
581             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
582             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
583
584             /* Update potential sum for this i atom from the interaction with this j atom. */
585             velecsum         = _mm_add_ps(velecsum,velec);
586
587             fscal            = felec;
588
589             /* Calculate temporary vectorial force */
590             tx               = _mm_mul_ps(fscal,dx20);
591             ty               = _mm_mul_ps(fscal,dy20);
592             tz               = _mm_mul_ps(fscal,dz20);
593
594             /* Update vectorial force */
595             fix2             = _mm_add_ps(fix2,tx);
596             fiy2             = _mm_add_ps(fiy2,ty);
597             fiz2             = _mm_add_ps(fiz2,tz);
598
599             fjx0             = _mm_add_ps(fjx0,tx);
600             fjy0             = _mm_add_ps(fjy0,ty);
601             fjz0             = _mm_add_ps(fjz0,tz);
602
603             /**************************
604              * CALCULATE INTERACTIONS *
605              **************************/
606
607             r21              = _mm_mul_ps(rsq21,rinv21);
608
609             /* EWALD ELECTROSTATICS */
610
611             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
612             ewrt             = _mm_mul_ps(r21,ewtabscale);
613             ewitab           = _mm_cvttps_epi32(ewrt);
614             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
615             ewitab           = _mm_slli_epi32(ewitab,2);
616             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
617             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
618             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
619             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
620             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
621             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
622             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
623             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
624             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
625
626             /* Update potential sum for this i atom from the interaction with this j atom. */
627             velecsum         = _mm_add_ps(velecsum,velec);
628
629             fscal            = felec;
630
631             /* Calculate temporary vectorial force */
632             tx               = _mm_mul_ps(fscal,dx21);
633             ty               = _mm_mul_ps(fscal,dy21);
634             tz               = _mm_mul_ps(fscal,dz21);
635
636             /* Update vectorial force */
637             fix2             = _mm_add_ps(fix2,tx);
638             fiy2             = _mm_add_ps(fiy2,ty);
639             fiz2             = _mm_add_ps(fiz2,tz);
640
641             fjx1             = _mm_add_ps(fjx1,tx);
642             fjy1             = _mm_add_ps(fjy1,ty);
643             fjz1             = _mm_add_ps(fjz1,tz);
644
645             /**************************
646              * CALCULATE INTERACTIONS *
647              **************************/
648
649             r22              = _mm_mul_ps(rsq22,rinv22);
650
651             /* EWALD ELECTROSTATICS */
652
653             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
654             ewrt             = _mm_mul_ps(r22,ewtabscale);
655             ewitab           = _mm_cvttps_epi32(ewrt);
656             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
657             ewitab           = _mm_slli_epi32(ewitab,2);
658             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
659             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
660             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
661             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
662             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
663             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
664             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
665             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
666             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
667
668             /* Update potential sum for this i atom from the interaction with this j atom. */
669             velecsum         = _mm_add_ps(velecsum,velec);
670
671             fscal            = felec;
672
673             /* Calculate temporary vectorial force */
674             tx               = _mm_mul_ps(fscal,dx22);
675             ty               = _mm_mul_ps(fscal,dy22);
676             tz               = _mm_mul_ps(fscal,dz22);
677
678             /* Update vectorial force */
679             fix2             = _mm_add_ps(fix2,tx);
680             fiy2             = _mm_add_ps(fiy2,ty);
681             fiz2             = _mm_add_ps(fiz2,tz);
682
683             fjx2             = _mm_add_ps(fjx2,tx);
684             fjy2             = _mm_add_ps(fjy2,ty);
685             fjz2             = _mm_add_ps(fjz2,tz);
686
687             fjptrA             = f+j_coord_offsetA;
688             fjptrB             = f+j_coord_offsetB;
689             fjptrC             = f+j_coord_offsetC;
690             fjptrD             = f+j_coord_offsetD;
691
692             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
693                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
694
695             /* Inner loop uses 381 flops */
696         }
697
698         if(jidx<j_index_end)
699         {
700
701             /* Get j neighbor index, and coordinate index */
702             jnrlistA         = jjnr[jidx];
703             jnrlistB         = jjnr[jidx+1];
704             jnrlistC         = jjnr[jidx+2];
705             jnrlistD         = jjnr[jidx+3];
706             /* Sign of each element will be negative for non-real atoms.
707              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
708              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
709              */
710             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
711             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
712             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
713             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
714             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
715             j_coord_offsetA  = DIM*jnrA;
716             j_coord_offsetB  = DIM*jnrB;
717             j_coord_offsetC  = DIM*jnrC;
718             j_coord_offsetD  = DIM*jnrD;
719
720             /* load j atom coordinates */
721             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
722                                               x+j_coord_offsetC,x+j_coord_offsetD,
723                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
724
725             /* Calculate displacement vector */
726             dx00             = _mm_sub_ps(ix0,jx0);
727             dy00             = _mm_sub_ps(iy0,jy0);
728             dz00             = _mm_sub_ps(iz0,jz0);
729             dx01             = _mm_sub_ps(ix0,jx1);
730             dy01             = _mm_sub_ps(iy0,jy1);
731             dz01             = _mm_sub_ps(iz0,jz1);
732             dx02             = _mm_sub_ps(ix0,jx2);
733             dy02             = _mm_sub_ps(iy0,jy2);
734             dz02             = _mm_sub_ps(iz0,jz2);
735             dx10             = _mm_sub_ps(ix1,jx0);
736             dy10             = _mm_sub_ps(iy1,jy0);
737             dz10             = _mm_sub_ps(iz1,jz0);
738             dx11             = _mm_sub_ps(ix1,jx1);
739             dy11             = _mm_sub_ps(iy1,jy1);
740             dz11             = _mm_sub_ps(iz1,jz1);
741             dx12             = _mm_sub_ps(ix1,jx2);
742             dy12             = _mm_sub_ps(iy1,jy2);
743             dz12             = _mm_sub_ps(iz1,jz2);
744             dx20             = _mm_sub_ps(ix2,jx0);
745             dy20             = _mm_sub_ps(iy2,jy0);
746             dz20             = _mm_sub_ps(iz2,jz0);
747             dx21             = _mm_sub_ps(ix2,jx1);
748             dy21             = _mm_sub_ps(iy2,jy1);
749             dz21             = _mm_sub_ps(iz2,jz1);
750             dx22             = _mm_sub_ps(ix2,jx2);
751             dy22             = _mm_sub_ps(iy2,jy2);
752             dz22             = _mm_sub_ps(iz2,jz2);
753
754             /* Calculate squared distance and things based on it */
755             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
756             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
757             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
758             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
759             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
760             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
761             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
762             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
763             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
764
765             rinv00           = gmx_mm_invsqrt_ps(rsq00);
766             rinv01           = gmx_mm_invsqrt_ps(rsq01);
767             rinv02           = gmx_mm_invsqrt_ps(rsq02);
768             rinv10           = gmx_mm_invsqrt_ps(rsq10);
769             rinv11           = gmx_mm_invsqrt_ps(rsq11);
770             rinv12           = gmx_mm_invsqrt_ps(rsq12);
771             rinv20           = gmx_mm_invsqrt_ps(rsq20);
772             rinv21           = gmx_mm_invsqrt_ps(rsq21);
773             rinv22           = gmx_mm_invsqrt_ps(rsq22);
774
775             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
776             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
777             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
778             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
779             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
780             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
781             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
782             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
783             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
784
785             fjx0             = _mm_setzero_ps();
786             fjy0             = _mm_setzero_ps();
787             fjz0             = _mm_setzero_ps();
788             fjx1             = _mm_setzero_ps();
789             fjy1             = _mm_setzero_ps();
790             fjz1             = _mm_setzero_ps();
791             fjx2             = _mm_setzero_ps();
792             fjy2             = _mm_setzero_ps();
793             fjz2             = _mm_setzero_ps();
794
795             /**************************
796              * CALCULATE INTERACTIONS *
797              **************************/
798
799             r00              = _mm_mul_ps(rsq00,rinv00);
800             r00              = _mm_andnot_ps(dummy_mask,r00);
801
802             /* EWALD ELECTROSTATICS */
803
804             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
805             ewrt             = _mm_mul_ps(r00,ewtabscale);
806             ewitab           = _mm_cvttps_epi32(ewrt);
807             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
808             ewitab           = _mm_slli_epi32(ewitab,2);
809             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
810             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
811             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
812             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
813             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
814             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
815             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
816             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
817             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
818
819             /* LENNARD-JONES DISPERSION/REPULSION */
820
821             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
822             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
823             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
824             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
825             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
826
827             /* Update potential sum for this i atom from the interaction with this j atom. */
828             velec            = _mm_andnot_ps(dummy_mask,velec);
829             velecsum         = _mm_add_ps(velecsum,velec);
830             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
831             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
832
833             fscal            = _mm_add_ps(felec,fvdw);
834
835             fscal            = _mm_andnot_ps(dummy_mask,fscal);
836
837             /* Calculate temporary vectorial force */
838             tx               = _mm_mul_ps(fscal,dx00);
839             ty               = _mm_mul_ps(fscal,dy00);
840             tz               = _mm_mul_ps(fscal,dz00);
841
842             /* Update vectorial force */
843             fix0             = _mm_add_ps(fix0,tx);
844             fiy0             = _mm_add_ps(fiy0,ty);
845             fiz0             = _mm_add_ps(fiz0,tz);
846
847             fjx0             = _mm_add_ps(fjx0,tx);
848             fjy0             = _mm_add_ps(fjy0,ty);
849             fjz0             = _mm_add_ps(fjz0,tz);
850
851             /**************************
852              * CALCULATE INTERACTIONS *
853              **************************/
854
855             r01              = _mm_mul_ps(rsq01,rinv01);
856             r01              = _mm_andnot_ps(dummy_mask,r01);
857
858             /* EWALD ELECTROSTATICS */
859
860             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
861             ewrt             = _mm_mul_ps(r01,ewtabscale);
862             ewitab           = _mm_cvttps_epi32(ewrt);
863             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
864             ewitab           = _mm_slli_epi32(ewitab,2);
865             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
866             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
867             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
868             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
869             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
870             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
871             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
872             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
873             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
874
875             /* Update potential sum for this i atom from the interaction with this j atom. */
876             velec            = _mm_andnot_ps(dummy_mask,velec);
877             velecsum         = _mm_add_ps(velecsum,velec);
878
879             fscal            = felec;
880
881             fscal            = _mm_andnot_ps(dummy_mask,fscal);
882
883             /* Calculate temporary vectorial force */
884             tx               = _mm_mul_ps(fscal,dx01);
885             ty               = _mm_mul_ps(fscal,dy01);
886             tz               = _mm_mul_ps(fscal,dz01);
887
888             /* Update vectorial force */
889             fix0             = _mm_add_ps(fix0,tx);
890             fiy0             = _mm_add_ps(fiy0,ty);
891             fiz0             = _mm_add_ps(fiz0,tz);
892
893             fjx1             = _mm_add_ps(fjx1,tx);
894             fjy1             = _mm_add_ps(fjy1,ty);
895             fjz1             = _mm_add_ps(fjz1,tz);
896
897             /**************************
898              * CALCULATE INTERACTIONS *
899              **************************/
900
901             r02              = _mm_mul_ps(rsq02,rinv02);
902             r02              = _mm_andnot_ps(dummy_mask,r02);
903
904             /* EWALD ELECTROSTATICS */
905
906             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907             ewrt             = _mm_mul_ps(r02,ewtabscale);
908             ewitab           = _mm_cvttps_epi32(ewrt);
909             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
910             ewitab           = _mm_slli_epi32(ewitab,2);
911             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
912             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
913             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
914             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
915             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
916             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
917             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
918             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
919             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
920
921             /* Update potential sum for this i atom from the interaction with this j atom. */
922             velec            = _mm_andnot_ps(dummy_mask,velec);
923             velecsum         = _mm_add_ps(velecsum,velec);
924
925             fscal            = felec;
926
927             fscal            = _mm_andnot_ps(dummy_mask,fscal);
928
929             /* Calculate temporary vectorial force */
930             tx               = _mm_mul_ps(fscal,dx02);
931             ty               = _mm_mul_ps(fscal,dy02);
932             tz               = _mm_mul_ps(fscal,dz02);
933
934             /* Update vectorial force */
935             fix0             = _mm_add_ps(fix0,tx);
936             fiy0             = _mm_add_ps(fiy0,ty);
937             fiz0             = _mm_add_ps(fiz0,tz);
938
939             fjx2             = _mm_add_ps(fjx2,tx);
940             fjy2             = _mm_add_ps(fjy2,ty);
941             fjz2             = _mm_add_ps(fjz2,tz);
942
943             /**************************
944              * CALCULATE INTERACTIONS *
945              **************************/
946
947             r10              = _mm_mul_ps(rsq10,rinv10);
948             r10              = _mm_andnot_ps(dummy_mask,r10);
949
950             /* EWALD ELECTROSTATICS */
951
952             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
953             ewrt             = _mm_mul_ps(r10,ewtabscale);
954             ewitab           = _mm_cvttps_epi32(ewrt);
955             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
956             ewitab           = _mm_slli_epi32(ewitab,2);
957             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
958             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
959             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
960             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
961             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
962             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
963             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
964             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
965             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
966
967             /* Update potential sum for this i atom from the interaction with this j atom. */
968             velec            = _mm_andnot_ps(dummy_mask,velec);
969             velecsum         = _mm_add_ps(velecsum,velec);
970
971             fscal            = felec;
972
973             fscal            = _mm_andnot_ps(dummy_mask,fscal);
974
975             /* Calculate temporary vectorial force */
976             tx               = _mm_mul_ps(fscal,dx10);
977             ty               = _mm_mul_ps(fscal,dy10);
978             tz               = _mm_mul_ps(fscal,dz10);
979
980             /* Update vectorial force */
981             fix1             = _mm_add_ps(fix1,tx);
982             fiy1             = _mm_add_ps(fiy1,ty);
983             fiz1             = _mm_add_ps(fiz1,tz);
984
985             fjx0             = _mm_add_ps(fjx0,tx);
986             fjy0             = _mm_add_ps(fjy0,ty);
987             fjz0             = _mm_add_ps(fjz0,tz);
988
989             /**************************
990              * CALCULATE INTERACTIONS *
991              **************************/
992
993             r11              = _mm_mul_ps(rsq11,rinv11);
994             r11              = _mm_andnot_ps(dummy_mask,r11);
995
996             /* EWALD ELECTROSTATICS */
997
998             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
999             ewrt             = _mm_mul_ps(r11,ewtabscale);
1000             ewitab           = _mm_cvttps_epi32(ewrt);
1001             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1002             ewitab           = _mm_slli_epi32(ewitab,2);
1003             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1004             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1005             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1006             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1007             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1008             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1009             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1010             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1011             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1012
1013             /* Update potential sum for this i atom from the interaction with this j atom. */
1014             velec            = _mm_andnot_ps(dummy_mask,velec);
1015             velecsum         = _mm_add_ps(velecsum,velec);
1016
1017             fscal            = felec;
1018
1019             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1020
1021             /* Calculate temporary vectorial force */
1022             tx               = _mm_mul_ps(fscal,dx11);
1023             ty               = _mm_mul_ps(fscal,dy11);
1024             tz               = _mm_mul_ps(fscal,dz11);
1025
1026             /* Update vectorial force */
1027             fix1             = _mm_add_ps(fix1,tx);
1028             fiy1             = _mm_add_ps(fiy1,ty);
1029             fiz1             = _mm_add_ps(fiz1,tz);
1030
1031             fjx1             = _mm_add_ps(fjx1,tx);
1032             fjy1             = _mm_add_ps(fjy1,ty);
1033             fjz1             = _mm_add_ps(fjz1,tz);
1034
1035             /**************************
1036              * CALCULATE INTERACTIONS *
1037              **************************/
1038
1039             r12              = _mm_mul_ps(rsq12,rinv12);
1040             r12              = _mm_andnot_ps(dummy_mask,r12);
1041
1042             /* EWALD ELECTROSTATICS */
1043
1044             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1045             ewrt             = _mm_mul_ps(r12,ewtabscale);
1046             ewitab           = _mm_cvttps_epi32(ewrt);
1047             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1048             ewitab           = _mm_slli_epi32(ewitab,2);
1049             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1050             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1051             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1052             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1053             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1054             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1055             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1056             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1057             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1058
1059             /* Update potential sum for this i atom from the interaction with this j atom. */
1060             velec            = _mm_andnot_ps(dummy_mask,velec);
1061             velecsum         = _mm_add_ps(velecsum,velec);
1062
1063             fscal            = felec;
1064
1065             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1066
1067             /* Calculate temporary vectorial force */
1068             tx               = _mm_mul_ps(fscal,dx12);
1069             ty               = _mm_mul_ps(fscal,dy12);
1070             tz               = _mm_mul_ps(fscal,dz12);
1071
1072             /* Update vectorial force */
1073             fix1             = _mm_add_ps(fix1,tx);
1074             fiy1             = _mm_add_ps(fiy1,ty);
1075             fiz1             = _mm_add_ps(fiz1,tz);
1076
1077             fjx2             = _mm_add_ps(fjx2,tx);
1078             fjy2             = _mm_add_ps(fjy2,ty);
1079             fjz2             = _mm_add_ps(fjz2,tz);
1080
1081             /**************************
1082              * CALCULATE INTERACTIONS *
1083              **************************/
1084
1085             r20              = _mm_mul_ps(rsq20,rinv20);
1086             r20              = _mm_andnot_ps(dummy_mask,r20);
1087
1088             /* EWALD ELECTROSTATICS */
1089
1090             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1091             ewrt             = _mm_mul_ps(r20,ewtabscale);
1092             ewitab           = _mm_cvttps_epi32(ewrt);
1093             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1094             ewitab           = _mm_slli_epi32(ewitab,2);
1095             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1096             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1097             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1098             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1099             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1100             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1101             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1102             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1103             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1104
1105             /* Update potential sum for this i atom from the interaction with this j atom. */
1106             velec            = _mm_andnot_ps(dummy_mask,velec);
1107             velecsum         = _mm_add_ps(velecsum,velec);
1108
1109             fscal            = felec;
1110
1111             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1112
1113             /* Calculate temporary vectorial force */
1114             tx               = _mm_mul_ps(fscal,dx20);
1115             ty               = _mm_mul_ps(fscal,dy20);
1116             tz               = _mm_mul_ps(fscal,dz20);
1117
1118             /* Update vectorial force */
1119             fix2             = _mm_add_ps(fix2,tx);
1120             fiy2             = _mm_add_ps(fiy2,ty);
1121             fiz2             = _mm_add_ps(fiz2,tz);
1122
1123             fjx0             = _mm_add_ps(fjx0,tx);
1124             fjy0             = _mm_add_ps(fjy0,ty);
1125             fjz0             = _mm_add_ps(fjz0,tz);
1126
1127             /**************************
1128              * CALCULATE INTERACTIONS *
1129              **************************/
1130
1131             r21              = _mm_mul_ps(rsq21,rinv21);
1132             r21              = _mm_andnot_ps(dummy_mask,r21);
1133
1134             /* EWALD ELECTROSTATICS */
1135
1136             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1137             ewrt             = _mm_mul_ps(r21,ewtabscale);
1138             ewitab           = _mm_cvttps_epi32(ewrt);
1139             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1140             ewitab           = _mm_slli_epi32(ewitab,2);
1141             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1142             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1143             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1144             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1145             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1146             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1147             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1148             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1149             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1150
1151             /* Update potential sum for this i atom from the interaction with this j atom. */
1152             velec            = _mm_andnot_ps(dummy_mask,velec);
1153             velecsum         = _mm_add_ps(velecsum,velec);
1154
1155             fscal            = felec;
1156
1157             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1158
1159             /* Calculate temporary vectorial force */
1160             tx               = _mm_mul_ps(fscal,dx21);
1161             ty               = _mm_mul_ps(fscal,dy21);
1162             tz               = _mm_mul_ps(fscal,dz21);
1163
1164             /* Update vectorial force */
1165             fix2             = _mm_add_ps(fix2,tx);
1166             fiy2             = _mm_add_ps(fiy2,ty);
1167             fiz2             = _mm_add_ps(fiz2,tz);
1168
1169             fjx1             = _mm_add_ps(fjx1,tx);
1170             fjy1             = _mm_add_ps(fjy1,ty);
1171             fjz1             = _mm_add_ps(fjz1,tz);
1172
1173             /**************************
1174              * CALCULATE INTERACTIONS *
1175              **************************/
1176
1177             r22              = _mm_mul_ps(rsq22,rinv22);
1178             r22              = _mm_andnot_ps(dummy_mask,r22);
1179
1180             /* EWALD ELECTROSTATICS */
1181
1182             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1183             ewrt             = _mm_mul_ps(r22,ewtabscale);
1184             ewitab           = _mm_cvttps_epi32(ewrt);
1185             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1186             ewitab           = _mm_slli_epi32(ewitab,2);
1187             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1188             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1189             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1190             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1191             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1192             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1193             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1194             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1195             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1196
1197             /* Update potential sum for this i atom from the interaction with this j atom. */
1198             velec            = _mm_andnot_ps(dummy_mask,velec);
1199             velecsum         = _mm_add_ps(velecsum,velec);
1200
1201             fscal            = felec;
1202
1203             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1204
1205             /* Calculate temporary vectorial force */
1206             tx               = _mm_mul_ps(fscal,dx22);
1207             ty               = _mm_mul_ps(fscal,dy22);
1208             tz               = _mm_mul_ps(fscal,dz22);
1209
1210             /* Update vectorial force */
1211             fix2             = _mm_add_ps(fix2,tx);
1212             fiy2             = _mm_add_ps(fiy2,ty);
1213             fiz2             = _mm_add_ps(fiz2,tz);
1214
1215             fjx2             = _mm_add_ps(fjx2,tx);
1216             fjy2             = _mm_add_ps(fjy2,ty);
1217             fjz2             = _mm_add_ps(fjz2,tz);
1218
1219             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1220             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1221             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1222             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1223
1224             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1225                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1226
1227             /* Inner loop uses 390 flops */
1228         }
1229
1230         /* End of innermost loop */
1231
1232         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1233                                               f+i_coord_offset,fshift+i_shift_offset);
1234
1235         ggid                        = gid[iidx];
1236         /* Update potential energies */
1237         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1238         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1239
1240         /* Increment number of inner iterations */
1241         inneriter                  += j_index_end - j_index_start;
1242
1243         /* Outer loop uses 20 flops */
1244     }
1245
1246     /* Increment number of outer iterations */
1247     outeriter        += nri;
1248
1249     /* Update outer/inner flops */
1250
1251     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1252 }
1253 /*
1254  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single
1255  * Electrostatics interaction: Ewald
1256  * VdW interaction:            LennardJones
1257  * Geometry:                   Water3-Water3
1258  * Calculate force/pot:        Force
1259  */
1260 void
1261 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single
1262                     (t_nblist                    * gmx_restrict       nlist,
1263                      rvec                        * gmx_restrict          xx,
1264                      rvec                        * gmx_restrict          ff,
1265                      t_forcerec                  * gmx_restrict          fr,
1266                      t_mdatoms                   * gmx_restrict     mdatoms,
1267                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1268                      t_nrnb                      * gmx_restrict        nrnb)
1269 {
1270     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1271      * just 0 for non-waters.
1272      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1273      * jnr indices corresponding to data put in the four positions in the SIMD register.
1274      */
1275     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1276     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1277     int              jnrA,jnrB,jnrC,jnrD;
1278     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1279     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1280     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1281     real             rcutoff_scalar;
1282     real             *shiftvec,*fshift,*x,*f;
1283     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1284     real             scratch[4*DIM];
1285     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1286     int              vdwioffset0;
1287     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1288     int              vdwioffset1;
1289     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1290     int              vdwioffset2;
1291     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1292     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1293     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1294     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1295     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1296     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1297     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1298     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1299     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1300     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1301     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1302     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1303     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1304     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1305     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1306     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1307     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1308     real             *charge;
1309     int              nvdwtype;
1310     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1311     int              *vdwtype;
1312     real             *vdwparam;
1313     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1314     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1315     __m128i          ewitab;
1316     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1317     real             *ewtab;
1318     __m128           dummy_mask,cutoff_mask;
1319     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1320     __m128           one     = _mm_set1_ps(1.0);
1321     __m128           two     = _mm_set1_ps(2.0);
1322     x                = xx[0];
1323     f                = ff[0];
1324
1325     nri              = nlist->nri;
1326     iinr             = nlist->iinr;
1327     jindex           = nlist->jindex;
1328     jjnr             = nlist->jjnr;
1329     shiftidx         = nlist->shift;
1330     gid              = nlist->gid;
1331     shiftvec         = fr->shift_vec[0];
1332     fshift           = fr->fshift[0];
1333     facel            = _mm_set1_ps(fr->epsfac);
1334     charge           = mdatoms->chargeA;
1335     nvdwtype         = fr->ntype;
1336     vdwparam         = fr->nbfp;
1337     vdwtype          = mdatoms->typeA;
1338
1339     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1340     ewtab            = fr->ic->tabq_coul_F;
1341     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1342     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1343
1344     /* Setup water-specific parameters */
1345     inr              = nlist->iinr[0];
1346     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1347     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1348     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1349     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1350
1351     jq0              = _mm_set1_ps(charge[inr+0]);
1352     jq1              = _mm_set1_ps(charge[inr+1]);
1353     jq2              = _mm_set1_ps(charge[inr+2]);
1354     vdwjidx0A        = 2*vdwtype[inr+0];
1355     qq00             = _mm_mul_ps(iq0,jq0);
1356     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1357     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1358     qq01             = _mm_mul_ps(iq0,jq1);
1359     qq02             = _mm_mul_ps(iq0,jq2);
1360     qq10             = _mm_mul_ps(iq1,jq0);
1361     qq11             = _mm_mul_ps(iq1,jq1);
1362     qq12             = _mm_mul_ps(iq1,jq2);
1363     qq20             = _mm_mul_ps(iq2,jq0);
1364     qq21             = _mm_mul_ps(iq2,jq1);
1365     qq22             = _mm_mul_ps(iq2,jq2);
1366
1367     /* Avoid stupid compiler warnings */
1368     jnrA = jnrB = jnrC = jnrD = 0;
1369     j_coord_offsetA = 0;
1370     j_coord_offsetB = 0;
1371     j_coord_offsetC = 0;
1372     j_coord_offsetD = 0;
1373
1374     outeriter        = 0;
1375     inneriter        = 0;
1376
1377     for(iidx=0;iidx<4*DIM;iidx++)
1378     {
1379         scratch[iidx] = 0.0;
1380     }
1381
1382     /* Start outer loop over neighborlists */
1383     for(iidx=0; iidx<nri; iidx++)
1384     {
1385         /* Load shift vector for this list */
1386         i_shift_offset   = DIM*shiftidx[iidx];
1387
1388         /* Load limits for loop over neighbors */
1389         j_index_start    = jindex[iidx];
1390         j_index_end      = jindex[iidx+1];
1391
1392         /* Get outer coordinate index */
1393         inr              = iinr[iidx];
1394         i_coord_offset   = DIM*inr;
1395
1396         /* Load i particle coords and add shift vector */
1397         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1398                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1399
1400         fix0             = _mm_setzero_ps();
1401         fiy0             = _mm_setzero_ps();
1402         fiz0             = _mm_setzero_ps();
1403         fix1             = _mm_setzero_ps();
1404         fiy1             = _mm_setzero_ps();
1405         fiz1             = _mm_setzero_ps();
1406         fix2             = _mm_setzero_ps();
1407         fiy2             = _mm_setzero_ps();
1408         fiz2             = _mm_setzero_ps();
1409
1410         /* Start inner kernel loop */
1411         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1412         {
1413
1414             /* Get j neighbor index, and coordinate index */
1415             jnrA             = jjnr[jidx];
1416             jnrB             = jjnr[jidx+1];
1417             jnrC             = jjnr[jidx+2];
1418             jnrD             = jjnr[jidx+3];
1419             j_coord_offsetA  = DIM*jnrA;
1420             j_coord_offsetB  = DIM*jnrB;
1421             j_coord_offsetC  = DIM*jnrC;
1422             j_coord_offsetD  = DIM*jnrD;
1423
1424             /* load j atom coordinates */
1425             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1426                                               x+j_coord_offsetC,x+j_coord_offsetD,
1427                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1428
1429             /* Calculate displacement vector */
1430             dx00             = _mm_sub_ps(ix0,jx0);
1431             dy00             = _mm_sub_ps(iy0,jy0);
1432             dz00             = _mm_sub_ps(iz0,jz0);
1433             dx01             = _mm_sub_ps(ix0,jx1);
1434             dy01             = _mm_sub_ps(iy0,jy1);
1435             dz01             = _mm_sub_ps(iz0,jz1);
1436             dx02             = _mm_sub_ps(ix0,jx2);
1437             dy02             = _mm_sub_ps(iy0,jy2);
1438             dz02             = _mm_sub_ps(iz0,jz2);
1439             dx10             = _mm_sub_ps(ix1,jx0);
1440             dy10             = _mm_sub_ps(iy1,jy0);
1441             dz10             = _mm_sub_ps(iz1,jz0);
1442             dx11             = _mm_sub_ps(ix1,jx1);
1443             dy11             = _mm_sub_ps(iy1,jy1);
1444             dz11             = _mm_sub_ps(iz1,jz1);
1445             dx12             = _mm_sub_ps(ix1,jx2);
1446             dy12             = _mm_sub_ps(iy1,jy2);
1447             dz12             = _mm_sub_ps(iz1,jz2);
1448             dx20             = _mm_sub_ps(ix2,jx0);
1449             dy20             = _mm_sub_ps(iy2,jy0);
1450             dz20             = _mm_sub_ps(iz2,jz0);
1451             dx21             = _mm_sub_ps(ix2,jx1);
1452             dy21             = _mm_sub_ps(iy2,jy1);
1453             dz21             = _mm_sub_ps(iz2,jz1);
1454             dx22             = _mm_sub_ps(ix2,jx2);
1455             dy22             = _mm_sub_ps(iy2,jy2);
1456             dz22             = _mm_sub_ps(iz2,jz2);
1457
1458             /* Calculate squared distance and things based on it */
1459             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1460             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1461             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1462             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1463             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1464             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1465             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1466             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1467             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1468
1469             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1470             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1471             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1472             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1473             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1474             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1475             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1476             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1477             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1478
1479             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1480             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1481             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1482             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1483             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1484             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1485             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1486             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1487             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1488
1489             fjx0             = _mm_setzero_ps();
1490             fjy0             = _mm_setzero_ps();
1491             fjz0             = _mm_setzero_ps();
1492             fjx1             = _mm_setzero_ps();
1493             fjy1             = _mm_setzero_ps();
1494             fjz1             = _mm_setzero_ps();
1495             fjx2             = _mm_setzero_ps();
1496             fjy2             = _mm_setzero_ps();
1497             fjz2             = _mm_setzero_ps();
1498
1499             /**************************
1500              * CALCULATE INTERACTIONS *
1501              **************************/
1502
1503             r00              = _mm_mul_ps(rsq00,rinv00);
1504
1505             /* EWALD ELECTROSTATICS */
1506
1507             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1508             ewrt             = _mm_mul_ps(r00,ewtabscale);
1509             ewitab           = _mm_cvttps_epi32(ewrt);
1510             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1511             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1512                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1513                                          &ewtabF,&ewtabFn);
1514             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1515             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1516
1517             /* LENNARD-JONES DISPERSION/REPULSION */
1518
1519             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1520             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1521
1522             fscal            = _mm_add_ps(felec,fvdw);
1523
1524             /* Calculate temporary vectorial force */
1525             tx               = _mm_mul_ps(fscal,dx00);
1526             ty               = _mm_mul_ps(fscal,dy00);
1527             tz               = _mm_mul_ps(fscal,dz00);
1528
1529             /* Update vectorial force */
1530             fix0             = _mm_add_ps(fix0,tx);
1531             fiy0             = _mm_add_ps(fiy0,ty);
1532             fiz0             = _mm_add_ps(fiz0,tz);
1533
1534             fjx0             = _mm_add_ps(fjx0,tx);
1535             fjy0             = _mm_add_ps(fjy0,ty);
1536             fjz0             = _mm_add_ps(fjz0,tz);
1537
1538             /**************************
1539              * CALCULATE INTERACTIONS *
1540              **************************/
1541
1542             r01              = _mm_mul_ps(rsq01,rinv01);
1543
1544             /* EWALD ELECTROSTATICS */
1545
1546             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1547             ewrt             = _mm_mul_ps(r01,ewtabscale);
1548             ewitab           = _mm_cvttps_epi32(ewrt);
1549             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1550             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1551                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1552                                          &ewtabF,&ewtabFn);
1553             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1554             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1555
1556             fscal            = felec;
1557
1558             /* Calculate temporary vectorial force */
1559             tx               = _mm_mul_ps(fscal,dx01);
1560             ty               = _mm_mul_ps(fscal,dy01);
1561             tz               = _mm_mul_ps(fscal,dz01);
1562
1563             /* Update vectorial force */
1564             fix0             = _mm_add_ps(fix0,tx);
1565             fiy0             = _mm_add_ps(fiy0,ty);
1566             fiz0             = _mm_add_ps(fiz0,tz);
1567
1568             fjx1             = _mm_add_ps(fjx1,tx);
1569             fjy1             = _mm_add_ps(fjy1,ty);
1570             fjz1             = _mm_add_ps(fjz1,tz);
1571
1572             /**************************
1573              * CALCULATE INTERACTIONS *
1574              **************************/
1575
1576             r02              = _mm_mul_ps(rsq02,rinv02);
1577
1578             /* EWALD ELECTROSTATICS */
1579
1580             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1581             ewrt             = _mm_mul_ps(r02,ewtabscale);
1582             ewitab           = _mm_cvttps_epi32(ewrt);
1583             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1584             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1585                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1586                                          &ewtabF,&ewtabFn);
1587             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1588             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1589
1590             fscal            = felec;
1591
1592             /* Calculate temporary vectorial force */
1593             tx               = _mm_mul_ps(fscal,dx02);
1594             ty               = _mm_mul_ps(fscal,dy02);
1595             tz               = _mm_mul_ps(fscal,dz02);
1596
1597             /* Update vectorial force */
1598             fix0             = _mm_add_ps(fix0,tx);
1599             fiy0             = _mm_add_ps(fiy0,ty);
1600             fiz0             = _mm_add_ps(fiz0,tz);
1601
1602             fjx2             = _mm_add_ps(fjx2,tx);
1603             fjy2             = _mm_add_ps(fjy2,ty);
1604             fjz2             = _mm_add_ps(fjz2,tz);
1605
1606             /**************************
1607              * CALCULATE INTERACTIONS *
1608              **************************/
1609
1610             r10              = _mm_mul_ps(rsq10,rinv10);
1611
1612             /* EWALD ELECTROSTATICS */
1613
1614             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1615             ewrt             = _mm_mul_ps(r10,ewtabscale);
1616             ewitab           = _mm_cvttps_epi32(ewrt);
1617             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1618             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1619                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1620                                          &ewtabF,&ewtabFn);
1621             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1622             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1623
1624             fscal            = felec;
1625
1626             /* Calculate temporary vectorial force */
1627             tx               = _mm_mul_ps(fscal,dx10);
1628             ty               = _mm_mul_ps(fscal,dy10);
1629             tz               = _mm_mul_ps(fscal,dz10);
1630
1631             /* Update vectorial force */
1632             fix1             = _mm_add_ps(fix1,tx);
1633             fiy1             = _mm_add_ps(fiy1,ty);
1634             fiz1             = _mm_add_ps(fiz1,tz);
1635
1636             fjx0             = _mm_add_ps(fjx0,tx);
1637             fjy0             = _mm_add_ps(fjy0,ty);
1638             fjz0             = _mm_add_ps(fjz0,tz);
1639
1640             /**************************
1641              * CALCULATE INTERACTIONS *
1642              **************************/
1643
1644             r11              = _mm_mul_ps(rsq11,rinv11);
1645
1646             /* EWALD ELECTROSTATICS */
1647
1648             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1649             ewrt             = _mm_mul_ps(r11,ewtabscale);
1650             ewitab           = _mm_cvttps_epi32(ewrt);
1651             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1652             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1653                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1654                                          &ewtabF,&ewtabFn);
1655             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1656             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1657
1658             fscal            = felec;
1659
1660             /* Calculate temporary vectorial force */
1661             tx               = _mm_mul_ps(fscal,dx11);
1662             ty               = _mm_mul_ps(fscal,dy11);
1663             tz               = _mm_mul_ps(fscal,dz11);
1664
1665             /* Update vectorial force */
1666             fix1             = _mm_add_ps(fix1,tx);
1667             fiy1             = _mm_add_ps(fiy1,ty);
1668             fiz1             = _mm_add_ps(fiz1,tz);
1669
1670             fjx1             = _mm_add_ps(fjx1,tx);
1671             fjy1             = _mm_add_ps(fjy1,ty);
1672             fjz1             = _mm_add_ps(fjz1,tz);
1673
1674             /**************************
1675              * CALCULATE INTERACTIONS *
1676              **************************/
1677
1678             r12              = _mm_mul_ps(rsq12,rinv12);
1679
1680             /* EWALD ELECTROSTATICS */
1681
1682             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683             ewrt             = _mm_mul_ps(r12,ewtabscale);
1684             ewitab           = _mm_cvttps_epi32(ewrt);
1685             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1686             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1687                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1688                                          &ewtabF,&ewtabFn);
1689             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1690             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1691
1692             fscal            = felec;
1693
1694             /* Calculate temporary vectorial force */
1695             tx               = _mm_mul_ps(fscal,dx12);
1696             ty               = _mm_mul_ps(fscal,dy12);
1697             tz               = _mm_mul_ps(fscal,dz12);
1698
1699             /* Update vectorial force */
1700             fix1             = _mm_add_ps(fix1,tx);
1701             fiy1             = _mm_add_ps(fiy1,ty);
1702             fiz1             = _mm_add_ps(fiz1,tz);
1703
1704             fjx2             = _mm_add_ps(fjx2,tx);
1705             fjy2             = _mm_add_ps(fjy2,ty);
1706             fjz2             = _mm_add_ps(fjz2,tz);
1707
1708             /**************************
1709              * CALCULATE INTERACTIONS *
1710              **************************/
1711
1712             r20              = _mm_mul_ps(rsq20,rinv20);
1713
1714             /* EWALD ELECTROSTATICS */
1715
1716             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1717             ewrt             = _mm_mul_ps(r20,ewtabscale);
1718             ewitab           = _mm_cvttps_epi32(ewrt);
1719             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1720             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1721                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1722                                          &ewtabF,&ewtabFn);
1723             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1724             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1725
1726             fscal            = felec;
1727
1728             /* Calculate temporary vectorial force */
1729             tx               = _mm_mul_ps(fscal,dx20);
1730             ty               = _mm_mul_ps(fscal,dy20);
1731             tz               = _mm_mul_ps(fscal,dz20);
1732
1733             /* Update vectorial force */
1734             fix2             = _mm_add_ps(fix2,tx);
1735             fiy2             = _mm_add_ps(fiy2,ty);
1736             fiz2             = _mm_add_ps(fiz2,tz);
1737
1738             fjx0             = _mm_add_ps(fjx0,tx);
1739             fjy0             = _mm_add_ps(fjy0,ty);
1740             fjz0             = _mm_add_ps(fjz0,tz);
1741
1742             /**************************
1743              * CALCULATE INTERACTIONS *
1744              **************************/
1745
1746             r21              = _mm_mul_ps(rsq21,rinv21);
1747
1748             /* EWALD ELECTROSTATICS */
1749
1750             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1751             ewrt             = _mm_mul_ps(r21,ewtabscale);
1752             ewitab           = _mm_cvttps_epi32(ewrt);
1753             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1754             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1755                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1756                                          &ewtabF,&ewtabFn);
1757             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1758             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1759
1760             fscal            = felec;
1761
1762             /* Calculate temporary vectorial force */
1763             tx               = _mm_mul_ps(fscal,dx21);
1764             ty               = _mm_mul_ps(fscal,dy21);
1765             tz               = _mm_mul_ps(fscal,dz21);
1766
1767             /* Update vectorial force */
1768             fix2             = _mm_add_ps(fix2,tx);
1769             fiy2             = _mm_add_ps(fiy2,ty);
1770             fiz2             = _mm_add_ps(fiz2,tz);
1771
1772             fjx1             = _mm_add_ps(fjx1,tx);
1773             fjy1             = _mm_add_ps(fjy1,ty);
1774             fjz1             = _mm_add_ps(fjz1,tz);
1775
1776             /**************************
1777              * CALCULATE INTERACTIONS *
1778              **************************/
1779
1780             r22              = _mm_mul_ps(rsq22,rinv22);
1781
1782             /* EWALD ELECTROSTATICS */
1783
1784             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1785             ewrt             = _mm_mul_ps(r22,ewtabscale);
1786             ewitab           = _mm_cvttps_epi32(ewrt);
1787             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1788             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1789                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1790                                          &ewtabF,&ewtabFn);
1791             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1792             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1793
1794             fscal            = felec;
1795
1796             /* Calculate temporary vectorial force */
1797             tx               = _mm_mul_ps(fscal,dx22);
1798             ty               = _mm_mul_ps(fscal,dy22);
1799             tz               = _mm_mul_ps(fscal,dz22);
1800
1801             /* Update vectorial force */
1802             fix2             = _mm_add_ps(fix2,tx);
1803             fiy2             = _mm_add_ps(fiy2,ty);
1804             fiz2             = _mm_add_ps(fiz2,tz);
1805
1806             fjx2             = _mm_add_ps(fjx2,tx);
1807             fjy2             = _mm_add_ps(fjy2,ty);
1808             fjz2             = _mm_add_ps(fjz2,tz);
1809
1810             fjptrA             = f+j_coord_offsetA;
1811             fjptrB             = f+j_coord_offsetB;
1812             fjptrC             = f+j_coord_offsetC;
1813             fjptrD             = f+j_coord_offsetD;
1814
1815             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1816                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1817
1818             /* Inner loop uses 331 flops */
1819         }
1820
1821         if(jidx<j_index_end)
1822         {
1823
1824             /* Get j neighbor index, and coordinate index */
1825             jnrlistA         = jjnr[jidx];
1826             jnrlistB         = jjnr[jidx+1];
1827             jnrlistC         = jjnr[jidx+2];
1828             jnrlistD         = jjnr[jidx+3];
1829             /* Sign of each element will be negative for non-real atoms.
1830              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1831              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1832              */
1833             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1834             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1835             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1836             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1837             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1838             j_coord_offsetA  = DIM*jnrA;
1839             j_coord_offsetB  = DIM*jnrB;
1840             j_coord_offsetC  = DIM*jnrC;
1841             j_coord_offsetD  = DIM*jnrD;
1842
1843             /* load j atom coordinates */
1844             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1845                                               x+j_coord_offsetC,x+j_coord_offsetD,
1846                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1847
1848             /* Calculate displacement vector */
1849             dx00             = _mm_sub_ps(ix0,jx0);
1850             dy00             = _mm_sub_ps(iy0,jy0);
1851             dz00             = _mm_sub_ps(iz0,jz0);
1852             dx01             = _mm_sub_ps(ix0,jx1);
1853             dy01             = _mm_sub_ps(iy0,jy1);
1854             dz01             = _mm_sub_ps(iz0,jz1);
1855             dx02             = _mm_sub_ps(ix0,jx2);
1856             dy02             = _mm_sub_ps(iy0,jy2);
1857             dz02             = _mm_sub_ps(iz0,jz2);
1858             dx10             = _mm_sub_ps(ix1,jx0);
1859             dy10             = _mm_sub_ps(iy1,jy0);
1860             dz10             = _mm_sub_ps(iz1,jz0);
1861             dx11             = _mm_sub_ps(ix1,jx1);
1862             dy11             = _mm_sub_ps(iy1,jy1);
1863             dz11             = _mm_sub_ps(iz1,jz1);
1864             dx12             = _mm_sub_ps(ix1,jx2);
1865             dy12             = _mm_sub_ps(iy1,jy2);
1866             dz12             = _mm_sub_ps(iz1,jz2);
1867             dx20             = _mm_sub_ps(ix2,jx0);
1868             dy20             = _mm_sub_ps(iy2,jy0);
1869             dz20             = _mm_sub_ps(iz2,jz0);
1870             dx21             = _mm_sub_ps(ix2,jx1);
1871             dy21             = _mm_sub_ps(iy2,jy1);
1872             dz21             = _mm_sub_ps(iz2,jz1);
1873             dx22             = _mm_sub_ps(ix2,jx2);
1874             dy22             = _mm_sub_ps(iy2,jy2);
1875             dz22             = _mm_sub_ps(iz2,jz2);
1876
1877             /* Calculate squared distance and things based on it */
1878             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1879             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1880             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1881             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1882             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1883             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1884             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1885             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1886             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1887
1888             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1889             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1890             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1891             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1892             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1893             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1894             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1895             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1896             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1897
1898             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1899             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1900             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1901             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1902             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1903             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1904             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1905             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1906             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1907
1908             fjx0             = _mm_setzero_ps();
1909             fjy0             = _mm_setzero_ps();
1910             fjz0             = _mm_setzero_ps();
1911             fjx1             = _mm_setzero_ps();
1912             fjy1             = _mm_setzero_ps();
1913             fjz1             = _mm_setzero_ps();
1914             fjx2             = _mm_setzero_ps();
1915             fjy2             = _mm_setzero_ps();
1916             fjz2             = _mm_setzero_ps();
1917
1918             /**************************
1919              * CALCULATE INTERACTIONS *
1920              **************************/
1921
1922             r00              = _mm_mul_ps(rsq00,rinv00);
1923             r00              = _mm_andnot_ps(dummy_mask,r00);
1924
1925             /* EWALD ELECTROSTATICS */
1926
1927             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1928             ewrt             = _mm_mul_ps(r00,ewtabscale);
1929             ewitab           = _mm_cvttps_epi32(ewrt);
1930             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1931             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1932                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1933                                          &ewtabF,&ewtabFn);
1934             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1935             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1936
1937             /* LENNARD-JONES DISPERSION/REPULSION */
1938
1939             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1940             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1941
1942             fscal            = _mm_add_ps(felec,fvdw);
1943
1944             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1945
1946             /* Calculate temporary vectorial force */
1947             tx               = _mm_mul_ps(fscal,dx00);
1948             ty               = _mm_mul_ps(fscal,dy00);
1949             tz               = _mm_mul_ps(fscal,dz00);
1950
1951             /* Update vectorial force */
1952             fix0             = _mm_add_ps(fix0,tx);
1953             fiy0             = _mm_add_ps(fiy0,ty);
1954             fiz0             = _mm_add_ps(fiz0,tz);
1955
1956             fjx0             = _mm_add_ps(fjx0,tx);
1957             fjy0             = _mm_add_ps(fjy0,ty);
1958             fjz0             = _mm_add_ps(fjz0,tz);
1959
1960             /**************************
1961              * CALCULATE INTERACTIONS *
1962              **************************/
1963
1964             r01              = _mm_mul_ps(rsq01,rinv01);
1965             r01              = _mm_andnot_ps(dummy_mask,r01);
1966
1967             /* EWALD ELECTROSTATICS */
1968
1969             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1970             ewrt             = _mm_mul_ps(r01,ewtabscale);
1971             ewitab           = _mm_cvttps_epi32(ewrt);
1972             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1973             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1974                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1975                                          &ewtabF,&ewtabFn);
1976             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1977             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1978
1979             fscal            = felec;
1980
1981             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1982
1983             /* Calculate temporary vectorial force */
1984             tx               = _mm_mul_ps(fscal,dx01);
1985             ty               = _mm_mul_ps(fscal,dy01);
1986             tz               = _mm_mul_ps(fscal,dz01);
1987
1988             /* Update vectorial force */
1989             fix0             = _mm_add_ps(fix0,tx);
1990             fiy0             = _mm_add_ps(fiy0,ty);
1991             fiz0             = _mm_add_ps(fiz0,tz);
1992
1993             fjx1             = _mm_add_ps(fjx1,tx);
1994             fjy1             = _mm_add_ps(fjy1,ty);
1995             fjz1             = _mm_add_ps(fjz1,tz);
1996
1997             /**************************
1998              * CALCULATE INTERACTIONS *
1999              **************************/
2000
2001             r02              = _mm_mul_ps(rsq02,rinv02);
2002             r02              = _mm_andnot_ps(dummy_mask,r02);
2003
2004             /* EWALD ELECTROSTATICS */
2005
2006             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2007             ewrt             = _mm_mul_ps(r02,ewtabscale);
2008             ewitab           = _mm_cvttps_epi32(ewrt);
2009             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2010             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2011                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2012                                          &ewtabF,&ewtabFn);
2013             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2014             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2015
2016             fscal            = felec;
2017
2018             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2019
2020             /* Calculate temporary vectorial force */
2021             tx               = _mm_mul_ps(fscal,dx02);
2022             ty               = _mm_mul_ps(fscal,dy02);
2023             tz               = _mm_mul_ps(fscal,dz02);
2024
2025             /* Update vectorial force */
2026             fix0             = _mm_add_ps(fix0,tx);
2027             fiy0             = _mm_add_ps(fiy0,ty);
2028             fiz0             = _mm_add_ps(fiz0,tz);
2029
2030             fjx2             = _mm_add_ps(fjx2,tx);
2031             fjy2             = _mm_add_ps(fjy2,ty);
2032             fjz2             = _mm_add_ps(fjz2,tz);
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             r10              = _mm_mul_ps(rsq10,rinv10);
2039             r10              = _mm_andnot_ps(dummy_mask,r10);
2040
2041             /* EWALD ELECTROSTATICS */
2042
2043             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044             ewrt             = _mm_mul_ps(r10,ewtabscale);
2045             ewitab           = _mm_cvttps_epi32(ewrt);
2046             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2047             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2048                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2049                                          &ewtabF,&ewtabFn);
2050             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2051             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2052
2053             fscal            = felec;
2054
2055             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2056
2057             /* Calculate temporary vectorial force */
2058             tx               = _mm_mul_ps(fscal,dx10);
2059             ty               = _mm_mul_ps(fscal,dy10);
2060             tz               = _mm_mul_ps(fscal,dz10);
2061
2062             /* Update vectorial force */
2063             fix1             = _mm_add_ps(fix1,tx);
2064             fiy1             = _mm_add_ps(fiy1,ty);
2065             fiz1             = _mm_add_ps(fiz1,tz);
2066
2067             fjx0             = _mm_add_ps(fjx0,tx);
2068             fjy0             = _mm_add_ps(fjy0,ty);
2069             fjz0             = _mm_add_ps(fjz0,tz);
2070
2071             /**************************
2072              * CALCULATE INTERACTIONS *
2073              **************************/
2074
2075             r11              = _mm_mul_ps(rsq11,rinv11);
2076             r11              = _mm_andnot_ps(dummy_mask,r11);
2077
2078             /* EWALD ELECTROSTATICS */
2079
2080             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2081             ewrt             = _mm_mul_ps(r11,ewtabscale);
2082             ewitab           = _mm_cvttps_epi32(ewrt);
2083             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2084             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2085                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2086                                          &ewtabF,&ewtabFn);
2087             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2088             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2089
2090             fscal            = felec;
2091
2092             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2093
2094             /* Calculate temporary vectorial force */
2095             tx               = _mm_mul_ps(fscal,dx11);
2096             ty               = _mm_mul_ps(fscal,dy11);
2097             tz               = _mm_mul_ps(fscal,dz11);
2098
2099             /* Update vectorial force */
2100             fix1             = _mm_add_ps(fix1,tx);
2101             fiy1             = _mm_add_ps(fiy1,ty);
2102             fiz1             = _mm_add_ps(fiz1,tz);
2103
2104             fjx1             = _mm_add_ps(fjx1,tx);
2105             fjy1             = _mm_add_ps(fjy1,ty);
2106             fjz1             = _mm_add_ps(fjz1,tz);
2107
2108             /**************************
2109              * CALCULATE INTERACTIONS *
2110              **************************/
2111
2112             r12              = _mm_mul_ps(rsq12,rinv12);
2113             r12              = _mm_andnot_ps(dummy_mask,r12);
2114
2115             /* EWALD ELECTROSTATICS */
2116
2117             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2118             ewrt             = _mm_mul_ps(r12,ewtabscale);
2119             ewitab           = _mm_cvttps_epi32(ewrt);
2120             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2121             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2122                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2123                                          &ewtabF,&ewtabFn);
2124             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2125             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2126
2127             fscal            = felec;
2128
2129             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2130
2131             /* Calculate temporary vectorial force */
2132             tx               = _mm_mul_ps(fscal,dx12);
2133             ty               = _mm_mul_ps(fscal,dy12);
2134             tz               = _mm_mul_ps(fscal,dz12);
2135
2136             /* Update vectorial force */
2137             fix1             = _mm_add_ps(fix1,tx);
2138             fiy1             = _mm_add_ps(fiy1,ty);
2139             fiz1             = _mm_add_ps(fiz1,tz);
2140
2141             fjx2             = _mm_add_ps(fjx2,tx);
2142             fjy2             = _mm_add_ps(fjy2,ty);
2143             fjz2             = _mm_add_ps(fjz2,tz);
2144
2145             /**************************
2146              * CALCULATE INTERACTIONS *
2147              **************************/
2148
2149             r20              = _mm_mul_ps(rsq20,rinv20);
2150             r20              = _mm_andnot_ps(dummy_mask,r20);
2151
2152             /* EWALD ELECTROSTATICS */
2153
2154             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2155             ewrt             = _mm_mul_ps(r20,ewtabscale);
2156             ewitab           = _mm_cvttps_epi32(ewrt);
2157             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2158             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2159                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2160                                          &ewtabF,&ewtabFn);
2161             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2162             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2163
2164             fscal            = felec;
2165
2166             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2167
2168             /* Calculate temporary vectorial force */
2169             tx               = _mm_mul_ps(fscal,dx20);
2170             ty               = _mm_mul_ps(fscal,dy20);
2171             tz               = _mm_mul_ps(fscal,dz20);
2172
2173             /* Update vectorial force */
2174             fix2             = _mm_add_ps(fix2,tx);
2175             fiy2             = _mm_add_ps(fiy2,ty);
2176             fiz2             = _mm_add_ps(fiz2,tz);
2177
2178             fjx0             = _mm_add_ps(fjx0,tx);
2179             fjy0             = _mm_add_ps(fjy0,ty);
2180             fjz0             = _mm_add_ps(fjz0,tz);
2181
2182             /**************************
2183              * CALCULATE INTERACTIONS *
2184              **************************/
2185
2186             r21              = _mm_mul_ps(rsq21,rinv21);
2187             r21              = _mm_andnot_ps(dummy_mask,r21);
2188
2189             /* EWALD ELECTROSTATICS */
2190
2191             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2192             ewrt             = _mm_mul_ps(r21,ewtabscale);
2193             ewitab           = _mm_cvttps_epi32(ewrt);
2194             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2195             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2196                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2197                                          &ewtabF,&ewtabFn);
2198             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2199             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2200
2201             fscal            = felec;
2202
2203             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2204
2205             /* Calculate temporary vectorial force */
2206             tx               = _mm_mul_ps(fscal,dx21);
2207             ty               = _mm_mul_ps(fscal,dy21);
2208             tz               = _mm_mul_ps(fscal,dz21);
2209
2210             /* Update vectorial force */
2211             fix2             = _mm_add_ps(fix2,tx);
2212             fiy2             = _mm_add_ps(fiy2,ty);
2213             fiz2             = _mm_add_ps(fiz2,tz);
2214
2215             fjx1             = _mm_add_ps(fjx1,tx);
2216             fjy1             = _mm_add_ps(fjy1,ty);
2217             fjz1             = _mm_add_ps(fjz1,tz);
2218
2219             /**************************
2220              * CALCULATE INTERACTIONS *
2221              **************************/
2222
2223             r22              = _mm_mul_ps(rsq22,rinv22);
2224             r22              = _mm_andnot_ps(dummy_mask,r22);
2225
2226             /* EWALD ELECTROSTATICS */
2227
2228             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2229             ewrt             = _mm_mul_ps(r22,ewtabscale);
2230             ewitab           = _mm_cvttps_epi32(ewrt);
2231             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2232             gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2233                                          ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2234                                          &ewtabF,&ewtabFn);
2235             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2236             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2237
2238             fscal            = felec;
2239
2240             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2241
2242             /* Calculate temporary vectorial force */
2243             tx               = _mm_mul_ps(fscal,dx22);
2244             ty               = _mm_mul_ps(fscal,dy22);
2245             tz               = _mm_mul_ps(fscal,dz22);
2246
2247             /* Update vectorial force */
2248             fix2             = _mm_add_ps(fix2,tx);
2249             fiy2             = _mm_add_ps(fiy2,ty);
2250             fiz2             = _mm_add_ps(fiz2,tz);
2251
2252             fjx2             = _mm_add_ps(fjx2,tx);
2253             fjy2             = _mm_add_ps(fjy2,ty);
2254             fjz2             = _mm_add_ps(fjz2,tz);
2255
2256             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2257             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2258             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2259             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2260
2261             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2262                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2263
2264             /* Inner loop uses 340 flops */
2265         }
2266
2267         /* End of innermost loop */
2268
2269         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2270                                               f+i_coord_offset,fshift+i_shift_offset);
2271
2272         /* Increment number of inner iterations */
2273         inneriter                  += j_index_end - j_index_start;
2274
2275         /* Outer loop uses 18 flops */
2276     }
2277
2278     /* Increment number of outer iterations */
2279     outeriter        += nri;
2280
2281     /* Update outer/inner flops */
2282
2283     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);
2284 }