Merge 'release-4-6' into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
75     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
76     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
77     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
78     real             *charge;
79     int              nvdwtype;
80     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
81     int              *vdwtype;
82     real             *vdwparam;
83     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
84     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
85     __m128i          ewitab;
86     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
87     real             *ewtab;
88     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
89     real             rswitch_scalar,d_scalar;
90     __m128           dummy_mask,cutoff_mask;
91     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
92     __m128           one     = _mm_set1_ps(1.0);
93     __m128           two     = _mm_set1_ps(2.0);
94     x                = xx[0];
95     f                = ff[0];
96
97     nri              = nlist->nri;
98     iinr             = nlist->iinr;
99     jindex           = nlist->jindex;
100     jjnr             = nlist->jjnr;
101     shiftidx         = nlist->shift;
102     gid              = nlist->gid;
103     shiftvec         = fr->shift_vec[0];
104     fshift           = fr->fshift[0];
105     facel            = _mm_set1_ps(fr->epsfac);
106     charge           = mdatoms->chargeA;
107     nvdwtype         = fr->ntype;
108     vdwparam         = fr->nbfp;
109     vdwtype          = mdatoms->typeA;
110
111     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
112     ewtab            = fr->ic->tabq_coul_FDV0;
113     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
114     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
115
116     /* Setup water-specific parameters */
117     inr              = nlist->iinr[0];
118     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
119     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
120     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
121     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
122
123     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
124     rcutoff_scalar   = fr->rcoulomb;
125     rcutoff          = _mm_set1_ps(rcutoff_scalar);
126     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
127
128     rswitch_scalar   = fr->rcoulomb_switch;
129     rswitch          = _mm_set1_ps(rswitch_scalar);
130     /* Setup switch parameters */
131     d_scalar         = rcutoff_scalar-rswitch_scalar;
132     d                = _mm_set1_ps(d_scalar);
133     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
134     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
135     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
136     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
137     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
138     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
139
140     /* Avoid stupid compiler warnings */
141     jnrA = jnrB = jnrC = jnrD = 0;
142     j_coord_offsetA = 0;
143     j_coord_offsetB = 0;
144     j_coord_offsetC = 0;
145     j_coord_offsetD = 0;
146
147     outeriter        = 0;
148     inneriter        = 0;
149
150     /* Start outer loop over neighborlists */
151     for(iidx=0; iidx<nri; iidx++)
152     {
153         /* Load shift vector for this list */
154         i_shift_offset   = DIM*shiftidx[iidx];
155         shX              = shiftvec[i_shift_offset+XX];
156         shY              = shiftvec[i_shift_offset+YY];
157         shZ              = shiftvec[i_shift_offset+ZZ];
158
159         /* Load limits for loop over neighbors */
160         j_index_start    = jindex[iidx];
161         j_index_end      = jindex[iidx+1];
162
163         /* Get outer coordinate index */
164         inr              = iinr[iidx];
165         i_coord_offset   = DIM*inr;
166
167         /* Load i particle coords and add shift vector */
168         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
169         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
170         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
171         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
172         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
173         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
174         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
175         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
176         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
177
178         fix0             = _mm_setzero_ps();
179         fiy0             = _mm_setzero_ps();
180         fiz0             = _mm_setzero_ps();
181         fix1             = _mm_setzero_ps();
182         fiy1             = _mm_setzero_ps();
183         fiz1             = _mm_setzero_ps();
184         fix2             = _mm_setzero_ps();
185         fiy2             = _mm_setzero_ps();
186         fiz2             = _mm_setzero_ps();
187
188         /* Reset potential sums */
189         velecsum         = _mm_setzero_ps();
190         vvdwsum          = _mm_setzero_ps();
191
192         /* Start inner kernel loop */
193         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
194         {
195
196             /* Get j neighbor index, and coordinate index */
197             jnrA             = jjnr[jidx];
198             jnrB             = jjnr[jidx+1];
199             jnrC             = jjnr[jidx+2];
200             jnrD             = jjnr[jidx+3];
201
202             j_coord_offsetA  = DIM*jnrA;
203             j_coord_offsetB  = DIM*jnrB;
204             j_coord_offsetC  = DIM*jnrC;
205             j_coord_offsetD  = DIM*jnrD;
206
207             /* load j atom coordinates */
208             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
209                                               x+j_coord_offsetC,x+j_coord_offsetD,
210                                               &jx0,&jy0,&jz0);
211
212             /* Calculate displacement vector */
213             dx00             = _mm_sub_ps(ix0,jx0);
214             dy00             = _mm_sub_ps(iy0,jy0);
215             dz00             = _mm_sub_ps(iz0,jz0);
216             dx10             = _mm_sub_ps(ix1,jx0);
217             dy10             = _mm_sub_ps(iy1,jy0);
218             dz10             = _mm_sub_ps(iz1,jz0);
219             dx20             = _mm_sub_ps(ix2,jx0);
220             dy20             = _mm_sub_ps(iy2,jy0);
221             dz20             = _mm_sub_ps(iz2,jz0);
222
223             /* Calculate squared distance and things based on it */
224             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
225             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
226             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
227
228             rinv00           = gmx_mm_invsqrt_ps(rsq00);
229             rinv10           = gmx_mm_invsqrt_ps(rsq10);
230             rinv20           = gmx_mm_invsqrt_ps(rsq20);
231
232             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
233             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
234             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
235
236             /* Load parameters for j particles */
237             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
238                                                               charge+jnrC+0,charge+jnrD+0);
239             vdwjidx0A        = 2*vdwtype[jnrA+0];
240             vdwjidx0B        = 2*vdwtype[jnrB+0];
241             vdwjidx0C        = 2*vdwtype[jnrC+0];
242             vdwjidx0D        = 2*vdwtype[jnrD+0];
243
244             /**************************
245              * CALCULATE INTERACTIONS *
246              **************************/
247
248             if (gmx_mm_any_lt(rsq00,rcutoff2))
249             {
250
251             r00              = _mm_mul_ps(rsq00,rinv00);
252
253             /* Compute parameters for interactions between i and j atoms */
254             qq00             = _mm_mul_ps(iq0,jq0);
255             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
256                                          vdwparam+vdwioffset0+vdwjidx0B,
257                                          vdwparam+vdwioffset0+vdwjidx0C,
258                                          vdwparam+vdwioffset0+vdwjidx0D,
259                                          &c6_00,&c12_00);
260
261             /* EWALD ELECTROSTATICS */
262
263             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
264             ewrt             = _mm_mul_ps(r00,ewtabscale);
265             ewitab           = _mm_cvttps_epi32(ewrt);
266             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
267             ewitab           = _mm_slli_epi32(ewitab,2);
268             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
269             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
270             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
271             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
272             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
273             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
274             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
275             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
276             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
277
278             /* LENNARD-JONES DISPERSION/REPULSION */
279
280             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
281             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
282             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
283             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
284             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
285
286             d                = _mm_sub_ps(r00,rswitch);
287             d                = _mm_max_ps(d,_mm_setzero_ps());
288             d2               = _mm_mul_ps(d,d);
289             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
290
291             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
292
293             /* Evaluate switch function */
294             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
295             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
296             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
297             velec            = _mm_mul_ps(velec,sw);
298             vvdw             = _mm_mul_ps(vvdw,sw);
299             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
300
301             /* Update potential sum for this i atom from the interaction with this j atom. */
302             velec            = _mm_and_ps(velec,cutoff_mask);
303             velecsum         = _mm_add_ps(velecsum,velec);
304             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
305             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
306
307             fscal            = _mm_add_ps(felec,fvdw);
308
309             fscal            = _mm_and_ps(fscal,cutoff_mask);
310
311             /* Calculate temporary vectorial force */
312             tx               = _mm_mul_ps(fscal,dx00);
313             ty               = _mm_mul_ps(fscal,dy00);
314             tz               = _mm_mul_ps(fscal,dz00);
315
316             /* Update vectorial force */
317             fix0             = _mm_add_ps(fix0,tx);
318             fiy0             = _mm_add_ps(fiy0,ty);
319             fiz0             = _mm_add_ps(fiz0,tz);
320
321             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
322                                                    f+j_coord_offsetC,f+j_coord_offsetD,
323                                                    tx,ty,tz);
324
325             }
326
327             /**************************
328              * CALCULATE INTERACTIONS *
329              **************************/
330
331             if (gmx_mm_any_lt(rsq10,rcutoff2))
332             {
333
334             r10              = _mm_mul_ps(rsq10,rinv10);
335
336             /* Compute parameters for interactions between i and j atoms */
337             qq10             = _mm_mul_ps(iq1,jq0);
338
339             /* EWALD ELECTROSTATICS */
340
341             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
342             ewrt             = _mm_mul_ps(r10,ewtabscale);
343             ewitab           = _mm_cvttps_epi32(ewrt);
344             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
345             ewitab           = _mm_slli_epi32(ewitab,2);
346             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
347             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
348             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
349             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
350             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
351             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
352             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
353             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
354             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
355
356             d                = _mm_sub_ps(r10,rswitch);
357             d                = _mm_max_ps(d,_mm_setzero_ps());
358             d2               = _mm_mul_ps(d,d);
359             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
360
361             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
362
363             /* Evaluate switch function */
364             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
365             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
366             velec            = _mm_mul_ps(velec,sw);
367             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
368
369             /* Update potential sum for this i atom from the interaction with this j atom. */
370             velec            = _mm_and_ps(velec,cutoff_mask);
371             velecsum         = _mm_add_ps(velecsum,velec);
372
373             fscal            = felec;
374
375             fscal            = _mm_and_ps(fscal,cutoff_mask);
376
377             /* Calculate temporary vectorial force */
378             tx               = _mm_mul_ps(fscal,dx10);
379             ty               = _mm_mul_ps(fscal,dy10);
380             tz               = _mm_mul_ps(fscal,dz10);
381
382             /* Update vectorial force */
383             fix1             = _mm_add_ps(fix1,tx);
384             fiy1             = _mm_add_ps(fiy1,ty);
385             fiz1             = _mm_add_ps(fiz1,tz);
386
387             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
388                                                    f+j_coord_offsetC,f+j_coord_offsetD,
389                                                    tx,ty,tz);
390
391             }
392
393             /**************************
394              * CALCULATE INTERACTIONS *
395              **************************/
396
397             if (gmx_mm_any_lt(rsq20,rcutoff2))
398             {
399
400             r20              = _mm_mul_ps(rsq20,rinv20);
401
402             /* Compute parameters for interactions between i and j atoms */
403             qq20             = _mm_mul_ps(iq2,jq0);
404
405             /* EWALD ELECTROSTATICS */
406
407             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408             ewrt             = _mm_mul_ps(r20,ewtabscale);
409             ewitab           = _mm_cvttps_epi32(ewrt);
410             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
411             ewitab           = _mm_slli_epi32(ewitab,2);
412             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
413             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
414             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
415             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
416             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
417             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
418             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
419             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
420             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
421
422             d                = _mm_sub_ps(r20,rswitch);
423             d                = _mm_max_ps(d,_mm_setzero_ps());
424             d2               = _mm_mul_ps(d,d);
425             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
426
427             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
428
429             /* Evaluate switch function */
430             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
431             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
432             velec            = _mm_mul_ps(velec,sw);
433             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velec            = _mm_and_ps(velec,cutoff_mask);
437             velecsum         = _mm_add_ps(velecsum,velec);
438
439             fscal            = felec;
440
441             fscal            = _mm_and_ps(fscal,cutoff_mask);
442
443             /* Calculate temporary vectorial force */
444             tx               = _mm_mul_ps(fscal,dx20);
445             ty               = _mm_mul_ps(fscal,dy20);
446             tz               = _mm_mul_ps(fscal,dz20);
447
448             /* Update vectorial force */
449             fix2             = _mm_add_ps(fix2,tx);
450             fiy2             = _mm_add_ps(fiy2,ty);
451             fiz2             = _mm_add_ps(fiz2,tz);
452
453             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
454                                                    f+j_coord_offsetC,f+j_coord_offsetD,
455                                                    tx,ty,tz);
456
457             }
458
459             /* Inner loop uses 213 flops */
460         }
461
462         if(jidx<j_index_end)
463         {
464
465             /* Get j neighbor index, and coordinate index */
466             jnrA             = jjnr[jidx];
467             jnrB             = jjnr[jidx+1];
468             jnrC             = jjnr[jidx+2];
469             jnrD             = jjnr[jidx+3];
470
471             /* Sign of each element will be negative for non-real atoms.
472              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
473              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
474              */
475             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
476             jnrA       = (jnrA>=0) ? jnrA : 0;
477             jnrB       = (jnrB>=0) ? jnrB : 0;
478             jnrC       = (jnrC>=0) ? jnrC : 0;
479             jnrD       = (jnrD>=0) ? jnrD : 0;
480
481             j_coord_offsetA  = DIM*jnrA;
482             j_coord_offsetB  = DIM*jnrB;
483             j_coord_offsetC  = DIM*jnrC;
484             j_coord_offsetD  = DIM*jnrD;
485
486             /* load j atom coordinates */
487             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
488                                               x+j_coord_offsetC,x+j_coord_offsetD,
489                                               &jx0,&jy0,&jz0);
490
491             /* Calculate displacement vector */
492             dx00             = _mm_sub_ps(ix0,jx0);
493             dy00             = _mm_sub_ps(iy0,jy0);
494             dz00             = _mm_sub_ps(iz0,jz0);
495             dx10             = _mm_sub_ps(ix1,jx0);
496             dy10             = _mm_sub_ps(iy1,jy0);
497             dz10             = _mm_sub_ps(iz1,jz0);
498             dx20             = _mm_sub_ps(ix2,jx0);
499             dy20             = _mm_sub_ps(iy2,jy0);
500             dz20             = _mm_sub_ps(iz2,jz0);
501
502             /* Calculate squared distance and things based on it */
503             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
504             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
505             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
506
507             rinv00           = gmx_mm_invsqrt_ps(rsq00);
508             rinv10           = gmx_mm_invsqrt_ps(rsq10);
509             rinv20           = gmx_mm_invsqrt_ps(rsq20);
510
511             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
512             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
513             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
514
515             /* Load parameters for j particles */
516             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
517                                                               charge+jnrC+0,charge+jnrD+0);
518             vdwjidx0A        = 2*vdwtype[jnrA+0];
519             vdwjidx0B        = 2*vdwtype[jnrB+0];
520             vdwjidx0C        = 2*vdwtype[jnrC+0];
521             vdwjidx0D        = 2*vdwtype[jnrD+0];
522
523             /**************************
524              * CALCULATE INTERACTIONS *
525              **************************/
526
527             if (gmx_mm_any_lt(rsq00,rcutoff2))
528             {
529
530             r00              = _mm_mul_ps(rsq00,rinv00);
531             r00              = _mm_andnot_ps(dummy_mask,r00);
532
533             /* Compute parameters for interactions between i and j atoms */
534             qq00             = _mm_mul_ps(iq0,jq0);
535             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
536                                          vdwparam+vdwioffset0+vdwjidx0B,
537                                          vdwparam+vdwioffset0+vdwjidx0C,
538                                          vdwparam+vdwioffset0+vdwjidx0D,
539                                          &c6_00,&c12_00);
540
541             /* EWALD ELECTROSTATICS */
542
543             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
544             ewrt             = _mm_mul_ps(r00,ewtabscale);
545             ewitab           = _mm_cvttps_epi32(ewrt);
546             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
547             ewitab           = _mm_slli_epi32(ewitab,2);
548             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
549             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
550             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
551             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
552             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
553             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
554             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
555             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
556             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
557
558             /* LENNARD-JONES DISPERSION/REPULSION */
559
560             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
561             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
562             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
563             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
564             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
565
566             d                = _mm_sub_ps(r00,rswitch);
567             d                = _mm_max_ps(d,_mm_setzero_ps());
568             d2               = _mm_mul_ps(d,d);
569             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
570
571             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
572
573             /* Evaluate switch function */
574             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
575             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
576             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
577             velec            = _mm_mul_ps(velec,sw);
578             vvdw             = _mm_mul_ps(vvdw,sw);
579             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
580
581             /* Update potential sum for this i atom from the interaction with this j atom. */
582             velec            = _mm_and_ps(velec,cutoff_mask);
583             velec            = _mm_andnot_ps(dummy_mask,velec);
584             velecsum         = _mm_add_ps(velecsum,velec);
585             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
586             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
587             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
588
589             fscal            = _mm_add_ps(felec,fvdw);
590
591             fscal            = _mm_and_ps(fscal,cutoff_mask);
592
593             fscal            = _mm_andnot_ps(dummy_mask,fscal);
594
595             /* Calculate temporary vectorial force */
596             tx               = _mm_mul_ps(fscal,dx00);
597             ty               = _mm_mul_ps(fscal,dy00);
598             tz               = _mm_mul_ps(fscal,dz00);
599
600             /* Update vectorial force */
601             fix0             = _mm_add_ps(fix0,tx);
602             fiy0             = _mm_add_ps(fiy0,ty);
603             fiz0             = _mm_add_ps(fiz0,tz);
604
605             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
606                                                    f+j_coord_offsetC,f+j_coord_offsetD,
607                                                    tx,ty,tz);
608
609             }
610
611             /**************************
612              * CALCULATE INTERACTIONS *
613              **************************/
614
615             if (gmx_mm_any_lt(rsq10,rcutoff2))
616             {
617
618             r10              = _mm_mul_ps(rsq10,rinv10);
619             r10              = _mm_andnot_ps(dummy_mask,r10);
620
621             /* Compute parameters for interactions between i and j atoms */
622             qq10             = _mm_mul_ps(iq1,jq0);
623
624             /* EWALD ELECTROSTATICS */
625
626             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
627             ewrt             = _mm_mul_ps(r10,ewtabscale);
628             ewitab           = _mm_cvttps_epi32(ewrt);
629             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
630             ewitab           = _mm_slli_epi32(ewitab,2);
631             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
632             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
633             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
634             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
635             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
636             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
637             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
638             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
639             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
640
641             d                = _mm_sub_ps(r10,rswitch);
642             d                = _mm_max_ps(d,_mm_setzero_ps());
643             d2               = _mm_mul_ps(d,d);
644             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
645
646             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
647
648             /* Evaluate switch function */
649             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
650             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
651             velec            = _mm_mul_ps(velec,sw);
652             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velec            = _mm_and_ps(velec,cutoff_mask);
656             velec            = _mm_andnot_ps(dummy_mask,velec);
657             velecsum         = _mm_add_ps(velecsum,velec);
658
659             fscal            = felec;
660
661             fscal            = _mm_and_ps(fscal,cutoff_mask);
662
663             fscal            = _mm_andnot_ps(dummy_mask,fscal);
664
665             /* Calculate temporary vectorial force */
666             tx               = _mm_mul_ps(fscal,dx10);
667             ty               = _mm_mul_ps(fscal,dy10);
668             tz               = _mm_mul_ps(fscal,dz10);
669
670             /* Update vectorial force */
671             fix1             = _mm_add_ps(fix1,tx);
672             fiy1             = _mm_add_ps(fiy1,ty);
673             fiz1             = _mm_add_ps(fiz1,tz);
674
675             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
676                                                    f+j_coord_offsetC,f+j_coord_offsetD,
677                                                    tx,ty,tz);
678
679             }
680
681             /**************************
682              * CALCULATE INTERACTIONS *
683              **************************/
684
685             if (gmx_mm_any_lt(rsq20,rcutoff2))
686             {
687
688             r20              = _mm_mul_ps(rsq20,rinv20);
689             r20              = _mm_andnot_ps(dummy_mask,r20);
690
691             /* Compute parameters for interactions between i and j atoms */
692             qq20             = _mm_mul_ps(iq2,jq0);
693
694             /* EWALD ELECTROSTATICS */
695
696             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
697             ewrt             = _mm_mul_ps(r20,ewtabscale);
698             ewitab           = _mm_cvttps_epi32(ewrt);
699             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
700             ewitab           = _mm_slli_epi32(ewitab,2);
701             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
702             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
703             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
704             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
705             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
706             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
707             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
708             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
709             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
710
711             d                = _mm_sub_ps(r20,rswitch);
712             d                = _mm_max_ps(d,_mm_setzero_ps());
713             d2               = _mm_mul_ps(d,d);
714             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
715
716             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
717
718             /* Evaluate switch function */
719             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
720             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
721             velec            = _mm_mul_ps(velec,sw);
722             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
723
724             /* Update potential sum for this i atom from the interaction with this j atom. */
725             velec            = _mm_and_ps(velec,cutoff_mask);
726             velec            = _mm_andnot_ps(dummy_mask,velec);
727             velecsum         = _mm_add_ps(velecsum,velec);
728
729             fscal            = felec;
730
731             fscal            = _mm_and_ps(fscal,cutoff_mask);
732
733             fscal            = _mm_andnot_ps(dummy_mask,fscal);
734
735             /* Calculate temporary vectorial force */
736             tx               = _mm_mul_ps(fscal,dx20);
737             ty               = _mm_mul_ps(fscal,dy20);
738             tz               = _mm_mul_ps(fscal,dz20);
739
740             /* Update vectorial force */
741             fix2             = _mm_add_ps(fix2,tx);
742             fiy2             = _mm_add_ps(fiy2,ty);
743             fiz2             = _mm_add_ps(fiz2,tz);
744
745             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
746                                                    f+j_coord_offsetC,f+j_coord_offsetD,
747                                                    tx,ty,tz);
748
749             }
750
751             /* Inner loop uses 216 flops */
752         }
753
754         /* End of innermost loop */
755
756         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
757                                               f+i_coord_offset,fshift+i_shift_offset);
758
759         ggid                        = gid[iidx];
760         /* Update potential energies */
761         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
762         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
763
764         /* Increment number of inner iterations */
765         inneriter                  += j_index_end - j_index_start;
766
767         /* Outer loop uses 29 flops */
768     }
769
770     /* Increment number of outer iterations */
771     outeriter        += nri;
772
773     /* Update outer/inner flops */
774
775     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*216);
776 }
777 /*
778  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
779  * Electrostatics interaction: Ewald
780  * VdW interaction:            LennardJones
781  * Geometry:                   Water3-Particle
782  * Calculate force/pot:        Force
783  */
784 void
785 nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
786                     (t_nblist * gmx_restrict                nlist,
787                      rvec * gmx_restrict                    xx,
788                      rvec * gmx_restrict                    ff,
789                      t_forcerec * gmx_restrict              fr,
790                      t_mdatoms * gmx_restrict               mdatoms,
791                      nb_kernel_data_t * gmx_restrict        kernel_data,
792                      t_nrnb * gmx_restrict                  nrnb)
793 {
794     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
795      * just 0 for non-waters.
796      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
797      * jnr indices corresponding to data put in the four positions in the SIMD register.
798      */
799     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
800     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
801     int              jnrA,jnrB,jnrC,jnrD;
802     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
803     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
804     real             shX,shY,shZ,rcutoff_scalar;
805     real             *shiftvec,*fshift,*x,*f;
806     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
807     int              vdwioffset0;
808     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
809     int              vdwioffset1;
810     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
811     int              vdwioffset2;
812     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
813     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
814     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
815     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
816     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
817     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
818     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
819     real             *charge;
820     int              nvdwtype;
821     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
822     int              *vdwtype;
823     real             *vdwparam;
824     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
825     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
826     __m128i          ewitab;
827     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
828     real             *ewtab;
829     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
830     real             rswitch_scalar,d_scalar;
831     __m128           dummy_mask,cutoff_mask;
832     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
833     __m128           one     = _mm_set1_ps(1.0);
834     __m128           two     = _mm_set1_ps(2.0);
835     x                = xx[0];
836     f                = ff[0];
837
838     nri              = nlist->nri;
839     iinr             = nlist->iinr;
840     jindex           = nlist->jindex;
841     jjnr             = nlist->jjnr;
842     shiftidx         = nlist->shift;
843     gid              = nlist->gid;
844     shiftvec         = fr->shift_vec[0];
845     fshift           = fr->fshift[0];
846     facel            = _mm_set1_ps(fr->epsfac);
847     charge           = mdatoms->chargeA;
848     nvdwtype         = fr->ntype;
849     vdwparam         = fr->nbfp;
850     vdwtype          = mdatoms->typeA;
851
852     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
853     ewtab            = fr->ic->tabq_coul_FDV0;
854     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
855     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
856
857     /* Setup water-specific parameters */
858     inr              = nlist->iinr[0];
859     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
860     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
861     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
862     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
863
864     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
865     rcutoff_scalar   = fr->rcoulomb;
866     rcutoff          = _mm_set1_ps(rcutoff_scalar);
867     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
868
869     rswitch_scalar   = fr->rcoulomb_switch;
870     rswitch          = _mm_set1_ps(rswitch_scalar);
871     /* Setup switch parameters */
872     d_scalar         = rcutoff_scalar-rswitch_scalar;
873     d                = _mm_set1_ps(d_scalar);
874     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
875     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
876     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
877     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
878     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
879     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
880
881     /* Avoid stupid compiler warnings */
882     jnrA = jnrB = jnrC = jnrD = 0;
883     j_coord_offsetA = 0;
884     j_coord_offsetB = 0;
885     j_coord_offsetC = 0;
886     j_coord_offsetD = 0;
887
888     outeriter        = 0;
889     inneriter        = 0;
890
891     /* Start outer loop over neighborlists */
892     for(iidx=0; iidx<nri; iidx++)
893     {
894         /* Load shift vector for this list */
895         i_shift_offset   = DIM*shiftidx[iidx];
896         shX              = shiftvec[i_shift_offset+XX];
897         shY              = shiftvec[i_shift_offset+YY];
898         shZ              = shiftvec[i_shift_offset+ZZ];
899
900         /* Load limits for loop over neighbors */
901         j_index_start    = jindex[iidx];
902         j_index_end      = jindex[iidx+1];
903
904         /* Get outer coordinate index */
905         inr              = iinr[iidx];
906         i_coord_offset   = DIM*inr;
907
908         /* Load i particle coords and add shift vector */
909         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
910         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
911         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
912         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
913         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
914         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
915         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
916         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
917         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
918
919         fix0             = _mm_setzero_ps();
920         fiy0             = _mm_setzero_ps();
921         fiz0             = _mm_setzero_ps();
922         fix1             = _mm_setzero_ps();
923         fiy1             = _mm_setzero_ps();
924         fiz1             = _mm_setzero_ps();
925         fix2             = _mm_setzero_ps();
926         fiy2             = _mm_setzero_ps();
927         fiz2             = _mm_setzero_ps();
928
929         /* Start inner kernel loop */
930         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
931         {
932
933             /* Get j neighbor index, and coordinate index */
934             jnrA             = jjnr[jidx];
935             jnrB             = jjnr[jidx+1];
936             jnrC             = jjnr[jidx+2];
937             jnrD             = jjnr[jidx+3];
938
939             j_coord_offsetA  = DIM*jnrA;
940             j_coord_offsetB  = DIM*jnrB;
941             j_coord_offsetC  = DIM*jnrC;
942             j_coord_offsetD  = DIM*jnrD;
943
944             /* load j atom coordinates */
945             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
946                                               x+j_coord_offsetC,x+j_coord_offsetD,
947                                               &jx0,&jy0,&jz0);
948
949             /* Calculate displacement vector */
950             dx00             = _mm_sub_ps(ix0,jx0);
951             dy00             = _mm_sub_ps(iy0,jy0);
952             dz00             = _mm_sub_ps(iz0,jz0);
953             dx10             = _mm_sub_ps(ix1,jx0);
954             dy10             = _mm_sub_ps(iy1,jy0);
955             dz10             = _mm_sub_ps(iz1,jz0);
956             dx20             = _mm_sub_ps(ix2,jx0);
957             dy20             = _mm_sub_ps(iy2,jy0);
958             dz20             = _mm_sub_ps(iz2,jz0);
959
960             /* Calculate squared distance and things based on it */
961             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
962             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
963             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
964
965             rinv00           = gmx_mm_invsqrt_ps(rsq00);
966             rinv10           = gmx_mm_invsqrt_ps(rsq10);
967             rinv20           = gmx_mm_invsqrt_ps(rsq20);
968
969             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
970             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
971             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
972
973             /* Load parameters for j particles */
974             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
975                                                               charge+jnrC+0,charge+jnrD+0);
976             vdwjidx0A        = 2*vdwtype[jnrA+0];
977             vdwjidx0B        = 2*vdwtype[jnrB+0];
978             vdwjidx0C        = 2*vdwtype[jnrC+0];
979             vdwjidx0D        = 2*vdwtype[jnrD+0];
980
981             /**************************
982              * CALCULATE INTERACTIONS *
983              **************************/
984
985             if (gmx_mm_any_lt(rsq00,rcutoff2))
986             {
987
988             r00              = _mm_mul_ps(rsq00,rinv00);
989
990             /* Compute parameters for interactions between i and j atoms */
991             qq00             = _mm_mul_ps(iq0,jq0);
992             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
993                                          vdwparam+vdwioffset0+vdwjidx0B,
994                                          vdwparam+vdwioffset0+vdwjidx0C,
995                                          vdwparam+vdwioffset0+vdwjidx0D,
996                                          &c6_00,&c12_00);
997
998             /* EWALD ELECTROSTATICS */
999
1000             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1001             ewrt             = _mm_mul_ps(r00,ewtabscale);
1002             ewitab           = _mm_cvttps_epi32(ewrt);
1003             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1004             ewitab           = _mm_slli_epi32(ewitab,2);
1005             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1006             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1007             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1008             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1009             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1010             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1011             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1012             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1013             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1014
1015             /* LENNARD-JONES DISPERSION/REPULSION */
1016
1017             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1018             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1019             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1020             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1021             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1022
1023             d                = _mm_sub_ps(r00,rswitch);
1024             d                = _mm_max_ps(d,_mm_setzero_ps());
1025             d2               = _mm_mul_ps(d,d);
1026             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1027
1028             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1029
1030             /* Evaluate switch function */
1031             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1032             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1033             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1034             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1035
1036             fscal            = _mm_add_ps(felec,fvdw);
1037
1038             fscal            = _mm_and_ps(fscal,cutoff_mask);
1039
1040             /* Calculate temporary vectorial force */
1041             tx               = _mm_mul_ps(fscal,dx00);
1042             ty               = _mm_mul_ps(fscal,dy00);
1043             tz               = _mm_mul_ps(fscal,dz00);
1044
1045             /* Update vectorial force */
1046             fix0             = _mm_add_ps(fix0,tx);
1047             fiy0             = _mm_add_ps(fiy0,ty);
1048             fiz0             = _mm_add_ps(fiz0,tz);
1049
1050             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1051                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1052                                                    tx,ty,tz);
1053
1054             }
1055
1056             /**************************
1057              * CALCULATE INTERACTIONS *
1058              **************************/
1059
1060             if (gmx_mm_any_lt(rsq10,rcutoff2))
1061             {
1062
1063             r10              = _mm_mul_ps(rsq10,rinv10);
1064
1065             /* Compute parameters for interactions between i and j atoms */
1066             qq10             = _mm_mul_ps(iq1,jq0);
1067
1068             /* EWALD ELECTROSTATICS */
1069
1070             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1071             ewrt             = _mm_mul_ps(r10,ewtabscale);
1072             ewitab           = _mm_cvttps_epi32(ewrt);
1073             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1074             ewitab           = _mm_slli_epi32(ewitab,2);
1075             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1076             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1077             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1078             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1079             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1080             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1081             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1082             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1083             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1084
1085             d                = _mm_sub_ps(r10,rswitch);
1086             d                = _mm_max_ps(d,_mm_setzero_ps());
1087             d2               = _mm_mul_ps(d,d);
1088             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1089
1090             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1091
1092             /* Evaluate switch function */
1093             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1094             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1095             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1096
1097             fscal            = felec;
1098
1099             fscal            = _mm_and_ps(fscal,cutoff_mask);
1100
1101             /* Calculate temporary vectorial force */
1102             tx               = _mm_mul_ps(fscal,dx10);
1103             ty               = _mm_mul_ps(fscal,dy10);
1104             tz               = _mm_mul_ps(fscal,dz10);
1105
1106             /* Update vectorial force */
1107             fix1             = _mm_add_ps(fix1,tx);
1108             fiy1             = _mm_add_ps(fiy1,ty);
1109             fiz1             = _mm_add_ps(fiz1,tz);
1110
1111             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1112                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1113                                                    tx,ty,tz);
1114
1115             }
1116
1117             /**************************
1118              * CALCULATE INTERACTIONS *
1119              **************************/
1120
1121             if (gmx_mm_any_lt(rsq20,rcutoff2))
1122             {
1123
1124             r20              = _mm_mul_ps(rsq20,rinv20);
1125
1126             /* Compute parameters for interactions between i and j atoms */
1127             qq20             = _mm_mul_ps(iq2,jq0);
1128
1129             /* EWALD ELECTROSTATICS */
1130
1131             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1132             ewrt             = _mm_mul_ps(r20,ewtabscale);
1133             ewitab           = _mm_cvttps_epi32(ewrt);
1134             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1135             ewitab           = _mm_slli_epi32(ewitab,2);
1136             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1137             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1138             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1139             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1140             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1141             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1142             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1143             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1144             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1145
1146             d                = _mm_sub_ps(r20,rswitch);
1147             d                = _mm_max_ps(d,_mm_setzero_ps());
1148             d2               = _mm_mul_ps(d,d);
1149             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1150
1151             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1152
1153             /* Evaluate switch function */
1154             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1155             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1156             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1157
1158             fscal            = felec;
1159
1160             fscal            = _mm_and_ps(fscal,cutoff_mask);
1161
1162             /* Calculate temporary vectorial force */
1163             tx               = _mm_mul_ps(fscal,dx20);
1164             ty               = _mm_mul_ps(fscal,dy20);
1165             tz               = _mm_mul_ps(fscal,dz20);
1166
1167             /* Update vectorial force */
1168             fix2             = _mm_add_ps(fix2,tx);
1169             fiy2             = _mm_add_ps(fiy2,ty);
1170             fiz2             = _mm_add_ps(fiz2,tz);
1171
1172             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1173                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1174                                                    tx,ty,tz);
1175
1176             }
1177
1178             /* Inner loop uses 201 flops */
1179         }
1180
1181         if(jidx<j_index_end)
1182         {
1183
1184             /* Get j neighbor index, and coordinate index */
1185             jnrA             = jjnr[jidx];
1186             jnrB             = jjnr[jidx+1];
1187             jnrC             = jjnr[jidx+2];
1188             jnrD             = jjnr[jidx+3];
1189
1190             /* Sign of each element will be negative for non-real atoms.
1191              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1192              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1193              */
1194             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1195             jnrA       = (jnrA>=0) ? jnrA : 0;
1196             jnrB       = (jnrB>=0) ? jnrB : 0;
1197             jnrC       = (jnrC>=0) ? jnrC : 0;
1198             jnrD       = (jnrD>=0) ? jnrD : 0;
1199
1200             j_coord_offsetA  = DIM*jnrA;
1201             j_coord_offsetB  = DIM*jnrB;
1202             j_coord_offsetC  = DIM*jnrC;
1203             j_coord_offsetD  = DIM*jnrD;
1204
1205             /* load j atom coordinates */
1206             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1207                                               x+j_coord_offsetC,x+j_coord_offsetD,
1208                                               &jx0,&jy0,&jz0);
1209
1210             /* Calculate displacement vector */
1211             dx00             = _mm_sub_ps(ix0,jx0);
1212             dy00             = _mm_sub_ps(iy0,jy0);
1213             dz00             = _mm_sub_ps(iz0,jz0);
1214             dx10             = _mm_sub_ps(ix1,jx0);
1215             dy10             = _mm_sub_ps(iy1,jy0);
1216             dz10             = _mm_sub_ps(iz1,jz0);
1217             dx20             = _mm_sub_ps(ix2,jx0);
1218             dy20             = _mm_sub_ps(iy2,jy0);
1219             dz20             = _mm_sub_ps(iz2,jz0);
1220
1221             /* Calculate squared distance and things based on it */
1222             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1223             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1224             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1225
1226             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1227             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1228             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1229
1230             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1231             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1232             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1233
1234             /* Load parameters for j particles */
1235             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1236                                                               charge+jnrC+0,charge+jnrD+0);
1237             vdwjidx0A        = 2*vdwtype[jnrA+0];
1238             vdwjidx0B        = 2*vdwtype[jnrB+0];
1239             vdwjidx0C        = 2*vdwtype[jnrC+0];
1240             vdwjidx0D        = 2*vdwtype[jnrD+0];
1241
1242             /**************************
1243              * CALCULATE INTERACTIONS *
1244              **************************/
1245
1246             if (gmx_mm_any_lt(rsq00,rcutoff2))
1247             {
1248
1249             r00              = _mm_mul_ps(rsq00,rinv00);
1250             r00              = _mm_andnot_ps(dummy_mask,r00);
1251
1252             /* Compute parameters for interactions between i and j atoms */
1253             qq00             = _mm_mul_ps(iq0,jq0);
1254             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1255                                          vdwparam+vdwioffset0+vdwjidx0B,
1256                                          vdwparam+vdwioffset0+vdwjidx0C,
1257                                          vdwparam+vdwioffset0+vdwjidx0D,
1258                                          &c6_00,&c12_00);
1259
1260             /* EWALD ELECTROSTATICS */
1261
1262             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1263             ewrt             = _mm_mul_ps(r00,ewtabscale);
1264             ewitab           = _mm_cvttps_epi32(ewrt);
1265             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1266             ewitab           = _mm_slli_epi32(ewitab,2);
1267             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1268             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1269             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1270             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1271             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1272             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1273             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1274             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1275             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1276
1277             /* LENNARD-JONES DISPERSION/REPULSION */
1278
1279             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1280             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1281             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1282             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1283             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1284
1285             d                = _mm_sub_ps(r00,rswitch);
1286             d                = _mm_max_ps(d,_mm_setzero_ps());
1287             d2               = _mm_mul_ps(d,d);
1288             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1289
1290             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1291
1292             /* Evaluate switch function */
1293             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1294             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1295             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1296             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1297
1298             fscal            = _mm_add_ps(felec,fvdw);
1299
1300             fscal            = _mm_and_ps(fscal,cutoff_mask);
1301
1302             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1303
1304             /* Calculate temporary vectorial force */
1305             tx               = _mm_mul_ps(fscal,dx00);
1306             ty               = _mm_mul_ps(fscal,dy00);
1307             tz               = _mm_mul_ps(fscal,dz00);
1308
1309             /* Update vectorial force */
1310             fix0             = _mm_add_ps(fix0,tx);
1311             fiy0             = _mm_add_ps(fiy0,ty);
1312             fiz0             = _mm_add_ps(fiz0,tz);
1313
1314             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1315                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1316                                                    tx,ty,tz);
1317
1318             }
1319
1320             /**************************
1321              * CALCULATE INTERACTIONS *
1322              **************************/
1323
1324             if (gmx_mm_any_lt(rsq10,rcutoff2))
1325             {
1326
1327             r10              = _mm_mul_ps(rsq10,rinv10);
1328             r10              = _mm_andnot_ps(dummy_mask,r10);
1329
1330             /* Compute parameters for interactions between i and j atoms */
1331             qq10             = _mm_mul_ps(iq1,jq0);
1332
1333             /* EWALD ELECTROSTATICS */
1334
1335             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1336             ewrt             = _mm_mul_ps(r10,ewtabscale);
1337             ewitab           = _mm_cvttps_epi32(ewrt);
1338             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1339             ewitab           = _mm_slli_epi32(ewitab,2);
1340             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1341             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1342             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1343             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1344             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1345             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1346             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1347             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1348             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1349
1350             d                = _mm_sub_ps(r10,rswitch);
1351             d                = _mm_max_ps(d,_mm_setzero_ps());
1352             d2               = _mm_mul_ps(d,d);
1353             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1354
1355             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1356
1357             /* Evaluate switch function */
1358             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1359             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1360             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1361
1362             fscal            = felec;
1363
1364             fscal            = _mm_and_ps(fscal,cutoff_mask);
1365
1366             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1367
1368             /* Calculate temporary vectorial force */
1369             tx               = _mm_mul_ps(fscal,dx10);
1370             ty               = _mm_mul_ps(fscal,dy10);
1371             tz               = _mm_mul_ps(fscal,dz10);
1372
1373             /* Update vectorial force */
1374             fix1             = _mm_add_ps(fix1,tx);
1375             fiy1             = _mm_add_ps(fiy1,ty);
1376             fiz1             = _mm_add_ps(fiz1,tz);
1377
1378             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1379                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1380                                                    tx,ty,tz);
1381
1382             }
1383
1384             /**************************
1385              * CALCULATE INTERACTIONS *
1386              **************************/
1387
1388             if (gmx_mm_any_lt(rsq20,rcutoff2))
1389             {
1390
1391             r20              = _mm_mul_ps(rsq20,rinv20);
1392             r20              = _mm_andnot_ps(dummy_mask,r20);
1393
1394             /* Compute parameters for interactions between i and j atoms */
1395             qq20             = _mm_mul_ps(iq2,jq0);
1396
1397             /* EWALD ELECTROSTATICS */
1398
1399             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1400             ewrt             = _mm_mul_ps(r20,ewtabscale);
1401             ewitab           = _mm_cvttps_epi32(ewrt);
1402             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1403             ewitab           = _mm_slli_epi32(ewitab,2);
1404             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1405             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1406             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1407             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1408             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1409             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1410             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1411             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1412             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1413
1414             d                = _mm_sub_ps(r20,rswitch);
1415             d                = _mm_max_ps(d,_mm_setzero_ps());
1416             d2               = _mm_mul_ps(d,d);
1417             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1418
1419             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1420
1421             /* Evaluate switch function */
1422             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1423             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1424             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1425
1426             fscal            = felec;
1427
1428             fscal            = _mm_and_ps(fscal,cutoff_mask);
1429
1430             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1431
1432             /* Calculate temporary vectorial force */
1433             tx               = _mm_mul_ps(fscal,dx20);
1434             ty               = _mm_mul_ps(fscal,dy20);
1435             tz               = _mm_mul_ps(fscal,dz20);
1436
1437             /* Update vectorial force */
1438             fix2             = _mm_add_ps(fix2,tx);
1439             fiy2             = _mm_add_ps(fiy2,ty);
1440             fiz2             = _mm_add_ps(fiz2,tz);
1441
1442             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1443                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1444                                                    tx,ty,tz);
1445
1446             }
1447
1448             /* Inner loop uses 204 flops */
1449         }
1450
1451         /* End of innermost loop */
1452
1453         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1454                                               f+i_coord_offset,fshift+i_shift_offset);
1455
1456         /* Increment number of inner iterations */
1457         inneriter                  += j_index_end - j_index_start;
1458
1459         /* Outer loop uses 27 flops */
1460     }
1461
1462     /* Increment number of outer iterations */
1463     outeriter        += nri;
1464
1465     /* Update outer/inner flops */
1466
1467     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*204);
1468 }