Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse4_1_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
80     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
81     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
82     __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
83     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
84     real             *charge;
85     int              nvdwtype;
86     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
87     int              *vdwtype;
88     real             *vdwparam;
89     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
90     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
91     __m128i          ewitab;
92     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
93     real             *ewtab;
94     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
95     real             rswitch_scalar,d_scalar;
96     __m128           dummy_mask,cutoff_mask;
97     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
98     __m128           one     = _mm_set1_ps(1.0);
99     __m128           two     = _mm_set1_ps(2.0);
100     x                = xx[0];
101     f                = ff[0];
102
103     nri              = nlist->nri;
104     iinr             = nlist->iinr;
105     jindex           = nlist->jindex;
106     jjnr             = nlist->jjnr;
107     shiftidx         = nlist->shift;
108     gid              = nlist->gid;
109     shiftvec         = fr->shift_vec[0];
110     fshift           = fr->fshift[0];
111     facel            = _mm_set1_ps(fr->epsfac);
112     charge           = mdatoms->chargeA;
113     nvdwtype         = fr->ntype;
114     vdwparam         = fr->nbfp;
115     vdwtype          = mdatoms->typeA;
116
117     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
118     ewtab            = fr->ic->tabq_coul_FDV0;
119     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
120     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
121
122     /* Setup water-specific parameters */
123     inr              = nlist->iinr[0];
124     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
125     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
126     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
127     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
128
129     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
130     rcutoff_scalar   = fr->rcoulomb;
131     rcutoff          = _mm_set1_ps(rcutoff_scalar);
132     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
133
134     rswitch_scalar   = fr->rcoulomb_switch;
135     rswitch          = _mm_set1_ps(rswitch_scalar);
136     /* Setup switch parameters */
137     d_scalar         = rcutoff_scalar-rswitch_scalar;
138     d                = _mm_set1_ps(d_scalar);
139     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
140     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
141     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
142     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
143     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
144     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
145
146     /* Avoid stupid compiler warnings */
147     jnrA = jnrB = jnrC = jnrD = 0;
148     j_coord_offsetA = 0;
149     j_coord_offsetB = 0;
150     j_coord_offsetC = 0;
151     j_coord_offsetD = 0;
152
153     outeriter        = 0;
154     inneriter        = 0;
155
156     for(iidx=0;iidx<4*DIM;iidx++)
157     {
158         scratch[iidx] = 0.0;
159     }
160
161     /* Start outer loop over neighborlists */
162     for(iidx=0; iidx<nri; iidx++)
163     {
164         /* Load shift vector for this list */
165         i_shift_offset   = DIM*shiftidx[iidx];
166
167         /* Load limits for loop over neighbors */
168         j_index_start    = jindex[iidx];
169         j_index_end      = jindex[iidx+1];
170
171         /* Get outer coordinate index */
172         inr              = iinr[iidx];
173         i_coord_offset   = DIM*inr;
174
175         /* Load i particle coords and add shift vector */
176         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
177                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
178
179         fix0             = _mm_setzero_ps();
180         fiy0             = _mm_setzero_ps();
181         fiz0             = _mm_setzero_ps();
182         fix1             = _mm_setzero_ps();
183         fiy1             = _mm_setzero_ps();
184         fiz1             = _mm_setzero_ps();
185         fix2             = _mm_setzero_ps();
186         fiy2             = _mm_setzero_ps();
187         fiz2             = _mm_setzero_ps();
188         fix3             = _mm_setzero_ps();
189         fiy3             = _mm_setzero_ps();
190         fiz3             = _mm_setzero_ps();
191
192         /* Reset potential sums */
193         velecsum         = _mm_setzero_ps();
194         vvdwsum          = _mm_setzero_ps();
195
196         /* Start inner kernel loop */
197         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
198         {
199
200             /* Get j neighbor index, and coordinate index */
201             jnrA             = jjnr[jidx];
202             jnrB             = jjnr[jidx+1];
203             jnrC             = jjnr[jidx+2];
204             jnrD             = jjnr[jidx+3];
205             j_coord_offsetA  = DIM*jnrA;
206             j_coord_offsetB  = DIM*jnrB;
207             j_coord_offsetC  = DIM*jnrC;
208             j_coord_offsetD  = DIM*jnrD;
209
210             /* load j atom coordinates */
211             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212                                               x+j_coord_offsetC,x+j_coord_offsetD,
213                                               &jx0,&jy0,&jz0);
214
215             /* Calculate displacement vector */
216             dx00             = _mm_sub_ps(ix0,jx0);
217             dy00             = _mm_sub_ps(iy0,jy0);
218             dz00             = _mm_sub_ps(iz0,jz0);
219             dx10             = _mm_sub_ps(ix1,jx0);
220             dy10             = _mm_sub_ps(iy1,jy0);
221             dz10             = _mm_sub_ps(iz1,jz0);
222             dx20             = _mm_sub_ps(ix2,jx0);
223             dy20             = _mm_sub_ps(iy2,jy0);
224             dz20             = _mm_sub_ps(iz2,jz0);
225             dx30             = _mm_sub_ps(ix3,jx0);
226             dy30             = _mm_sub_ps(iy3,jy0);
227             dz30             = _mm_sub_ps(iz3,jz0);
228
229             /* Calculate squared distance and things based on it */
230             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
231             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
232             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
233             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
234
235             rinv00           = gmx_mm_invsqrt_ps(rsq00);
236             rinv10           = gmx_mm_invsqrt_ps(rsq10);
237             rinv20           = gmx_mm_invsqrt_ps(rsq20);
238             rinv30           = gmx_mm_invsqrt_ps(rsq30);
239
240             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
241             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
242             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
243             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
244
245             /* Load parameters for j particles */
246             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
247                                                               charge+jnrC+0,charge+jnrD+0);
248             vdwjidx0A        = 2*vdwtype[jnrA+0];
249             vdwjidx0B        = 2*vdwtype[jnrB+0];
250             vdwjidx0C        = 2*vdwtype[jnrC+0];
251             vdwjidx0D        = 2*vdwtype[jnrD+0];
252
253             fjx0             = _mm_setzero_ps();
254             fjy0             = _mm_setzero_ps();
255             fjz0             = _mm_setzero_ps();
256
257             /**************************
258              * CALCULATE INTERACTIONS *
259              **************************/
260
261             if (gmx_mm_any_lt(rsq00,rcutoff2))
262             {
263
264             r00              = _mm_mul_ps(rsq00,rinv00);
265
266             /* Compute parameters for interactions between i and j atoms */
267             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
268                                          vdwparam+vdwioffset0+vdwjidx0B,
269                                          vdwparam+vdwioffset0+vdwjidx0C,
270                                          vdwparam+vdwioffset0+vdwjidx0D,
271                                          &c6_00,&c12_00);
272
273             /* LENNARD-JONES DISPERSION/REPULSION */
274
275             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
276             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
277             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
278             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
279             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
280
281             d                = _mm_sub_ps(r00,rswitch);
282             d                = _mm_max_ps(d,_mm_setzero_ps());
283             d2               = _mm_mul_ps(d,d);
284             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
285
286             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
287
288             /* Evaluate switch function */
289             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
290             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
291             vvdw             = _mm_mul_ps(vvdw,sw);
292             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
293
294             /* Update potential sum for this i atom from the interaction with this j atom. */
295             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
296             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
297
298             fscal            = fvdw;
299
300             fscal            = _mm_and_ps(fscal,cutoff_mask);
301
302             /* Calculate temporary vectorial force */
303             tx               = _mm_mul_ps(fscal,dx00);
304             ty               = _mm_mul_ps(fscal,dy00);
305             tz               = _mm_mul_ps(fscal,dz00);
306
307             /* Update vectorial force */
308             fix0             = _mm_add_ps(fix0,tx);
309             fiy0             = _mm_add_ps(fiy0,ty);
310             fiz0             = _mm_add_ps(fiz0,tz);
311
312             fjx0             = _mm_add_ps(fjx0,tx);
313             fjy0             = _mm_add_ps(fjy0,ty);
314             fjz0             = _mm_add_ps(fjz0,tz);
315
316             }
317
318             /**************************
319              * CALCULATE INTERACTIONS *
320              **************************/
321
322             if (gmx_mm_any_lt(rsq10,rcutoff2))
323             {
324
325             r10              = _mm_mul_ps(rsq10,rinv10);
326
327             /* Compute parameters for interactions between i and j atoms */
328             qq10             = _mm_mul_ps(iq1,jq0);
329
330             /* EWALD ELECTROSTATICS */
331
332             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
333             ewrt             = _mm_mul_ps(r10,ewtabscale);
334             ewitab           = _mm_cvttps_epi32(ewrt);
335             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
336             ewitab           = _mm_slli_epi32(ewitab,2);
337             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
338             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
339             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
340             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
341             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
342             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
343             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
344             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
345             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
346
347             d                = _mm_sub_ps(r10,rswitch);
348             d                = _mm_max_ps(d,_mm_setzero_ps());
349             d2               = _mm_mul_ps(d,d);
350             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
351
352             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
353
354             /* Evaluate switch function */
355             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
356             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
357             velec            = _mm_mul_ps(velec,sw);
358             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
359
360             /* Update potential sum for this i atom from the interaction with this j atom. */
361             velec            = _mm_and_ps(velec,cutoff_mask);
362             velecsum         = _mm_add_ps(velecsum,velec);
363
364             fscal            = felec;
365
366             fscal            = _mm_and_ps(fscal,cutoff_mask);
367
368             /* Calculate temporary vectorial force */
369             tx               = _mm_mul_ps(fscal,dx10);
370             ty               = _mm_mul_ps(fscal,dy10);
371             tz               = _mm_mul_ps(fscal,dz10);
372
373             /* Update vectorial force */
374             fix1             = _mm_add_ps(fix1,tx);
375             fiy1             = _mm_add_ps(fiy1,ty);
376             fiz1             = _mm_add_ps(fiz1,tz);
377
378             fjx0             = _mm_add_ps(fjx0,tx);
379             fjy0             = _mm_add_ps(fjy0,ty);
380             fjz0             = _mm_add_ps(fjz0,tz);
381
382             }
383
384             /**************************
385              * CALCULATE INTERACTIONS *
386              **************************/
387
388             if (gmx_mm_any_lt(rsq20,rcutoff2))
389             {
390
391             r20              = _mm_mul_ps(rsq20,rinv20);
392
393             /* Compute parameters for interactions between i and j atoms */
394             qq20             = _mm_mul_ps(iq2,jq0);
395
396             /* EWALD ELECTROSTATICS */
397
398             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
399             ewrt             = _mm_mul_ps(r20,ewtabscale);
400             ewitab           = _mm_cvttps_epi32(ewrt);
401             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
402             ewitab           = _mm_slli_epi32(ewitab,2);
403             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
404             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
405             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
406             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
407             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
408             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
409             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
410             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
411             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
412
413             d                = _mm_sub_ps(r20,rswitch);
414             d                = _mm_max_ps(d,_mm_setzero_ps());
415             d2               = _mm_mul_ps(d,d);
416             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
417
418             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
419
420             /* Evaluate switch function */
421             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
422             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
423             velec            = _mm_mul_ps(velec,sw);
424             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
425
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velec            = _mm_and_ps(velec,cutoff_mask);
428             velecsum         = _mm_add_ps(velecsum,velec);
429
430             fscal            = felec;
431
432             fscal            = _mm_and_ps(fscal,cutoff_mask);
433
434             /* Calculate temporary vectorial force */
435             tx               = _mm_mul_ps(fscal,dx20);
436             ty               = _mm_mul_ps(fscal,dy20);
437             tz               = _mm_mul_ps(fscal,dz20);
438
439             /* Update vectorial force */
440             fix2             = _mm_add_ps(fix2,tx);
441             fiy2             = _mm_add_ps(fiy2,ty);
442             fiz2             = _mm_add_ps(fiz2,tz);
443
444             fjx0             = _mm_add_ps(fjx0,tx);
445             fjy0             = _mm_add_ps(fjy0,ty);
446             fjz0             = _mm_add_ps(fjz0,tz);
447
448             }
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             if (gmx_mm_any_lt(rsq30,rcutoff2))
455             {
456
457             r30              = _mm_mul_ps(rsq30,rinv30);
458
459             /* Compute parameters for interactions between i and j atoms */
460             qq30             = _mm_mul_ps(iq3,jq0);
461
462             /* EWALD ELECTROSTATICS */
463
464             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
465             ewrt             = _mm_mul_ps(r30,ewtabscale);
466             ewitab           = _mm_cvttps_epi32(ewrt);
467             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
468             ewitab           = _mm_slli_epi32(ewitab,2);
469             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
470             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
471             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
472             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
473             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
474             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
475             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
476             velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
477             felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
478
479             d                = _mm_sub_ps(r30,rswitch);
480             d                = _mm_max_ps(d,_mm_setzero_ps());
481             d2               = _mm_mul_ps(d,d);
482             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
483
484             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
485
486             /* Evaluate switch function */
487             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
488             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
489             velec            = _mm_mul_ps(velec,sw);
490             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
491
492             /* Update potential sum for this i atom from the interaction with this j atom. */
493             velec            = _mm_and_ps(velec,cutoff_mask);
494             velecsum         = _mm_add_ps(velecsum,velec);
495
496             fscal            = felec;
497
498             fscal            = _mm_and_ps(fscal,cutoff_mask);
499
500             /* Calculate temporary vectorial force */
501             tx               = _mm_mul_ps(fscal,dx30);
502             ty               = _mm_mul_ps(fscal,dy30);
503             tz               = _mm_mul_ps(fscal,dz30);
504
505             /* Update vectorial force */
506             fix3             = _mm_add_ps(fix3,tx);
507             fiy3             = _mm_add_ps(fiy3,ty);
508             fiz3             = _mm_add_ps(fiz3,tz);
509
510             fjx0             = _mm_add_ps(fjx0,tx);
511             fjy0             = _mm_add_ps(fjy0,ty);
512             fjz0             = _mm_add_ps(fjz0,tz);
513
514             }
515
516             fjptrA             = f+j_coord_offsetA;
517             fjptrB             = f+j_coord_offsetB;
518             fjptrC             = f+j_coord_offsetC;
519             fjptrD             = f+j_coord_offsetD;
520
521             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
522
523             /* Inner loop uses 254 flops */
524         }
525
526         if(jidx<j_index_end)
527         {
528
529             /* Get j neighbor index, and coordinate index */
530             jnrlistA         = jjnr[jidx];
531             jnrlistB         = jjnr[jidx+1];
532             jnrlistC         = jjnr[jidx+2];
533             jnrlistD         = jjnr[jidx+3];
534             /* Sign of each element will be negative for non-real atoms.
535              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
536              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
537              */
538             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
539             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
540             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
541             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
542             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
543             j_coord_offsetA  = DIM*jnrA;
544             j_coord_offsetB  = DIM*jnrB;
545             j_coord_offsetC  = DIM*jnrC;
546             j_coord_offsetD  = DIM*jnrD;
547
548             /* load j atom coordinates */
549             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
550                                               x+j_coord_offsetC,x+j_coord_offsetD,
551                                               &jx0,&jy0,&jz0);
552
553             /* Calculate displacement vector */
554             dx00             = _mm_sub_ps(ix0,jx0);
555             dy00             = _mm_sub_ps(iy0,jy0);
556             dz00             = _mm_sub_ps(iz0,jz0);
557             dx10             = _mm_sub_ps(ix1,jx0);
558             dy10             = _mm_sub_ps(iy1,jy0);
559             dz10             = _mm_sub_ps(iz1,jz0);
560             dx20             = _mm_sub_ps(ix2,jx0);
561             dy20             = _mm_sub_ps(iy2,jy0);
562             dz20             = _mm_sub_ps(iz2,jz0);
563             dx30             = _mm_sub_ps(ix3,jx0);
564             dy30             = _mm_sub_ps(iy3,jy0);
565             dz30             = _mm_sub_ps(iz3,jz0);
566
567             /* Calculate squared distance and things based on it */
568             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
569             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
570             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
571             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
572
573             rinv00           = gmx_mm_invsqrt_ps(rsq00);
574             rinv10           = gmx_mm_invsqrt_ps(rsq10);
575             rinv20           = gmx_mm_invsqrt_ps(rsq20);
576             rinv30           = gmx_mm_invsqrt_ps(rsq30);
577
578             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
579             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
580             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
581             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
582
583             /* Load parameters for j particles */
584             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
585                                                               charge+jnrC+0,charge+jnrD+0);
586             vdwjidx0A        = 2*vdwtype[jnrA+0];
587             vdwjidx0B        = 2*vdwtype[jnrB+0];
588             vdwjidx0C        = 2*vdwtype[jnrC+0];
589             vdwjidx0D        = 2*vdwtype[jnrD+0];
590
591             fjx0             = _mm_setzero_ps();
592             fjy0             = _mm_setzero_ps();
593             fjz0             = _mm_setzero_ps();
594
595             /**************************
596              * CALCULATE INTERACTIONS *
597              **************************/
598
599             if (gmx_mm_any_lt(rsq00,rcutoff2))
600             {
601
602             r00              = _mm_mul_ps(rsq00,rinv00);
603             r00              = _mm_andnot_ps(dummy_mask,r00);
604
605             /* Compute parameters for interactions between i and j atoms */
606             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
607                                          vdwparam+vdwioffset0+vdwjidx0B,
608                                          vdwparam+vdwioffset0+vdwjidx0C,
609                                          vdwparam+vdwioffset0+vdwjidx0D,
610                                          &c6_00,&c12_00);
611
612             /* LENNARD-JONES DISPERSION/REPULSION */
613
614             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
615             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
616             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
617             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
618             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
619
620             d                = _mm_sub_ps(r00,rswitch);
621             d                = _mm_max_ps(d,_mm_setzero_ps());
622             d2               = _mm_mul_ps(d,d);
623             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
624
625             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
626
627             /* Evaluate switch function */
628             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
629             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
630             vvdw             = _mm_mul_ps(vvdw,sw);
631             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
632
633             /* Update potential sum for this i atom from the interaction with this j atom. */
634             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
635             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
636             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
637
638             fscal            = fvdw;
639
640             fscal            = _mm_and_ps(fscal,cutoff_mask);
641
642             fscal            = _mm_andnot_ps(dummy_mask,fscal);
643
644             /* Calculate temporary vectorial force */
645             tx               = _mm_mul_ps(fscal,dx00);
646             ty               = _mm_mul_ps(fscal,dy00);
647             tz               = _mm_mul_ps(fscal,dz00);
648
649             /* Update vectorial force */
650             fix0             = _mm_add_ps(fix0,tx);
651             fiy0             = _mm_add_ps(fiy0,ty);
652             fiz0             = _mm_add_ps(fiz0,tz);
653
654             fjx0             = _mm_add_ps(fjx0,tx);
655             fjy0             = _mm_add_ps(fjy0,ty);
656             fjz0             = _mm_add_ps(fjz0,tz);
657
658             }
659
660             /**************************
661              * CALCULATE INTERACTIONS *
662              **************************/
663
664             if (gmx_mm_any_lt(rsq10,rcutoff2))
665             {
666
667             r10              = _mm_mul_ps(rsq10,rinv10);
668             r10              = _mm_andnot_ps(dummy_mask,r10);
669
670             /* Compute parameters for interactions between i and j atoms */
671             qq10             = _mm_mul_ps(iq1,jq0);
672
673             /* EWALD ELECTROSTATICS */
674
675             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
676             ewrt             = _mm_mul_ps(r10,ewtabscale);
677             ewitab           = _mm_cvttps_epi32(ewrt);
678             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
679             ewitab           = _mm_slli_epi32(ewitab,2);
680             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
681             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
682             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
683             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
684             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
685             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
686             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
687             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
688             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
689
690             d                = _mm_sub_ps(r10,rswitch);
691             d                = _mm_max_ps(d,_mm_setzero_ps());
692             d2               = _mm_mul_ps(d,d);
693             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
694
695             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
696
697             /* Evaluate switch function */
698             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
699             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
700             velec            = _mm_mul_ps(velec,sw);
701             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
702
703             /* Update potential sum for this i atom from the interaction with this j atom. */
704             velec            = _mm_and_ps(velec,cutoff_mask);
705             velec            = _mm_andnot_ps(dummy_mask,velec);
706             velecsum         = _mm_add_ps(velecsum,velec);
707
708             fscal            = felec;
709
710             fscal            = _mm_and_ps(fscal,cutoff_mask);
711
712             fscal            = _mm_andnot_ps(dummy_mask,fscal);
713
714             /* Calculate temporary vectorial force */
715             tx               = _mm_mul_ps(fscal,dx10);
716             ty               = _mm_mul_ps(fscal,dy10);
717             tz               = _mm_mul_ps(fscal,dz10);
718
719             /* Update vectorial force */
720             fix1             = _mm_add_ps(fix1,tx);
721             fiy1             = _mm_add_ps(fiy1,ty);
722             fiz1             = _mm_add_ps(fiz1,tz);
723
724             fjx0             = _mm_add_ps(fjx0,tx);
725             fjy0             = _mm_add_ps(fjy0,ty);
726             fjz0             = _mm_add_ps(fjz0,tz);
727
728             }
729
730             /**************************
731              * CALCULATE INTERACTIONS *
732              **************************/
733
734             if (gmx_mm_any_lt(rsq20,rcutoff2))
735             {
736
737             r20              = _mm_mul_ps(rsq20,rinv20);
738             r20              = _mm_andnot_ps(dummy_mask,r20);
739
740             /* Compute parameters for interactions between i and j atoms */
741             qq20             = _mm_mul_ps(iq2,jq0);
742
743             /* EWALD ELECTROSTATICS */
744
745             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
746             ewrt             = _mm_mul_ps(r20,ewtabscale);
747             ewitab           = _mm_cvttps_epi32(ewrt);
748             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
749             ewitab           = _mm_slli_epi32(ewitab,2);
750             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
751             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
752             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
753             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
754             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
755             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
756             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
757             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
758             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
759
760             d                = _mm_sub_ps(r20,rswitch);
761             d                = _mm_max_ps(d,_mm_setzero_ps());
762             d2               = _mm_mul_ps(d,d);
763             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
764
765             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
766
767             /* Evaluate switch function */
768             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
769             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
770             velec            = _mm_mul_ps(velec,sw);
771             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
772
773             /* Update potential sum for this i atom from the interaction with this j atom. */
774             velec            = _mm_and_ps(velec,cutoff_mask);
775             velec            = _mm_andnot_ps(dummy_mask,velec);
776             velecsum         = _mm_add_ps(velecsum,velec);
777
778             fscal            = felec;
779
780             fscal            = _mm_and_ps(fscal,cutoff_mask);
781
782             fscal            = _mm_andnot_ps(dummy_mask,fscal);
783
784             /* Calculate temporary vectorial force */
785             tx               = _mm_mul_ps(fscal,dx20);
786             ty               = _mm_mul_ps(fscal,dy20);
787             tz               = _mm_mul_ps(fscal,dz20);
788
789             /* Update vectorial force */
790             fix2             = _mm_add_ps(fix2,tx);
791             fiy2             = _mm_add_ps(fiy2,ty);
792             fiz2             = _mm_add_ps(fiz2,tz);
793
794             fjx0             = _mm_add_ps(fjx0,tx);
795             fjy0             = _mm_add_ps(fjy0,ty);
796             fjz0             = _mm_add_ps(fjz0,tz);
797
798             }
799
800             /**************************
801              * CALCULATE INTERACTIONS *
802              **************************/
803
804             if (gmx_mm_any_lt(rsq30,rcutoff2))
805             {
806
807             r30              = _mm_mul_ps(rsq30,rinv30);
808             r30              = _mm_andnot_ps(dummy_mask,r30);
809
810             /* Compute parameters for interactions between i and j atoms */
811             qq30             = _mm_mul_ps(iq3,jq0);
812
813             /* EWALD ELECTROSTATICS */
814
815             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
816             ewrt             = _mm_mul_ps(r30,ewtabscale);
817             ewitab           = _mm_cvttps_epi32(ewrt);
818             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
819             ewitab           = _mm_slli_epi32(ewitab,2);
820             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
821             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
822             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
823             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
824             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
825             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
826             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
827             velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
828             felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
829
830             d                = _mm_sub_ps(r30,rswitch);
831             d                = _mm_max_ps(d,_mm_setzero_ps());
832             d2               = _mm_mul_ps(d,d);
833             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
834
835             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
836
837             /* Evaluate switch function */
838             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
839             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
840             velec            = _mm_mul_ps(velec,sw);
841             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
842
843             /* Update potential sum for this i atom from the interaction with this j atom. */
844             velec            = _mm_and_ps(velec,cutoff_mask);
845             velec            = _mm_andnot_ps(dummy_mask,velec);
846             velecsum         = _mm_add_ps(velecsum,velec);
847
848             fscal            = felec;
849
850             fscal            = _mm_and_ps(fscal,cutoff_mask);
851
852             fscal            = _mm_andnot_ps(dummy_mask,fscal);
853
854             /* Calculate temporary vectorial force */
855             tx               = _mm_mul_ps(fscal,dx30);
856             ty               = _mm_mul_ps(fscal,dy30);
857             tz               = _mm_mul_ps(fscal,dz30);
858
859             /* Update vectorial force */
860             fix3             = _mm_add_ps(fix3,tx);
861             fiy3             = _mm_add_ps(fiy3,ty);
862             fiz3             = _mm_add_ps(fiz3,tz);
863
864             fjx0             = _mm_add_ps(fjx0,tx);
865             fjy0             = _mm_add_ps(fjy0,ty);
866             fjz0             = _mm_add_ps(fjz0,tz);
867
868             }
869
870             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
871             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
872             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
873             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
874
875             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
876
877             /* Inner loop uses 258 flops */
878         }
879
880         /* End of innermost loop */
881
882         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
883                                               f+i_coord_offset,fshift+i_shift_offset);
884
885         ggid                        = gid[iidx];
886         /* Update potential energies */
887         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
888         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
889
890         /* Increment number of inner iterations */
891         inneriter                  += j_index_end - j_index_start;
892
893         /* Outer loop uses 26 flops */
894     }
895
896     /* Increment number of outer iterations */
897     outeriter        += nri;
898
899     /* Update outer/inner flops */
900
901     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*258);
902 }
903 /*
904  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
905  * Electrostatics interaction: Ewald
906  * VdW interaction:            LennardJones
907  * Geometry:                   Water4-Particle
908  * Calculate force/pot:        Force
909  */
910 void
911 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
912                     (t_nblist * gmx_restrict                nlist,
913                      rvec * gmx_restrict                    xx,
914                      rvec * gmx_restrict                    ff,
915                      t_forcerec * gmx_restrict              fr,
916                      t_mdatoms * gmx_restrict               mdatoms,
917                      nb_kernel_data_t * gmx_restrict        kernel_data,
918                      t_nrnb * gmx_restrict                  nrnb)
919 {
920     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
921      * just 0 for non-waters.
922      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
923      * jnr indices corresponding to data put in the four positions in the SIMD register.
924      */
925     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
926     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
927     int              jnrA,jnrB,jnrC,jnrD;
928     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
929     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
930     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
931     real             rcutoff_scalar;
932     real             *shiftvec,*fshift,*x,*f;
933     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
934     real             scratch[4*DIM];
935     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
936     int              vdwioffset0;
937     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
938     int              vdwioffset1;
939     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
940     int              vdwioffset2;
941     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
942     int              vdwioffset3;
943     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
944     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
945     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
946     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
947     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
948     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
949     __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
950     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
951     real             *charge;
952     int              nvdwtype;
953     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
954     int              *vdwtype;
955     real             *vdwparam;
956     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
957     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
958     __m128i          ewitab;
959     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
960     real             *ewtab;
961     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
962     real             rswitch_scalar,d_scalar;
963     __m128           dummy_mask,cutoff_mask;
964     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
965     __m128           one     = _mm_set1_ps(1.0);
966     __m128           two     = _mm_set1_ps(2.0);
967     x                = xx[0];
968     f                = ff[0];
969
970     nri              = nlist->nri;
971     iinr             = nlist->iinr;
972     jindex           = nlist->jindex;
973     jjnr             = nlist->jjnr;
974     shiftidx         = nlist->shift;
975     gid              = nlist->gid;
976     shiftvec         = fr->shift_vec[0];
977     fshift           = fr->fshift[0];
978     facel            = _mm_set1_ps(fr->epsfac);
979     charge           = mdatoms->chargeA;
980     nvdwtype         = fr->ntype;
981     vdwparam         = fr->nbfp;
982     vdwtype          = mdatoms->typeA;
983
984     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
985     ewtab            = fr->ic->tabq_coul_FDV0;
986     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
987     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
988
989     /* Setup water-specific parameters */
990     inr              = nlist->iinr[0];
991     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
992     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
993     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
994     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
995
996     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
997     rcutoff_scalar   = fr->rcoulomb;
998     rcutoff          = _mm_set1_ps(rcutoff_scalar);
999     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1000
1001     rswitch_scalar   = fr->rcoulomb_switch;
1002     rswitch          = _mm_set1_ps(rswitch_scalar);
1003     /* Setup switch parameters */
1004     d_scalar         = rcutoff_scalar-rswitch_scalar;
1005     d                = _mm_set1_ps(d_scalar);
1006     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1007     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1008     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1009     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1010     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1011     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1012
1013     /* Avoid stupid compiler warnings */
1014     jnrA = jnrB = jnrC = jnrD = 0;
1015     j_coord_offsetA = 0;
1016     j_coord_offsetB = 0;
1017     j_coord_offsetC = 0;
1018     j_coord_offsetD = 0;
1019
1020     outeriter        = 0;
1021     inneriter        = 0;
1022
1023     for(iidx=0;iidx<4*DIM;iidx++)
1024     {
1025         scratch[iidx] = 0.0;
1026     }
1027
1028     /* Start outer loop over neighborlists */
1029     for(iidx=0; iidx<nri; iidx++)
1030     {
1031         /* Load shift vector for this list */
1032         i_shift_offset   = DIM*shiftidx[iidx];
1033
1034         /* Load limits for loop over neighbors */
1035         j_index_start    = jindex[iidx];
1036         j_index_end      = jindex[iidx+1];
1037
1038         /* Get outer coordinate index */
1039         inr              = iinr[iidx];
1040         i_coord_offset   = DIM*inr;
1041
1042         /* Load i particle coords and add shift vector */
1043         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1044                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1045
1046         fix0             = _mm_setzero_ps();
1047         fiy0             = _mm_setzero_ps();
1048         fiz0             = _mm_setzero_ps();
1049         fix1             = _mm_setzero_ps();
1050         fiy1             = _mm_setzero_ps();
1051         fiz1             = _mm_setzero_ps();
1052         fix2             = _mm_setzero_ps();
1053         fiy2             = _mm_setzero_ps();
1054         fiz2             = _mm_setzero_ps();
1055         fix3             = _mm_setzero_ps();
1056         fiy3             = _mm_setzero_ps();
1057         fiz3             = _mm_setzero_ps();
1058
1059         /* Start inner kernel loop */
1060         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1061         {
1062
1063             /* Get j neighbor index, and coordinate index */
1064             jnrA             = jjnr[jidx];
1065             jnrB             = jjnr[jidx+1];
1066             jnrC             = jjnr[jidx+2];
1067             jnrD             = jjnr[jidx+3];
1068             j_coord_offsetA  = DIM*jnrA;
1069             j_coord_offsetB  = DIM*jnrB;
1070             j_coord_offsetC  = DIM*jnrC;
1071             j_coord_offsetD  = DIM*jnrD;
1072
1073             /* load j atom coordinates */
1074             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1075                                               x+j_coord_offsetC,x+j_coord_offsetD,
1076                                               &jx0,&jy0,&jz0);
1077
1078             /* Calculate displacement vector */
1079             dx00             = _mm_sub_ps(ix0,jx0);
1080             dy00             = _mm_sub_ps(iy0,jy0);
1081             dz00             = _mm_sub_ps(iz0,jz0);
1082             dx10             = _mm_sub_ps(ix1,jx0);
1083             dy10             = _mm_sub_ps(iy1,jy0);
1084             dz10             = _mm_sub_ps(iz1,jz0);
1085             dx20             = _mm_sub_ps(ix2,jx0);
1086             dy20             = _mm_sub_ps(iy2,jy0);
1087             dz20             = _mm_sub_ps(iz2,jz0);
1088             dx30             = _mm_sub_ps(ix3,jx0);
1089             dy30             = _mm_sub_ps(iy3,jy0);
1090             dz30             = _mm_sub_ps(iz3,jz0);
1091
1092             /* Calculate squared distance and things based on it */
1093             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1094             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1095             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1096             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1097
1098             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1099             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1100             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1101             rinv30           = gmx_mm_invsqrt_ps(rsq30);
1102
1103             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1104             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1105             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1106             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
1107
1108             /* Load parameters for j particles */
1109             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1110                                                               charge+jnrC+0,charge+jnrD+0);
1111             vdwjidx0A        = 2*vdwtype[jnrA+0];
1112             vdwjidx0B        = 2*vdwtype[jnrB+0];
1113             vdwjidx0C        = 2*vdwtype[jnrC+0];
1114             vdwjidx0D        = 2*vdwtype[jnrD+0];
1115
1116             fjx0             = _mm_setzero_ps();
1117             fjy0             = _mm_setzero_ps();
1118             fjz0             = _mm_setzero_ps();
1119
1120             /**************************
1121              * CALCULATE INTERACTIONS *
1122              **************************/
1123
1124             if (gmx_mm_any_lt(rsq00,rcutoff2))
1125             {
1126
1127             r00              = _mm_mul_ps(rsq00,rinv00);
1128
1129             /* Compute parameters for interactions between i and j atoms */
1130             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1131                                          vdwparam+vdwioffset0+vdwjidx0B,
1132                                          vdwparam+vdwioffset0+vdwjidx0C,
1133                                          vdwparam+vdwioffset0+vdwjidx0D,
1134                                          &c6_00,&c12_00);
1135
1136             /* LENNARD-JONES DISPERSION/REPULSION */
1137
1138             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1139             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1140             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1141             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1142             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1143
1144             d                = _mm_sub_ps(r00,rswitch);
1145             d                = _mm_max_ps(d,_mm_setzero_ps());
1146             d2               = _mm_mul_ps(d,d);
1147             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1148
1149             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1150
1151             /* Evaluate switch function */
1152             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1153             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1154             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1155
1156             fscal            = fvdw;
1157
1158             fscal            = _mm_and_ps(fscal,cutoff_mask);
1159
1160             /* Calculate temporary vectorial force */
1161             tx               = _mm_mul_ps(fscal,dx00);
1162             ty               = _mm_mul_ps(fscal,dy00);
1163             tz               = _mm_mul_ps(fscal,dz00);
1164
1165             /* Update vectorial force */
1166             fix0             = _mm_add_ps(fix0,tx);
1167             fiy0             = _mm_add_ps(fiy0,ty);
1168             fiz0             = _mm_add_ps(fiz0,tz);
1169
1170             fjx0             = _mm_add_ps(fjx0,tx);
1171             fjy0             = _mm_add_ps(fjy0,ty);
1172             fjz0             = _mm_add_ps(fjz0,tz);
1173
1174             }
1175
1176             /**************************
1177              * CALCULATE INTERACTIONS *
1178              **************************/
1179
1180             if (gmx_mm_any_lt(rsq10,rcutoff2))
1181             {
1182
1183             r10              = _mm_mul_ps(rsq10,rinv10);
1184
1185             /* Compute parameters for interactions between i and j atoms */
1186             qq10             = _mm_mul_ps(iq1,jq0);
1187
1188             /* EWALD ELECTROSTATICS */
1189
1190             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1191             ewrt             = _mm_mul_ps(r10,ewtabscale);
1192             ewitab           = _mm_cvttps_epi32(ewrt);
1193             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1194             ewitab           = _mm_slli_epi32(ewitab,2);
1195             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1196             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1197             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1198             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1199             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1200             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1201             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1202             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1203             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1204
1205             d                = _mm_sub_ps(r10,rswitch);
1206             d                = _mm_max_ps(d,_mm_setzero_ps());
1207             d2               = _mm_mul_ps(d,d);
1208             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1209
1210             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1211
1212             /* Evaluate switch function */
1213             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1214             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1215             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1216
1217             fscal            = felec;
1218
1219             fscal            = _mm_and_ps(fscal,cutoff_mask);
1220
1221             /* Calculate temporary vectorial force */
1222             tx               = _mm_mul_ps(fscal,dx10);
1223             ty               = _mm_mul_ps(fscal,dy10);
1224             tz               = _mm_mul_ps(fscal,dz10);
1225
1226             /* Update vectorial force */
1227             fix1             = _mm_add_ps(fix1,tx);
1228             fiy1             = _mm_add_ps(fiy1,ty);
1229             fiz1             = _mm_add_ps(fiz1,tz);
1230
1231             fjx0             = _mm_add_ps(fjx0,tx);
1232             fjy0             = _mm_add_ps(fjy0,ty);
1233             fjz0             = _mm_add_ps(fjz0,tz);
1234
1235             }
1236
1237             /**************************
1238              * CALCULATE INTERACTIONS *
1239              **************************/
1240
1241             if (gmx_mm_any_lt(rsq20,rcutoff2))
1242             {
1243
1244             r20              = _mm_mul_ps(rsq20,rinv20);
1245
1246             /* Compute parameters for interactions between i and j atoms */
1247             qq20             = _mm_mul_ps(iq2,jq0);
1248
1249             /* EWALD ELECTROSTATICS */
1250
1251             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1252             ewrt             = _mm_mul_ps(r20,ewtabscale);
1253             ewitab           = _mm_cvttps_epi32(ewrt);
1254             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1255             ewitab           = _mm_slli_epi32(ewitab,2);
1256             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1257             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1258             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1259             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1260             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1261             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1262             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1263             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1264             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1265
1266             d                = _mm_sub_ps(r20,rswitch);
1267             d                = _mm_max_ps(d,_mm_setzero_ps());
1268             d2               = _mm_mul_ps(d,d);
1269             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1270
1271             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1272
1273             /* Evaluate switch function */
1274             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1275             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1276             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1277
1278             fscal            = felec;
1279
1280             fscal            = _mm_and_ps(fscal,cutoff_mask);
1281
1282             /* Calculate temporary vectorial force */
1283             tx               = _mm_mul_ps(fscal,dx20);
1284             ty               = _mm_mul_ps(fscal,dy20);
1285             tz               = _mm_mul_ps(fscal,dz20);
1286
1287             /* Update vectorial force */
1288             fix2             = _mm_add_ps(fix2,tx);
1289             fiy2             = _mm_add_ps(fiy2,ty);
1290             fiz2             = _mm_add_ps(fiz2,tz);
1291
1292             fjx0             = _mm_add_ps(fjx0,tx);
1293             fjy0             = _mm_add_ps(fjy0,ty);
1294             fjz0             = _mm_add_ps(fjz0,tz);
1295
1296             }
1297
1298             /**************************
1299              * CALCULATE INTERACTIONS *
1300              **************************/
1301
1302             if (gmx_mm_any_lt(rsq30,rcutoff2))
1303             {
1304
1305             r30              = _mm_mul_ps(rsq30,rinv30);
1306
1307             /* Compute parameters for interactions between i and j atoms */
1308             qq30             = _mm_mul_ps(iq3,jq0);
1309
1310             /* EWALD ELECTROSTATICS */
1311
1312             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1313             ewrt             = _mm_mul_ps(r30,ewtabscale);
1314             ewitab           = _mm_cvttps_epi32(ewrt);
1315             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1316             ewitab           = _mm_slli_epi32(ewitab,2);
1317             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1318             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1319             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1320             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1321             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1322             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1323             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1324             velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1325             felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1326
1327             d                = _mm_sub_ps(r30,rswitch);
1328             d                = _mm_max_ps(d,_mm_setzero_ps());
1329             d2               = _mm_mul_ps(d,d);
1330             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1331
1332             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1333
1334             /* Evaluate switch function */
1335             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1336             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1337             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
1338
1339             fscal            = felec;
1340
1341             fscal            = _mm_and_ps(fscal,cutoff_mask);
1342
1343             /* Calculate temporary vectorial force */
1344             tx               = _mm_mul_ps(fscal,dx30);
1345             ty               = _mm_mul_ps(fscal,dy30);
1346             tz               = _mm_mul_ps(fscal,dz30);
1347
1348             /* Update vectorial force */
1349             fix3             = _mm_add_ps(fix3,tx);
1350             fiy3             = _mm_add_ps(fiy3,ty);
1351             fiz3             = _mm_add_ps(fiz3,tz);
1352
1353             fjx0             = _mm_add_ps(fjx0,tx);
1354             fjy0             = _mm_add_ps(fjy0,ty);
1355             fjz0             = _mm_add_ps(fjz0,tz);
1356
1357             }
1358
1359             fjptrA             = f+j_coord_offsetA;
1360             fjptrB             = f+j_coord_offsetB;
1361             fjptrC             = f+j_coord_offsetC;
1362             fjptrD             = f+j_coord_offsetD;
1363
1364             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1365
1366             /* Inner loop uses 242 flops */
1367         }
1368
1369         if(jidx<j_index_end)
1370         {
1371
1372             /* Get j neighbor index, and coordinate index */
1373             jnrlistA         = jjnr[jidx];
1374             jnrlistB         = jjnr[jidx+1];
1375             jnrlistC         = jjnr[jidx+2];
1376             jnrlistD         = jjnr[jidx+3];
1377             /* Sign of each element will be negative for non-real atoms.
1378              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1379              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1380              */
1381             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1382             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1383             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1384             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1385             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1386             j_coord_offsetA  = DIM*jnrA;
1387             j_coord_offsetB  = DIM*jnrB;
1388             j_coord_offsetC  = DIM*jnrC;
1389             j_coord_offsetD  = DIM*jnrD;
1390
1391             /* load j atom coordinates */
1392             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1393                                               x+j_coord_offsetC,x+j_coord_offsetD,
1394                                               &jx0,&jy0,&jz0);
1395
1396             /* Calculate displacement vector */
1397             dx00             = _mm_sub_ps(ix0,jx0);
1398             dy00             = _mm_sub_ps(iy0,jy0);
1399             dz00             = _mm_sub_ps(iz0,jz0);
1400             dx10             = _mm_sub_ps(ix1,jx0);
1401             dy10             = _mm_sub_ps(iy1,jy0);
1402             dz10             = _mm_sub_ps(iz1,jz0);
1403             dx20             = _mm_sub_ps(ix2,jx0);
1404             dy20             = _mm_sub_ps(iy2,jy0);
1405             dz20             = _mm_sub_ps(iz2,jz0);
1406             dx30             = _mm_sub_ps(ix3,jx0);
1407             dy30             = _mm_sub_ps(iy3,jy0);
1408             dz30             = _mm_sub_ps(iz3,jz0);
1409
1410             /* Calculate squared distance and things based on it */
1411             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1412             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1413             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1414             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1415
1416             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1417             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1418             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1419             rinv30           = gmx_mm_invsqrt_ps(rsq30);
1420
1421             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1422             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1423             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1424             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
1425
1426             /* Load parameters for j particles */
1427             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1428                                                               charge+jnrC+0,charge+jnrD+0);
1429             vdwjidx0A        = 2*vdwtype[jnrA+0];
1430             vdwjidx0B        = 2*vdwtype[jnrB+0];
1431             vdwjidx0C        = 2*vdwtype[jnrC+0];
1432             vdwjidx0D        = 2*vdwtype[jnrD+0];
1433
1434             fjx0             = _mm_setzero_ps();
1435             fjy0             = _mm_setzero_ps();
1436             fjz0             = _mm_setzero_ps();
1437
1438             /**************************
1439              * CALCULATE INTERACTIONS *
1440              **************************/
1441
1442             if (gmx_mm_any_lt(rsq00,rcutoff2))
1443             {
1444
1445             r00              = _mm_mul_ps(rsq00,rinv00);
1446             r00              = _mm_andnot_ps(dummy_mask,r00);
1447
1448             /* Compute parameters for interactions between i and j atoms */
1449             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1450                                          vdwparam+vdwioffset0+vdwjidx0B,
1451                                          vdwparam+vdwioffset0+vdwjidx0C,
1452                                          vdwparam+vdwioffset0+vdwjidx0D,
1453                                          &c6_00,&c12_00);
1454
1455             /* LENNARD-JONES DISPERSION/REPULSION */
1456
1457             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1458             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1459             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1460             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1461             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1462
1463             d                = _mm_sub_ps(r00,rswitch);
1464             d                = _mm_max_ps(d,_mm_setzero_ps());
1465             d2               = _mm_mul_ps(d,d);
1466             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1467
1468             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1469
1470             /* Evaluate switch function */
1471             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1472             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1473             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1474
1475             fscal            = fvdw;
1476
1477             fscal            = _mm_and_ps(fscal,cutoff_mask);
1478
1479             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1480
1481             /* Calculate temporary vectorial force */
1482             tx               = _mm_mul_ps(fscal,dx00);
1483             ty               = _mm_mul_ps(fscal,dy00);
1484             tz               = _mm_mul_ps(fscal,dz00);
1485
1486             /* Update vectorial force */
1487             fix0             = _mm_add_ps(fix0,tx);
1488             fiy0             = _mm_add_ps(fiy0,ty);
1489             fiz0             = _mm_add_ps(fiz0,tz);
1490
1491             fjx0             = _mm_add_ps(fjx0,tx);
1492             fjy0             = _mm_add_ps(fjy0,ty);
1493             fjz0             = _mm_add_ps(fjz0,tz);
1494
1495             }
1496
1497             /**************************
1498              * CALCULATE INTERACTIONS *
1499              **************************/
1500
1501             if (gmx_mm_any_lt(rsq10,rcutoff2))
1502             {
1503
1504             r10              = _mm_mul_ps(rsq10,rinv10);
1505             r10              = _mm_andnot_ps(dummy_mask,r10);
1506
1507             /* Compute parameters for interactions between i and j atoms */
1508             qq10             = _mm_mul_ps(iq1,jq0);
1509
1510             /* EWALD ELECTROSTATICS */
1511
1512             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1513             ewrt             = _mm_mul_ps(r10,ewtabscale);
1514             ewitab           = _mm_cvttps_epi32(ewrt);
1515             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1516             ewitab           = _mm_slli_epi32(ewitab,2);
1517             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1518             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1519             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1520             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1521             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1522             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1523             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1524             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1525             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1526
1527             d                = _mm_sub_ps(r10,rswitch);
1528             d                = _mm_max_ps(d,_mm_setzero_ps());
1529             d2               = _mm_mul_ps(d,d);
1530             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1531
1532             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1533
1534             /* Evaluate switch function */
1535             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1536             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1537             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1538
1539             fscal            = felec;
1540
1541             fscal            = _mm_and_ps(fscal,cutoff_mask);
1542
1543             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1544
1545             /* Calculate temporary vectorial force */
1546             tx               = _mm_mul_ps(fscal,dx10);
1547             ty               = _mm_mul_ps(fscal,dy10);
1548             tz               = _mm_mul_ps(fscal,dz10);
1549
1550             /* Update vectorial force */
1551             fix1             = _mm_add_ps(fix1,tx);
1552             fiy1             = _mm_add_ps(fiy1,ty);
1553             fiz1             = _mm_add_ps(fiz1,tz);
1554
1555             fjx0             = _mm_add_ps(fjx0,tx);
1556             fjy0             = _mm_add_ps(fjy0,ty);
1557             fjz0             = _mm_add_ps(fjz0,tz);
1558
1559             }
1560
1561             /**************************
1562              * CALCULATE INTERACTIONS *
1563              **************************/
1564
1565             if (gmx_mm_any_lt(rsq20,rcutoff2))
1566             {
1567
1568             r20              = _mm_mul_ps(rsq20,rinv20);
1569             r20              = _mm_andnot_ps(dummy_mask,r20);
1570
1571             /* Compute parameters for interactions between i and j atoms */
1572             qq20             = _mm_mul_ps(iq2,jq0);
1573
1574             /* EWALD ELECTROSTATICS */
1575
1576             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1577             ewrt             = _mm_mul_ps(r20,ewtabscale);
1578             ewitab           = _mm_cvttps_epi32(ewrt);
1579             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1580             ewitab           = _mm_slli_epi32(ewitab,2);
1581             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1582             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1583             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1584             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1585             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1586             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1587             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1588             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1589             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1590
1591             d                = _mm_sub_ps(r20,rswitch);
1592             d                = _mm_max_ps(d,_mm_setzero_ps());
1593             d2               = _mm_mul_ps(d,d);
1594             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1595
1596             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1597
1598             /* Evaluate switch function */
1599             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1600             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1601             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1602
1603             fscal            = felec;
1604
1605             fscal            = _mm_and_ps(fscal,cutoff_mask);
1606
1607             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1608
1609             /* Calculate temporary vectorial force */
1610             tx               = _mm_mul_ps(fscal,dx20);
1611             ty               = _mm_mul_ps(fscal,dy20);
1612             tz               = _mm_mul_ps(fscal,dz20);
1613
1614             /* Update vectorial force */
1615             fix2             = _mm_add_ps(fix2,tx);
1616             fiy2             = _mm_add_ps(fiy2,ty);
1617             fiz2             = _mm_add_ps(fiz2,tz);
1618
1619             fjx0             = _mm_add_ps(fjx0,tx);
1620             fjy0             = _mm_add_ps(fjy0,ty);
1621             fjz0             = _mm_add_ps(fjz0,tz);
1622
1623             }
1624
1625             /**************************
1626              * CALCULATE INTERACTIONS *
1627              **************************/
1628
1629             if (gmx_mm_any_lt(rsq30,rcutoff2))
1630             {
1631
1632             r30              = _mm_mul_ps(rsq30,rinv30);
1633             r30              = _mm_andnot_ps(dummy_mask,r30);
1634
1635             /* Compute parameters for interactions between i and j atoms */
1636             qq30             = _mm_mul_ps(iq3,jq0);
1637
1638             /* EWALD ELECTROSTATICS */
1639
1640             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1641             ewrt             = _mm_mul_ps(r30,ewtabscale);
1642             ewitab           = _mm_cvttps_epi32(ewrt);
1643             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1644             ewitab           = _mm_slli_epi32(ewitab,2);
1645             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1646             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1647             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1648             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1649             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1650             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1651             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1652             velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1653             felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1654
1655             d                = _mm_sub_ps(r30,rswitch);
1656             d                = _mm_max_ps(d,_mm_setzero_ps());
1657             d2               = _mm_mul_ps(d,d);
1658             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1659
1660             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1661
1662             /* Evaluate switch function */
1663             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1664             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1665             cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
1666
1667             fscal            = felec;
1668
1669             fscal            = _mm_and_ps(fscal,cutoff_mask);
1670
1671             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1672
1673             /* Calculate temporary vectorial force */
1674             tx               = _mm_mul_ps(fscal,dx30);
1675             ty               = _mm_mul_ps(fscal,dy30);
1676             tz               = _mm_mul_ps(fscal,dz30);
1677
1678             /* Update vectorial force */
1679             fix3             = _mm_add_ps(fix3,tx);
1680             fiy3             = _mm_add_ps(fiy3,ty);
1681             fiz3             = _mm_add_ps(fiz3,tz);
1682
1683             fjx0             = _mm_add_ps(fjx0,tx);
1684             fjy0             = _mm_add_ps(fjy0,ty);
1685             fjz0             = _mm_add_ps(fjz0,tz);
1686
1687             }
1688
1689             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1690             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1691             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1692             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1693
1694             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1695
1696             /* Inner loop uses 246 flops */
1697         }
1698
1699         /* End of innermost loop */
1700
1701         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1702                                               f+i_coord_offset,fshift+i_shift_offset);
1703
1704         /* Increment number of inner iterations */
1705         inneriter                  += j_index_end - j_index_start;
1706
1707         /* Outer loop uses 24 flops */
1708     }
1709
1710     /* Increment number of outer iterations */
1711     outeriter        += nri;
1712
1713     /* Update outer/inner flops */
1714
1715     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*246);
1716 }