815d92396dac7585fe07c3f785e19db4f663cd5e
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_single
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
113     real             rswitch_scalar,d_scalar;
114     __m128           dummy_mask,cutoff_mask;
115     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116     __m128           one     = _mm_set1_ps(1.0);
117     __m128           two     = _mm_set1_ps(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm_set1_ps(fr->epsfac);
130     charge           = mdatoms->chargeA;
131     krf              = _mm_set1_ps(fr->ic->k_rf);
132     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
133     crf              = _mm_set1_ps(fr->ic->c_rf);
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     /* Setup water-specific parameters */
139     inr              = nlist->iinr[0];
140     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
141     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
142     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
143     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
144
145     jq0              = _mm_set1_ps(charge[inr+0]);
146     jq1              = _mm_set1_ps(charge[inr+1]);
147     jq2              = _mm_set1_ps(charge[inr+2]);
148     vdwjidx0A        = 2*vdwtype[inr+0];
149     qq00             = _mm_mul_ps(iq0,jq0);
150     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
151     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
152     qq01             = _mm_mul_ps(iq0,jq1);
153     qq02             = _mm_mul_ps(iq0,jq2);
154     qq10             = _mm_mul_ps(iq1,jq0);
155     qq11             = _mm_mul_ps(iq1,jq1);
156     qq12             = _mm_mul_ps(iq1,jq2);
157     qq20             = _mm_mul_ps(iq2,jq0);
158     qq21             = _mm_mul_ps(iq2,jq1);
159     qq22             = _mm_mul_ps(iq2,jq2);
160
161     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
162     rcutoff_scalar   = fr->rcoulomb;
163     rcutoff          = _mm_set1_ps(rcutoff_scalar);
164     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
165
166     rswitch_scalar   = fr->rvdw_switch;
167     rswitch          = _mm_set1_ps(rswitch_scalar);
168     /* Setup switch parameters */
169     d_scalar         = rcutoff_scalar-rswitch_scalar;
170     d                = _mm_set1_ps(d_scalar);
171     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
172     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
173     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
174     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
175     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
176     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
177
178     /* Avoid stupid compiler warnings */
179     jnrA = jnrB = jnrC = jnrD = 0;
180     j_coord_offsetA = 0;
181     j_coord_offsetB = 0;
182     j_coord_offsetC = 0;
183     j_coord_offsetD = 0;
184
185     outeriter        = 0;
186     inneriter        = 0;
187
188     for(iidx=0;iidx<4*DIM;iidx++)
189     {
190         scratch[iidx] = 0.0;
191     }
192
193     /* Start outer loop over neighborlists */
194     for(iidx=0; iidx<nri; iidx++)
195     {
196         /* Load shift vector for this list */
197         i_shift_offset   = DIM*shiftidx[iidx];
198
199         /* Load limits for loop over neighbors */
200         j_index_start    = jindex[iidx];
201         j_index_end      = jindex[iidx+1];
202
203         /* Get outer coordinate index */
204         inr              = iinr[iidx];
205         i_coord_offset   = DIM*inr;
206
207         /* Load i particle coords and add shift vector */
208         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
209                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
210
211         fix0             = _mm_setzero_ps();
212         fiy0             = _mm_setzero_ps();
213         fiz0             = _mm_setzero_ps();
214         fix1             = _mm_setzero_ps();
215         fiy1             = _mm_setzero_ps();
216         fiz1             = _mm_setzero_ps();
217         fix2             = _mm_setzero_ps();
218         fiy2             = _mm_setzero_ps();
219         fiz2             = _mm_setzero_ps();
220
221         /* Reset potential sums */
222         velecsum         = _mm_setzero_ps();
223         vvdwsum          = _mm_setzero_ps();
224
225         /* Start inner kernel loop */
226         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
227         {
228
229             /* Get j neighbor index, and coordinate index */
230             jnrA             = jjnr[jidx];
231             jnrB             = jjnr[jidx+1];
232             jnrC             = jjnr[jidx+2];
233             jnrD             = jjnr[jidx+3];
234             j_coord_offsetA  = DIM*jnrA;
235             j_coord_offsetB  = DIM*jnrB;
236             j_coord_offsetC  = DIM*jnrC;
237             j_coord_offsetD  = DIM*jnrD;
238
239             /* load j atom coordinates */
240             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
241                                               x+j_coord_offsetC,x+j_coord_offsetD,
242                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
243
244             /* Calculate displacement vector */
245             dx00             = _mm_sub_ps(ix0,jx0);
246             dy00             = _mm_sub_ps(iy0,jy0);
247             dz00             = _mm_sub_ps(iz0,jz0);
248             dx01             = _mm_sub_ps(ix0,jx1);
249             dy01             = _mm_sub_ps(iy0,jy1);
250             dz01             = _mm_sub_ps(iz0,jz1);
251             dx02             = _mm_sub_ps(ix0,jx2);
252             dy02             = _mm_sub_ps(iy0,jy2);
253             dz02             = _mm_sub_ps(iz0,jz2);
254             dx10             = _mm_sub_ps(ix1,jx0);
255             dy10             = _mm_sub_ps(iy1,jy0);
256             dz10             = _mm_sub_ps(iz1,jz0);
257             dx11             = _mm_sub_ps(ix1,jx1);
258             dy11             = _mm_sub_ps(iy1,jy1);
259             dz11             = _mm_sub_ps(iz1,jz1);
260             dx12             = _mm_sub_ps(ix1,jx2);
261             dy12             = _mm_sub_ps(iy1,jy2);
262             dz12             = _mm_sub_ps(iz1,jz2);
263             dx20             = _mm_sub_ps(ix2,jx0);
264             dy20             = _mm_sub_ps(iy2,jy0);
265             dz20             = _mm_sub_ps(iz2,jz0);
266             dx21             = _mm_sub_ps(ix2,jx1);
267             dy21             = _mm_sub_ps(iy2,jy1);
268             dz21             = _mm_sub_ps(iz2,jz1);
269             dx22             = _mm_sub_ps(ix2,jx2);
270             dy22             = _mm_sub_ps(iy2,jy2);
271             dz22             = _mm_sub_ps(iz2,jz2);
272
273             /* Calculate squared distance and things based on it */
274             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
275             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
276             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
277             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
278             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
279             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
280             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
281             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
282             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
283
284             rinv00           = gmx_mm_invsqrt_ps(rsq00);
285             rinv01           = gmx_mm_invsqrt_ps(rsq01);
286             rinv02           = gmx_mm_invsqrt_ps(rsq02);
287             rinv10           = gmx_mm_invsqrt_ps(rsq10);
288             rinv11           = gmx_mm_invsqrt_ps(rsq11);
289             rinv12           = gmx_mm_invsqrt_ps(rsq12);
290             rinv20           = gmx_mm_invsqrt_ps(rsq20);
291             rinv21           = gmx_mm_invsqrt_ps(rsq21);
292             rinv22           = gmx_mm_invsqrt_ps(rsq22);
293
294             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
295             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
296             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
297             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
298             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
299             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
300             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
301             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
302             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
303
304             fjx0             = _mm_setzero_ps();
305             fjy0             = _mm_setzero_ps();
306             fjz0             = _mm_setzero_ps();
307             fjx1             = _mm_setzero_ps();
308             fjy1             = _mm_setzero_ps();
309             fjz1             = _mm_setzero_ps();
310             fjx2             = _mm_setzero_ps();
311             fjy2             = _mm_setzero_ps();
312             fjz2             = _mm_setzero_ps();
313
314             /**************************
315              * CALCULATE INTERACTIONS *
316              **************************/
317
318             if (gmx_mm_any_lt(rsq00,rcutoff2))
319             {
320
321             r00              = _mm_mul_ps(rsq00,rinv00);
322
323             /* REACTION-FIELD ELECTROSTATICS */
324             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
325             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
326
327             /* LENNARD-JONES DISPERSION/REPULSION */
328
329             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
330             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
331             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
332             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
333             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
334
335             d                = _mm_sub_ps(r00,rswitch);
336             d                = _mm_max_ps(d,_mm_setzero_ps());
337             d2               = _mm_mul_ps(d,d);
338             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
339
340             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
341
342             /* Evaluate switch function */
343             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
344             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
345             vvdw             = _mm_mul_ps(vvdw,sw);
346             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
347
348             /* Update potential sum for this i atom from the interaction with this j atom. */
349             velec            = _mm_and_ps(velec,cutoff_mask);
350             velecsum         = _mm_add_ps(velecsum,velec);
351             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
352             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
353
354             fscal            = _mm_add_ps(felec,fvdw);
355
356             fscal            = _mm_and_ps(fscal,cutoff_mask);
357
358             /* Calculate temporary vectorial force */
359             tx               = _mm_mul_ps(fscal,dx00);
360             ty               = _mm_mul_ps(fscal,dy00);
361             tz               = _mm_mul_ps(fscal,dz00);
362
363             /* Update vectorial force */
364             fix0             = _mm_add_ps(fix0,tx);
365             fiy0             = _mm_add_ps(fiy0,ty);
366             fiz0             = _mm_add_ps(fiz0,tz);
367
368             fjx0             = _mm_add_ps(fjx0,tx);
369             fjy0             = _mm_add_ps(fjy0,ty);
370             fjz0             = _mm_add_ps(fjz0,tz);
371
372             }
373
374             /**************************
375              * CALCULATE INTERACTIONS *
376              **************************/
377
378             if (gmx_mm_any_lt(rsq01,rcutoff2))
379             {
380
381             /* REACTION-FIELD ELECTROSTATICS */
382             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
383             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
384
385             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
386
387             /* Update potential sum for this i atom from the interaction with this j atom. */
388             velec            = _mm_and_ps(velec,cutoff_mask);
389             velecsum         = _mm_add_ps(velecsum,velec);
390
391             fscal            = felec;
392
393             fscal            = _mm_and_ps(fscal,cutoff_mask);
394
395             /* Calculate temporary vectorial force */
396             tx               = _mm_mul_ps(fscal,dx01);
397             ty               = _mm_mul_ps(fscal,dy01);
398             tz               = _mm_mul_ps(fscal,dz01);
399
400             /* Update vectorial force */
401             fix0             = _mm_add_ps(fix0,tx);
402             fiy0             = _mm_add_ps(fiy0,ty);
403             fiz0             = _mm_add_ps(fiz0,tz);
404
405             fjx1             = _mm_add_ps(fjx1,tx);
406             fjy1             = _mm_add_ps(fjy1,ty);
407             fjz1             = _mm_add_ps(fjz1,tz);
408
409             }
410
411             /**************************
412              * CALCULATE INTERACTIONS *
413              **************************/
414
415             if (gmx_mm_any_lt(rsq02,rcutoff2))
416             {
417
418             /* REACTION-FIELD ELECTROSTATICS */
419             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
420             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
421
422             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
423
424             /* Update potential sum for this i atom from the interaction with this j atom. */
425             velec            = _mm_and_ps(velec,cutoff_mask);
426             velecsum         = _mm_add_ps(velecsum,velec);
427
428             fscal            = felec;
429
430             fscal            = _mm_and_ps(fscal,cutoff_mask);
431
432             /* Calculate temporary vectorial force */
433             tx               = _mm_mul_ps(fscal,dx02);
434             ty               = _mm_mul_ps(fscal,dy02);
435             tz               = _mm_mul_ps(fscal,dz02);
436
437             /* Update vectorial force */
438             fix0             = _mm_add_ps(fix0,tx);
439             fiy0             = _mm_add_ps(fiy0,ty);
440             fiz0             = _mm_add_ps(fiz0,tz);
441
442             fjx2             = _mm_add_ps(fjx2,tx);
443             fjy2             = _mm_add_ps(fjy2,ty);
444             fjz2             = _mm_add_ps(fjz2,tz);
445
446             }
447
448             /**************************
449              * CALCULATE INTERACTIONS *
450              **************************/
451
452             if (gmx_mm_any_lt(rsq10,rcutoff2))
453             {
454
455             /* REACTION-FIELD ELECTROSTATICS */
456             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
457             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
458
459             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velec            = _mm_and_ps(velec,cutoff_mask);
463             velecsum         = _mm_add_ps(velecsum,velec);
464
465             fscal            = felec;
466
467             fscal            = _mm_and_ps(fscal,cutoff_mask);
468
469             /* Calculate temporary vectorial force */
470             tx               = _mm_mul_ps(fscal,dx10);
471             ty               = _mm_mul_ps(fscal,dy10);
472             tz               = _mm_mul_ps(fscal,dz10);
473
474             /* Update vectorial force */
475             fix1             = _mm_add_ps(fix1,tx);
476             fiy1             = _mm_add_ps(fiy1,ty);
477             fiz1             = _mm_add_ps(fiz1,tz);
478
479             fjx0             = _mm_add_ps(fjx0,tx);
480             fjy0             = _mm_add_ps(fjy0,ty);
481             fjz0             = _mm_add_ps(fjz0,tz);
482
483             }
484
485             /**************************
486              * CALCULATE INTERACTIONS *
487              **************************/
488
489             if (gmx_mm_any_lt(rsq11,rcutoff2))
490             {
491
492             /* REACTION-FIELD ELECTROSTATICS */
493             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
494             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
495
496             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
497
498             /* Update potential sum for this i atom from the interaction with this j atom. */
499             velec            = _mm_and_ps(velec,cutoff_mask);
500             velecsum         = _mm_add_ps(velecsum,velec);
501
502             fscal            = felec;
503
504             fscal            = _mm_and_ps(fscal,cutoff_mask);
505
506             /* Calculate temporary vectorial force */
507             tx               = _mm_mul_ps(fscal,dx11);
508             ty               = _mm_mul_ps(fscal,dy11);
509             tz               = _mm_mul_ps(fscal,dz11);
510
511             /* Update vectorial force */
512             fix1             = _mm_add_ps(fix1,tx);
513             fiy1             = _mm_add_ps(fiy1,ty);
514             fiz1             = _mm_add_ps(fiz1,tz);
515
516             fjx1             = _mm_add_ps(fjx1,tx);
517             fjy1             = _mm_add_ps(fjy1,ty);
518             fjz1             = _mm_add_ps(fjz1,tz);
519
520             }
521
522             /**************************
523              * CALCULATE INTERACTIONS *
524              **************************/
525
526             if (gmx_mm_any_lt(rsq12,rcutoff2))
527             {
528
529             /* REACTION-FIELD ELECTROSTATICS */
530             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
531             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
532
533             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
534
535             /* Update potential sum for this i atom from the interaction with this j atom. */
536             velec            = _mm_and_ps(velec,cutoff_mask);
537             velecsum         = _mm_add_ps(velecsum,velec);
538
539             fscal            = felec;
540
541             fscal            = _mm_and_ps(fscal,cutoff_mask);
542
543             /* Calculate temporary vectorial force */
544             tx               = _mm_mul_ps(fscal,dx12);
545             ty               = _mm_mul_ps(fscal,dy12);
546             tz               = _mm_mul_ps(fscal,dz12);
547
548             /* Update vectorial force */
549             fix1             = _mm_add_ps(fix1,tx);
550             fiy1             = _mm_add_ps(fiy1,ty);
551             fiz1             = _mm_add_ps(fiz1,tz);
552
553             fjx2             = _mm_add_ps(fjx2,tx);
554             fjy2             = _mm_add_ps(fjy2,ty);
555             fjz2             = _mm_add_ps(fjz2,tz);
556
557             }
558
559             /**************************
560              * CALCULATE INTERACTIONS *
561              **************************/
562
563             if (gmx_mm_any_lt(rsq20,rcutoff2))
564             {
565
566             /* REACTION-FIELD ELECTROSTATICS */
567             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
568             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
569
570             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
571
572             /* Update potential sum for this i atom from the interaction with this j atom. */
573             velec            = _mm_and_ps(velec,cutoff_mask);
574             velecsum         = _mm_add_ps(velecsum,velec);
575
576             fscal            = felec;
577
578             fscal            = _mm_and_ps(fscal,cutoff_mask);
579
580             /* Calculate temporary vectorial force */
581             tx               = _mm_mul_ps(fscal,dx20);
582             ty               = _mm_mul_ps(fscal,dy20);
583             tz               = _mm_mul_ps(fscal,dz20);
584
585             /* Update vectorial force */
586             fix2             = _mm_add_ps(fix2,tx);
587             fiy2             = _mm_add_ps(fiy2,ty);
588             fiz2             = _mm_add_ps(fiz2,tz);
589
590             fjx0             = _mm_add_ps(fjx0,tx);
591             fjy0             = _mm_add_ps(fjy0,ty);
592             fjz0             = _mm_add_ps(fjz0,tz);
593
594             }
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             if (gmx_mm_any_lt(rsq21,rcutoff2))
601             {
602
603             /* REACTION-FIELD ELECTROSTATICS */
604             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
605             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
606
607             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
608
609             /* Update potential sum for this i atom from the interaction with this j atom. */
610             velec            = _mm_and_ps(velec,cutoff_mask);
611             velecsum         = _mm_add_ps(velecsum,velec);
612
613             fscal            = felec;
614
615             fscal            = _mm_and_ps(fscal,cutoff_mask);
616
617             /* Calculate temporary vectorial force */
618             tx               = _mm_mul_ps(fscal,dx21);
619             ty               = _mm_mul_ps(fscal,dy21);
620             tz               = _mm_mul_ps(fscal,dz21);
621
622             /* Update vectorial force */
623             fix2             = _mm_add_ps(fix2,tx);
624             fiy2             = _mm_add_ps(fiy2,ty);
625             fiz2             = _mm_add_ps(fiz2,tz);
626
627             fjx1             = _mm_add_ps(fjx1,tx);
628             fjy1             = _mm_add_ps(fjy1,ty);
629             fjz1             = _mm_add_ps(fjz1,tz);
630
631             }
632
633             /**************************
634              * CALCULATE INTERACTIONS *
635              **************************/
636
637             if (gmx_mm_any_lt(rsq22,rcutoff2))
638             {
639
640             /* REACTION-FIELD ELECTROSTATICS */
641             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
642             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
643
644             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
645
646             /* Update potential sum for this i atom from the interaction with this j atom. */
647             velec            = _mm_and_ps(velec,cutoff_mask);
648             velecsum         = _mm_add_ps(velecsum,velec);
649
650             fscal            = felec;
651
652             fscal            = _mm_and_ps(fscal,cutoff_mask);
653
654             /* Calculate temporary vectorial force */
655             tx               = _mm_mul_ps(fscal,dx22);
656             ty               = _mm_mul_ps(fscal,dy22);
657             tz               = _mm_mul_ps(fscal,dz22);
658
659             /* Update vectorial force */
660             fix2             = _mm_add_ps(fix2,tx);
661             fiy2             = _mm_add_ps(fiy2,ty);
662             fiz2             = _mm_add_ps(fiz2,tz);
663
664             fjx2             = _mm_add_ps(fjx2,tx);
665             fjy2             = _mm_add_ps(fjy2,ty);
666             fjz2             = _mm_add_ps(fjz2,tz);
667
668             }
669
670             fjptrA             = f+j_coord_offsetA;
671             fjptrB             = f+j_coord_offsetB;
672             fjptrC             = f+j_coord_offsetC;
673             fjptrD             = f+j_coord_offsetD;
674
675             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
676                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
677
678             /* Inner loop uses 358 flops */
679         }
680
681         if(jidx<j_index_end)
682         {
683
684             /* Get j neighbor index, and coordinate index */
685             jnrlistA         = jjnr[jidx];
686             jnrlistB         = jjnr[jidx+1];
687             jnrlistC         = jjnr[jidx+2];
688             jnrlistD         = jjnr[jidx+3];
689             /* Sign of each element will be negative for non-real atoms.
690              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
691              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
692              */
693             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
694             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
695             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
696             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
697             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
698             j_coord_offsetA  = DIM*jnrA;
699             j_coord_offsetB  = DIM*jnrB;
700             j_coord_offsetC  = DIM*jnrC;
701             j_coord_offsetD  = DIM*jnrD;
702
703             /* load j atom coordinates */
704             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
705                                               x+j_coord_offsetC,x+j_coord_offsetD,
706                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
707
708             /* Calculate displacement vector */
709             dx00             = _mm_sub_ps(ix0,jx0);
710             dy00             = _mm_sub_ps(iy0,jy0);
711             dz00             = _mm_sub_ps(iz0,jz0);
712             dx01             = _mm_sub_ps(ix0,jx1);
713             dy01             = _mm_sub_ps(iy0,jy1);
714             dz01             = _mm_sub_ps(iz0,jz1);
715             dx02             = _mm_sub_ps(ix0,jx2);
716             dy02             = _mm_sub_ps(iy0,jy2);
717             dz02             = _mm_sub_ps(iz0,jz2);
718             dx10             = _mm_sub_ps(ix1,jx0);
719             dy10             = _mm_sub_ps(iy1,jy0);
720             dz10             = _mm_sub_ps(iz1,jz0);
721             dx11             = _mm_sub_ps(ix1,jx1);
722             dy11             = _mm_sub_ps(iy1,jy1);
723             dz11             = _mm_sub_ps(iz1,jz1);
724             dx12             = _mm_sub_ps(ix1,jx2);
725             dy12             = _mm_sub_ps(iy1,jy2);
726             dz12             = _mm_sub_ps(iz1,jz2);
727             dx20             = _mm_sub_ps(ix2,jx0);
728             dy20             = _mm_sub_ps(iy2,jy0);
729             dz20             = _mm_sub_ps(iz2,jz0);
730             dx21             = _mm_sub_ps(ix2,jx1);
731             dy21             = _mm_sub_ps(iy2,jy1);
732             dz21             = _mm_sub_ps(iz2,jz1);
733             dx22             = _mm_sub_ps(ix2,jx2);
734             dy22             = _mm_sub_ps(iy2,jy2);
735             dz22             = _mm_sub_ps(iz2,jz2);
736
737             /* Calculate squared distance and things based on it */
738             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
739             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
740             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
741             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
742             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
743             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
744             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
745             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
746             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
747
748             rinv00           = gmx_mm_invsqrt_ps(rsq00);
749             rinv01           = gmx_mm_invsqrt_ps(rsq01);
750             rinv02           = gmx_mm_invsqrt_ps(rsq02);
751             rinv10           = gmx_mm_invsqrt_ps(rsq10);
752             rinv11           = gmx_mm_invsqrt_ps(rsq11);
753             rinv12           = gmx_mm_invsqrt_ps(rsq12);
754             rinv20           = gmx_mm_invsqrt_ps(rsq20);
755             rinv21           = gmx_mm_invsqrt_ps(rsq21);
756             rinv22           = gmx_mm_invsqrt_ps(rsq22);
757
758             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
759             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
760             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
761             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
762             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
763             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
764             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
765             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
766             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
767
768             fjx0             = _mm_setzero_ps();
769             fjy0             = _mm_setzero_ps();
770             fjz0             = _mm_setzero_ps();
771             fjx1             = _mm_setzero_ps();
772             fjy1             = _mm_setzero_ps();
773             fjz1             = _mm_setzero_ps();
774             fjx2             = _mm_setzero_ps();
775             fjy2             = _mm_setzero_ps();
776             fjz2             = _mm_setzero_ps();
777
778             /**************************
779              * CALCULATE INTERACTIONS *
780              **************************/
781
782             if (gmx_mm_any_lt(rsq00,rcutoff2))
783             {
784
785             r00              = _mm_mul_ps(rsq00,rinv00);
786             r00              = _mm_andnot_ps(dummy_mask,r00);
787
788             /* REACTION-FIELD ELECTROSTATICS */
789             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
790             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
791
792             /* LENNARD-JONES DISPERSION/REPULSION */
793
794             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
795             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
796             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
797             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
798             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
799
800             d                = _mm_sub_ps(r00,rswitch);
801             d                = _mm_max_ps(d,_mm_setzero_ps());
802             d2               = _mm_mul_ps(d,d);
803             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
804
805             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
806
807             /* Evaluate switch function */
808             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
809             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
810             vvdw             = _mm_mul_ps(vvdw,sw);
811             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
812
813             /* Update potential sum for this i atom from the interaction with this j atom. */
814             velec            = _mm_and_ps(velec,cutoff_mask);
815             velec            = _mm_andnot_ps(dummy_mask,velec);
816             velecsum         = _mm_add_ps(velecsum,velec);
817             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
818             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
819             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
820
821             fscal            = _mm_add_ps(felec,fvdw);
822
823             fscal            = _mm_and_ps(fscal,cutoff_mask);
824
825             fscal            = _mm_andnot_ps(dummy_mask,fscal);
826
827             /* Calculate temporary vectorial force */
828             tx               = _mm_mul_ps(fscal,dx00);
829             ty               = _mm_mul_ps(fscal,dy00);
830             tz               = _mm_mul_ps(fscal,dz00);
831
832             /* Update vectorial force */
833             fix0             = _mm_add_ps(fix0,tx);
834             fiy0             = _mm_add_ps(fiy0,ty);
835             fiz0             = _mm_add_ps(fiz0,tz);
836
837             fjx0             = _mm_add_ps(fjx0,tx);
838             fjy0             = _mm_add_ps(fjy0,ty);
839             fjz0             = _mm_add_ps(fjz0,tz);
840
841             }
842
843             /**************************
844              * CALCULATE INTERACTIONS *
845              **************************/
846
847             if (gmx_mm_any_lt(rsq01,rcutoff2))
848             {
849
850             /* REACTION-FIELD ELECTROSTATICS */
851             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
852             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
853
854             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
855
856             /* Update potential sum for this i atom from the interaction with this j atom. */
857             velec            = _mm_and_ps(velec,cutoff_mask);
858             velec            = _mm_andnot_ps(dummy_mask,velec);
859             velecsum         = _mm_add_ps(velecsum,velec);
860
861             fscal            = felec;
862
863             fscal            = _mm_and_ps(fscal,cutoff_mask);
864
865             fscal            = _mm_andnot_ps(dummy_mask,fscal);
866
867             /* Calculate temporary vectorial force */
868             tx               = _mm_mul_ps(fscal,dx01);
869             ty               = _mm_mul_ps(fscal,dy01);
870             tz               = _mm_mul_ps(fscal,dz01);
871
872             /* Update vectorial force */
873             fix0             = _mm_add_ps(fix0,tx);
874             fiy0             = _mm_add_ps(fiy0,ty);
875             fiz0             = _mm_add_ps(fiz0,tz);
876
877             fjx1             = _mm_add_ps(fjx1,tx);
878             fjy1             = _mm_add_ps(fjy1,ty);
879             fjz1             = _mm_add_ps(fjz1,tz);
880
881             }
882
883             /**************************
884              * CALCULATE INTERACTIONS *
885              **************************/
886
887             if (gmx_mm_any_lt(rsq02,rcutoff2))
888             {
889
890             /* REACTION-FIELD ELECTROSTATICS */
891             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
892             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
893
894             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
895
896             /* Update potential sum for this i atom from the interaction with this j atom. */
897             velec            = _mm_and_ps(velec,cutoff_mask);
898             velec            = _mm_andnot_ps(dummy_mask,velec);
899             velecsum         = _mm_add_ps(velecsum,velec);
900
901             fscal            = felec;
902
903             fscal            = _mm_and_ps(fscal,cutoff_mask);
904
905             fscal            = _mm_andnot_ps(dummy_mask,fscal);
906
907             /* Calculate temporary vectorial force */
908             tx               = _mm_mul_ps(fscal,dx02);
909             ty               = _mm_mul_ps(fscal,dy02);
910             tz               = _mm_mul_ps(fscal,dz02);
911
912             /* Update vectorial force */
913             fix0             = _mm_add_ps(fix0,tx);
914             fiy0             = _mm_add_ps(fiy0,ty);
915             fiz0             = _mm_add_ps(fiz0,tz);
916
917             fjx2             = _mm_add_ps(fjx2,tx);
918             fjy2             = _mm_add_ps(fjy2,ty);
919             fjz2             = _mm_add_ps(fjz2,tz);
920
921             }
922
923             /**************************
924              * CALCULATE INTERACTIONS *
925              **************************/
926
927             if (gmx_mm_any_lt(rsq10,rcutoff2))
928             {
929
930             /* REACTION-FIELD ELECTROSTATICS */
931             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
932             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
933
934             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
935
936             /* Update potential sum for this i atom from the interaction with this j atom. */
937             velec            = _mm_and_ps(velec,cutoff_mask);
938             velec            = _mm_andnot_ps(dummy_mask,velec);
939             velecsum         = _mm_add_ps(velecsum,velec);
940
941             fscal            = felec;
942
943             fscal            = _mm_and_ps(fscal,cutoff_mask);
944
945             fscal            = _mm_andnot_ps(dummy_mask,fscal);
946
947             /* Calculate temporary vectorial force */
948             tx               = _mm_mul_ps(fscal,dx10);
949             ty               = _mm_mul_ps(fscal,dy10);
950             tz               = _mm_mul_ps(fscal,dz10);
951
952             /* Update vectorial force */
953             fix1             = _mm_add_ps(fix1,tx);
954             fiy1             = _mm_add_ps(fiy1,ty);
955             fiz1             = _mm_add_ps(fiz1,tz);
956
957             fjx0             = _mm_add_ps(fjx0,tx);
958             fjy0             = _mm_add_ps(fjy0,ty);
959             fjz0             = _mm_add_ps(fjz0,tz);
960
961             }
962
963             /**************************
964              * CALCULATE INTERACTIONS *
965              **************************/
966
967             if (gmx_mm_any_lt(rsq11,rcutoff2))
968             {
969
970             /* REACTION-FIELD ELECTROSTATICS */
971             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
972             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
973
974             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
975
976             /* Update potential sum for this i atom from the interaction with this j atom. */
977             velec            = _mm_and_ps(velec,cutoff_mask);
978             velec            = _mm_andnot_ps(dummy_mask,velec);
979             velecsum         = _mm_add_ps(velecsum,velec);
980
981             fscal            = felec;
982
983             fscal            = _mm_and_ps(fscal,cutoff_mask);
984
985             fscal            = _mm_andnot_ps(dummy_mask,fscal);
986
987             /* Calculate temporary vectorial force */
988             tx               = _mm_mul_ps(fscal,dx11);
989             ty               = _mm_mul_ps(fscal,dy11);
990             tz               = _mm_mul_ps(fscal,dz11);
991
992             /* Update vectorial force */
993             fix1             = _mm_add_ps(fix1,tx);
994             fiy1             = _mm_add_ps(fiy1,ty);
995             fiz1             = _mm_add_ps(fiz1,tz);
996
997             fjx1             = _mm_add_ps(fjx1,tx);
998             fjy1             = _mm_add_ps(fjy1,ty);
999             fjz1             = _mm_add_ps(fjz1,tz);
1000
1001             }
1002
1003             /**************************
1004              * CALCULATE INTERACTIONS *
1005              **************************/
1006
1007             if (gmx_mm_any_lt(rsq12,rcutoff2))
1008             {
1009
1010             /* REACTION-FIELD ELECTROSTATICS */
1011             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
1012             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1013
1014             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1015
1016             /* Update potential sum for this i atom from the interaction with this j atom. */
1017             velec            = _mm_and_ps(velec,cutoff_mask);
1018             velec            = _mm_andnot_ps(dummy_mask,velec);
1019             velecsum         = _mm_add_ps(velecsum,velec);
1020
1021             fscal            = felec;
1022
1023             fscal            = _mm_and_ps(fscal,cutoff_mask);
1024
1025             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1026
1027             /* Calculate temporary vectorial force */
1028             tx               = _mm_mul_ps(fscal,dx12);
1029             ty               = _mm_mul_ps(fscal,dy12);
1030             tz               = _mm_mul_ps(fscal,dz12);
1031
1032             /* Update vectorial force */
1033             fix1             = _mm_add_ps(fix1,tx);
1034             fiy1             = _mm_add_ps(fiy1,ty);
1035             fiz1             = _mm_add_ps(fiz1,tz);
1036
1037             fjx2             = _mm_add_ps(fjx2,tx);
1038             fjy2             = _mm_add_ps(fjy2,ty);
1039             fjz2             = _mm_add_ps(fjz2,tz);
1040
1041             }
1042
1043             /**************************
1044              * CALCULATE INTERACTIONS *
1045              **************************/
1046
1047             if (gmx_mm_any_lt(rsq20,rcutoff2))
1048             {
1049
1050             /* REACTION-FIELD ELECTROSTATICS */
1051             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1052             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1053
1054             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1055
1056             /* Update potential sum for this i atom from the interaction with this j atom. */
1057             velec            = _mm_and_ps(velec,cutoff_mask);
1058             velec            = _mm_andnot_ps(dummy_mask,velec);
1059             velecsum         = _mm_add_ps(velecsum,velec);
1060
1061             fscal            = felec;
1062
1063             fscal            = _mm_and_ps(fscal,cutoff_mask);
1064
1065             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1066
1067             /* Calculate temporary vectorial force */
1068             tx               = _mm_mul_ps(fscal,dx20);
1069             ty               = _mm_mul_ps(fscal,dy20);
1070             tz               = _mm_mul_ps(fscal,dz20);
1071
1072             /* Update vectorial force */
1073             fix2             = _mm_add_ps(fix2,tx);
1074             fiy2             = _mm_add_ps(fiy2,ty);
1075             fiz2             = _mm_add_ps(fiz2,tz);
1076
1077             fjx0             = _mm_add_ps(fjx0,tx);
1078             fjy0             = _mm_add_ps(fjy0,ty);
1079             fjz0             = _mm_add_ps(fjz0,tz);
1080
1081             }
1082
1083             /**************************
1084              * CALCULATE INTERACTIONS *
1085              **************************/
1086
1087             if (gmx_mm_any_lt(rsq21,rcutoff2))
1088             {
1089
1090             /* REACTION-FIELD ELECTROSTATICS */
1091             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1092             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1093
1094             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1095
1096             /* Update potential sum for this i atom from the interaction with this j atom. */
1097             velec            = _mm_and_ps(velec,cutoff_mask);
1098             velec            = _mm_andnot_ps(dummy_mask,velec);
1099             velecsum         = _mm_add_ps(velecsum,velec);
1100
1101             fscal            = felec;
1102
1103             fscal            = _mm_and_ps(fscal,cutoff_mask);
1104
1105             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1106
1107             /* Calculate temporary vectorial force */
1108             tx               = _mm_mul_ps(fscal,dx21);
1109             ty               = _mm_mul_ps(fscal,dy21);
1110             tz               = _mm_mul_ps(fscal,dz21);
1111
1112             /* Update vectorial force */
1113             fix2             = _mm_add_ps(fix2,tx);
1114             fiy2             = _mm_add_ps(fiy2,ty);
1115             fiz2             = _mm_add_ps(fiz2,tz);
1116
1117             fjx1             = _mm_add_ps(fjx1,tx);
1118             fjy1             = _mm_add_ps(fjy1,ty);
1119             fjz1             = _mm_add_ps(fjz1,tz);
1120
1121             }
1122
1123             /**************************
1124              * CALCULATE INTERACTIONS *
1125              **************************/
1126
1127             if (gmx_mm_any_lt(rsq22,rcutoff2))
1128             {
1129
1130             /* REACTION-FIELD ELECTROSTATICS */
1131             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1132             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1133
1134             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1135
1136             /* Update potential sum for this i atom from the interaction with this j atom. */
1137             velec            = _mm_and_ps(velec,cutoff_mask);
1138             velec            = _mm_andnot_ps(dummy_mask,velec);
1139             velecsum         = _mm_add_ps(velecsum,velec);
1140
1141             fscal            = felec;
1142
1143             fscal            = _mm_and_ps(fscal,cutoff_mask);
1144
1145             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1146
1147             /* Calculate temporary vectorial force */
1148             tx               = _mm_mul_ps(fscal,dx22);
1149             ty               = _mm_mul_ps(fscal,dy22);
1150             tz               = _mm_mul_ps(fscal,dz22);
1151
1152             /* Update vectorial force */
1153             fix2             = _mm_add_ps(fix2,tx);
1154             fiy2             = _mm_add_ps(fiy2,ty);
1155             fiz2             = _mm_add_ps(fiz2,tz);
1156
1157             fjx2             = _mm_add_ps(fjx2,tx);
1158             fjy2             = _mm_add_ps(fjy2,ty);
1159             fjz2             = _mm_add_ps(fjz2,tz);
1160
1161             }
1162
1163             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1164             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1165             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1166             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1167
1168             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1169                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1170
1171             /* Inner loop uses 359 flops */
1172         }
1173
1174         /* End of innermost loop */
1175
1176         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1177                                               f+i_coord_offset,fshift+i_shift_offset);
1178
1179         ggid                        = gid[iidx];
1180         /* Update potential energies */
1181         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1182         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1183
1184         /* Increment number of inner iterations */
1185         inneriter                  += j_index_end - j_index_start;
1186
1187         /* Outer loop uses 20 flops */
1188     }
1189
1190     /* Increment number of outer iterations */
1191     outeriter        += nri;
1192
1193     /* Update outer/inner flops */
1194
1195     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1196 }
1197 /*
1198  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single
1199  * Electrostatics interaction: ReactionField
1200  * VdW interaction:            LennardJones
1201  * Geometry:                   Water3-Water3
1202  * Calculate force/pot:        Force
1203  */
1204 void
1205 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single
1206                     (t_nblist                    * gmx_restrict       nlist,
1207                      rvec                        * gmx_restrict          xx,
1208                      rvec                        * gmx_restrict          ff,
1209                      t_forcerec                  * gmx_restrict          fr,
1210                      t_mdatoms                   * gmx_restrict     mdatoms,
1211                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1212                      t_nrnb                      * gmx_restrict        nrnb)
1213 {
1214     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1215      * just 0 for non-waters.
1216      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1217      * jnr indices corresponding to data put in the four positions in the SIMD register.
1218      */
1219     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1220     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1221     int              jnrA,jnrB,jnrC,jnrD;
1222     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1223     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1224     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1225     real             rcutoff_scalar;
1226     real             *shiftvec,*fshift,*x,*f;
1227     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1228     real             scratch[4*DIM];
1229     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1230     int              vdwioffset0;
1231     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1232     int              vdwioffset1;
1233     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1234     int              vdwioffset2;
1235     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1236     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1237     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1238     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1239     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1240     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1241     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1242     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1243     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1244     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1245     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1246     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1247     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1248     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1249     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1250     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1251     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1252     real             *charge;
1253     int              nvdwtype;
1254     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1255     int              *vdwtype;
1256     real             *vdwparam;
1257     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1258     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1259     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1260     real             rswitch_scalar,d_scalar;
1261     __m128           dummy_mask,cutoff_mask;
1262     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1263     __m128           one     = _mm_set1_ps(1.0);
1264     __m128           two     = _mm_set1_ps(2.0);
1265     x                = xx[0];
1266     f                = ff[0];
1267
1268     nri              = nlist->nri;
1269     iinr             = nlist->iinr;
1270     jindex           = nlist->jindex;
1271     jjnr             = nlist->jjnr;
1272     shiftidx         = nlist->shift;
1273     gid              = nlist->gid;
1274     shiftvec         = fr->shift_vec[0];
1275     fshift           = fr->fshift[0];
1276     facel            = _mm_set1_ps(fr->epsfac);
1277     charge           = mdatoms->chargeA;
1278     krf              = _mm_set1_ps(fr->ic->k_rf);
1279     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1280     crf              = _mm_set1_ps(fr->ic->c_rf);
1281     nvdwtype         = fr->ntype;
1282     vdwparam         = fr->nbfp;
1283     vdwtype          = mdatoms->typeA;
1284
1285     /* Setup water-specific parameters */
1286     inr              = nlist->iinr[0];
1287     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1288     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1289     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1290     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1291
1292     jq0              = _mm_set1_ps(charge[inr+0]);
1293     jq1              = _mm_set1_ps(charge[inr+1]);
1294     jq2              = _mm_set1_ps(charge[inr+2]);
1295     vdwjidx0A        = 2*vdwtype[inr+0];
1296     qq00             = _mm_mul_ps(iq0,jq0);
1297     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1298     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1299     qq01             = _mm_mul_ps(iq0,jq1);
1300     qq02             = _mm_mul_ps(iq0,jq2);
1301     qq10             = _mm_mul_ps(iq1,jq0);
1302     qq11             = _mm_mul_ps(iq1,jq1);
1303     qq12             = _mm_mul_ps(iq1,jq2);
1304     qq20             = _mm_mul_ps(iq2,jq0);
1305     qq21             = _mm_mul_ps(iq2,jq1);
1306     qq22             = _mm_mul_ps(iq2,jq2);
1307
1308     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1309     rcutoff_scalar   = fr->rcoulomb;
1310     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1311     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1312
1313     rswitch_scalar   = fr->rvdw_switch;
1314     rswitch          = _mm_set1_ps(rswitch_scalar);
1315     /* Setup switch parameters */
1316     d_scalar         = rcutoff_scalar-rswitch_scalar;
1317     d                = _mm_set1_ps(d_scalar);
1318     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1319     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1320     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1321     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1322     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1323     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1324
1325     /* Avoid stupid compiler warnings */
1326     jnrA = jnrB = jnrC = jnrD = 0;
1327     j_coord_offsetA = 0;
1328     j_coord_offsetB = 0;
1329     j_coord_offsetC = 0;
1330     j_coord_offsetD = 0;
1331
1332     outeriter        = 0;
1333     inneriter        = 0;
1334
1335     for(iidx=0;iidx<4*DIM;iidx++)
1336     {
1337         scratch[iidx] = 0.0;
1338     }
1339
1340     /* Start outer loop over neighborlists */
1341     for(iidx=0; iidx<nri; iidx++)
1342     {
1343         /* Load shift vector for this list */
1344         i_shift_offset   = DIM*shiftidx[iidx];
1345
1346         /* Load limits for loop over neighbors */
1347         j_index_start    = jindex[iidx];
1348         j_index_end      = jindex[iidx+1];
1349
1350         /* Get outer coordinate index */
1351         inr              = iinr[iidx];
1352         i_coord_offset   = DIM*inr;
1353
1354         /* Load i particle coords and add shift vector */
1355         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1356                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1357
1358         fix0             = _mm_setzero_ps();
1359         fiy0             = _mm_setzero_ps();
1360         fiz0             = _mm_setzero_ps();
1361         fix1             = _mm_setzero_ps();
1362         fiy1             = _mm_setzero_ps();
1363         fiz1             = _mm_setzero_ps();
1364         fix2             = _mm_setzero_ps();
1365         fiy2             = _mm_setzero_ps();
1366         fiz2             = _mm_setzero_ps();
1367
1368         /* Start inner kernel loop */
1369         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1370         {
1371
1372             /* Get j neighbor index, and coordinate index */
1373             jnrA             = jjnr[jidx];
1374             jnrB             = jjnr[jidx+1];
1375             jnrC             = jjnr[jidx+2];
1376             jnrD             = jjnr[jidx+3];
1377             j_coord_offsetA  = DIM*jnrA;
1378             j_coord_offsetB  = DIM*jnrB;
1379             j_coord_offsetC  = DIM*jnrC;
1380             j_coord_offsetD  = DIM*jnrD;
1381
1382             /* load j atom coordinates */
1383             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1384                                               x+j_coord_offsetC,x+j_coord_offsetD,
1385                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1386
1387             /* Calculate displacement vector */
1388             dx00             = _mm_sub_ps(ix0,jx0);
1389             dy00             = _mm_sub_ps(iy0,jy0);
1390             dz00             = _mm_sub_ps(iz0,jz0);
1391             dx01             = _mm_sub_ps(ix0,jx1);
1392             dy01             = _mm_sub_ps(iy0,jy1);
1393             dz01             = _mm_sub_ps(iz0,jz1);
1394             dx02             = _mm_sub_ps(ix0,jx2);
1395             dy02             = _mm_sub_ps(iy0,jy2);
1396             dz02             = _mm_sub_ps(iz0,jz2);
1397             dx10             = _mm_sub_ps(ix1,jx0);
1398             dy10             = _mm_sub_ps(iy1,jy0);
1399             dz10             = _mm_sub_ps(iz1,jz0);
1400             dx11             = _mm_sub_ps(ix1,jx1);
1401             dy11             = _mm_sub_ps(iy1,jy1);
1402             dz11             = _mm_sub_ps(iz1,jz1);
1403             dx12             = _mm_sub_ps(ix1,jx2);
1404             dy12             = _mm_sub_ps(iy1,jy2);
1405             dz12             = _mm_sub_ps(iz1,jz2);
1406             dx20             = _mm_sub_ps(ix2,jx0);
1407             dy20             = _mm_sub_ps(iy2,jy0);
1408             dz20             = _mm_sub_ps(iz2,jz0);
1409             dx21             = _mm_sub_ps(ix2,jx1);
1410             dy21             = _mm_sub_ps(iy2,jy1);
1411             dz21             = _mm_sub_ps(iz2,jz1);
1412             dx22             = _mm_sub_ps(ix2,jx2);
1413             dy22             = _mm_sub_ps(iy2,jy2);
1414             dz22             = _mm_sub_ps(iz2,jz2);
1415
1416             /* Calculate squared distance and things based on it */
1417             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1418             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1419             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1420             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1421             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1422             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1423             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1424             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1425             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1426
1427             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1428             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1429             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1430             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1431             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1432             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1433             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1434             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1435             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1436
1437             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1438             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1439             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1440             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1441             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1442             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1443             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1444             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1445             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1446
1447             fjx0             = _mm_setzero_ps();
1448             fjy0             = _mm_setzero_ps();
1449             fjz0             = _mm_setzero_ps();
1450             fjx1             = _mm_setzero_ps();
1451             fjy1             = _mm_setzero_ps();
1452             fjz1             = _mm_setzero_ps();
1453             fjx2             = _mm_setzero_ps();
1454             fjy2             = _mm_setzero_ps();
1455             fjz2             = _mm_setzero_ps();
1456
1457             /**************************
1458              * CALCULATE INTERACTIONS *
1459              **************************/
1460
1461             if (gmx_mm_any_lt(rsq00,rcutoff2))
1462             {
1463
1464             r00              = _mm_mul_ps(rsq00,rinv00);
1465
1466             /* REACTION-FIELD ELECTROSTATICS */
1467             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1468
1469             /* LENNARD-JONES DISPERSION/REPULSION */
1470
1471             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1472             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1473             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1474             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1475             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1476
1477             d                = _mm_sub_ps(r00,rswitch);
1478             d                = _mm_max_ps(d,_mm_setzero_ps());
1479             d2               = _mm_mul_ps(d,d);
1480             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1481
1482             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1483
1484             /* Evaluate switch function */
1485             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1486             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1487             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1488
1489             fscal            = _mm_add_ps(felec,fvdw);
1490
1491             fscal            = _mm_and_ps(fscal,cutoff_mask);
1492
1493             /* Calculate temporary vectorial force */
1494             tx               = _mm_mul_ps(fscal,dx00);
1495             ty               = _mm_mul_ps(fscal,dy00);
1496             tz               = _mm_mul_ps(fscal,dz00);
1497
1498             /* Update vectorial force */
1499             fix0             = _mm_add_ps(fix0,tx);
1500             fiy0             = _mm_add_ps(fiy0,ty);
1501             fiz0             = _mm_add_ps(fiz0,tz);
1502
1503             fjx0             = _mm_add_ps(fjx0,tx);
1504             fjy0             = _mm_add_ps(fjy0,ty);
1505             fjz0             = _mm_add_ps(fjz0,tz);
1506
1507             }
1508
1509             /**************************
1510              * CALCULATE INTERACTIONS *
1511              **************************/
1512
1513             if (gmx_mm_any_lt(rsq01,rcutoff2))
1514             {
1515
1516             /* REACTION-FIELD ELECTROSTATICS */
1517             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1518
1519             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1520
1521             fscal            = felec;
1522
1523             fscal            = _mm_and_ps(fscal,cutoff_mask);
1524
1525             /* Calculate temporary vectorial force */
1526             tx               = _mm_mul_ps(fscal,dx01);
1527             ty               = _mm_mul_ps(fscal,dy01);
1528             tz               = _mm_mul_ps(fscal,dz01);
1529
1530             /* Update vectorial force */
1531             fix0             = _mm_add_ps(fix0,tx);
1532             fiy0             = _mm_add_ps(fiy0,ty);
1533             fiz0             = _mm_add_ps(fiz0,tz);
1534
1535             fjx1             = _mm_add_ps(fjx1,tx);
1536             fjy1             = _mm_add_ps(fjy1,ty);
1537             fjz1             = _mm_add_ps(fjz1,tz);
1538
1539             }
1540
1541             /**************************
1542              * CALCULATE INTERACTIONS *
1543              **************************/
1544
1545             if (gmx_mm_any_lt(rsq02,rcutoff2))
1546             {
1547
1548             /* REACTION-FIELD ELECTROSTATICS */
1549             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1550
1551             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1552
1553             fscal            = felec;
1554
1555             fscal            = _mm_and_ps(fscal,cutoff_mask);
1556
1557             /* Calculate temporary vectorial force */
1558             tx               = _mm_mul_ps(fscal,dx02);
1559             ty               = _mm_mul_ps(fscal,dy02);
1560             tz               = _mm_mul_ps(fscal,dz02);
1561
1562             /* Update vectorial force */
1563             fix0             = _mm_add_ps(fix0,tx);
1564             fiy0             = _mm_add_ps(fiy0,ty);
1565             fiz0             = _mm_add_ps(fiz0,tz);
1566
1567             fjx2             = _mm_add_ps(fjx2,tx);
1568             fjy2             = _mm_add_ps(fjy2,ty);
1569             fjz2             = _mm_add_ps(fjz2,tz);
1570
1571             }
1572
1573             /**************************
1574              * CALCULATE INTERACTIONS *
1575              **************************/
1576
1577             if (gmx_mm_any_lt(rsq10,rcutoff2))
1578             {
1579
1580             /* REACTION-FIELD ELECTROSTATICS */
1581             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1582
1583             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1584
1585             fscal            = felec;
1586
1587             fscal            = _mm_and_ps(fscal,cutoff_mask);
1588
1589             /* Calculate temporary vectorial force */
1590             tx               = _mm_mul_ps(fscal,dx10);
1591             ty               = _mm_mul_ps(fscal,dy10);
1592             tz               = _mm_mul_ps(fscal,dz10);
1593
1594             /* Update vectorial force */
1595             fix1             = _mm_add_ps(fix1,tx);
1596             fiy1             = _mm_add_ps(fiy1,ty);
1597             fiz1             = _mm_add_ps(fiz1,tz);
1598
1599             fjx0             = _mm_add_ps(fjx0,tx);
1600             fjy0             = _mm_add_ps(fjy0,ty);
1601             fjz0             = _mm_add_ps(fjz0,tz);
1602
1603             }
1604
1605             /**************************
1606              * CALCULATE INTERACTIONS *
1607              **************************/
1608
1609             if (gmx_mm_any_lt(rsq11,rcutoff2))
1610             {
1611
1612             /* REACTION-FIELD ELECTROSTATICS */
1613             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1614
1615             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1616
1617             fscal            = felec;
1618
1619             fscal            = _mm_and_ps(fscal,cutoff_mask);
1620
1621             /* Calculate temporary vectorial force */
1622             tx               = _mm_mul_ps(fscal,dx11);
1623             ty               = _mm_mul_ps(fscal,dy11);
1624             tz               = _mm_mul_ps(fscal,dz11);
1625
1626             /* Update vectorial force */
1627             fix1             = _mm_add_ps(fix1,tx);
1628             fiy1             = _mm_add_ps(fiy1,ty);
1629             fiz1             = _mm_add_ps(fiz1,tz);
1630
1631             fjx1             = _mm_add_ps(fjx1,tx);
1632             fjy1             = _mm_add_ps(fjy1,ty);
1633             fjz1             = _mm_add_ps(fjz1,tz);
1634
1635             }
1636
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             if (gmx_mm_any_lt(rsq12,rcutoff2))
1642             {
1643
1644             /* REACTION-FIELD ELECTROSTATICS */
1645             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1646
1647             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1648
1649             fscal            = felec;
1650
1651             fscal            = _mm_and_ps(fscal,cutoff_mask);
1652
1653             /* Calculate temporary vectorial force */
1654             tx               = _mm_mul_ps(fscal,dx12);
1655             ty               = _mm_mul_ps(fscal,dy12);
1656             tz               = _mm_mul_ps(fscal,dz12);
1657
1658             /* Update vectorial force */
1659             fix1             = _mm_add_ps(fix1,tx);
1660             fiy1             = _mm_add_ps(fiy1,ty);
1661             fiz1             = _mm_add_ps(fiz1,tz);
1662
1663             fjx2             = _mm_add_ps(fjx2,tx);
1664             fjy2             = _mm_add_ps(fjy2,ty);
1665             fjz2             = _mm_add_ps(fjz2,tz);
1666
1667             }
1668
1669             /**************************
1670              * CALCULATE INTERACTIONS *
1671              **************************/
1672
1673             if (gmx_mm_any_lt(rsq20,rcutoff2))
1674             {
1675
1676             /* REACTION-FIELD ELECTROSTATICS */
1677             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1678
1679             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1680
1681             fscal            = felec;
1682
1683             fscal            = _mm_and_ps(fscal,cutoff_mask);
1684
1685             /* Calculate temporary vectorial force */
1686             tx               = _mm_mul_ps(fscal,dx20);
1687             ty               = _mm_mul_ps(fscal,dy20);
1688             tz               = _mm_mul_ps(fscal,dz20);
1689
1690             /* Update vectorial force */
1691             fix2             = _mm_add_ps(fix2,tx);
1692             fiy2             = _mm_add_ps(fiy2,ty);
1693             fiz2             = _mm_add_ps(fiz2,tz);
1694
1695             fjx0             = _mm_add_ps(fjx0,tx);
1696             fjy0             = _mm_add_ps(fjy0,ty);
1697             fjz0             = _mm_add_ps(fjz0,tz);
1698
1699             }
1700
1701             /**************************
1702              * CALCULATE INTERACTIONS *
1703              **************************/
1704
1705             if (gmx_mm_any_lt(rsq21,rcutoff2))
1706             {
1707
1708             /* REACTION-FIELD ELECTROSTATICS */
1709             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1710
1711             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1712
1713             fscal            = felec;
1714
1715             fscal            = _mm_and_ps(fscal,cutoff_mask);
1716
1717             /* Calculate temporary vectorial force */
1718             tx               = _mm_mul_ps(fscal,dx21);
1719             ty               = _mm_mul_ps(fscal,dy21);
1720             tz               = _mm_mul_ps(fscal,dz21);
1721
1722             /* Update vectorial force */
1723             fix2             = _mm_add_ps(fix2,tx);
1724             fiy2             = _mm_add_ps(fiy2,ty);
1725             fiz2             = _mm_add_ps(fiz2,tz);
1726
1727             fjx1             = _mm_add_ps(fjx1,tx);
1728             fjy1             = _mm_add_ps(fjy1,ty);
1729             fjz1             = _mm_add_ps(fjz1,tz);
1730
1731             }
1732
1733             /**************************
1734              * CALCULATE INTERACTIONS *
1735              **************************/
1736
1737             if (gmx_mm_any_lt(rsq22,rcutoff2))
1738             {
1739
1740             /* REACTION-FIELD ELECTROSTATICS */
1741             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1742
1743             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1744
1745             fscal            = felec;
1746
1747             fscal            = _mm_and_ps(fscal,cutoff_mask);
1748
1749             /* Calculate temporary vectorial force */
1750             tx               = _mm_mul_ps(fscal,dx22);
1751             ty               = _mm_mul_ps(fscal,dy22);
1752             tz               = _mm_mul_ps(fscal,dz22);
1753
1754             /* Update vectorial force */
1755             fix2             = _mm_add_ps(fix2,tx);
1756             fiy2             = _mm_add_ps(fiy2,ty);
1757             fiz2             = _mm_add_ps(fiz2,tz);
1758
1759             fjx2             = _mm_add_ps(fjx2,tx);
1760             fjy2             = _mm_add_ps(fjy2,ty);
1761             fjz2             = _mm_add_ps(fjz2,tz);
1762
1763             }
1764
1765             fjptrA             = f+j_coord_offsetA;
1766             fjptrB             = f+j_coord_offsetB;
1767             fjptrC             = f+j_coord_offsetC;
1768             fjptrD             = f+j_coord_offsetD;
1769
1770             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1771                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1772
1773             /* Inner loop uses 301 flops */
1774         }
1775
1776         if(jidx<j_index_end)
1777         {
1778
1779             /* Get j neighbor index, and coordinate index */
1780             jnrlistA         = jjnr[jidx];
1781             jnrlistB         = jjnr[jidx+1];
1782             jnrlistC         = jjnr[jidx+2];
1783             jnrlistD         = jjnr[jidx+3];
1784             /* Sign of each element will be negative for non-real atoms.
1785              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1786              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1787              */
1788             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1789             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1790             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1791             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1792             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1793             j_coord_offsetA  = DIM*jnrA;
1794             j_coord_offsetB  = DIM*jnrB;
1795             j_coord_offsetC  = DIM*jnrC;
1796             j_coord_offsetD  = DIM*jnrD;
1797
1798             /* load j atom coordinates */
1799             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1800                                               x+j_coord_offsetC,x+j_coord_offsetD,
1801                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1802
1803             /* Calculate displacement vector */
1804             dx00             = _mm_sub_ps(ix0,jx0);
1805             dy00             = _mm_sub_ps(iy0,jy0);
1806             dz00             = _mm_sub_ps(iz0,jz0);
1807             dx01             = _mm_sub_ps(ix0,jx1);
1808             dy01             = _mm_sub_ps(iy0,jy1);
1809             dz01             = _mm_sub_ps(iz0,jz1);
1810             dx02             = _mm_sub_ps(ix0,jx2);
1811             dy02             = _mm_sub_ps(iy0,jy2);
1812             dz02             = _mm_sub_ps(iz0,jz2);
1813             dx10             = _mm_sub_ps(ix1,jx0);
1814             dy10             = _mm_sub_ps(iy1,jy0);
1815             dz10             = _mm_sub_ps(iz1,jz0);
1816             dx11             = _mm_sub_ps(ix1,jx1);
1817             dy11             = _mm_sub_ps(iy1,jy1);
1818             dz11             = _mm_sub_ps(iz1,jz1);
1819             dx12             = _mm_sub_ps(ix1,jx2);
1820             dy12             = _mm_sub_ps(iy1,jy2);
1821             dz12             = _mm_sub_ps(iz1,jz2);
1822             dx20             = _mm_sub_ps(ix2,jx0);
1823             dy20             = _mm_sub_ps(iy2,jy0);
1824             dz20             = _mm_sub_ps(iz2,jz0);
1825             dx21             = _mm_sub_ps(ix2,jx1);
1826             dy21             = _mm_sub_ps(iy2,jy1);
1827             dz21             = _mm_sub_ps(iz2,jz1);
1828             dx22             = _mm_sub_ps(ix2,jx2);
1829             dy22             = _mm_sub_ps(iy2,jy2);
1830             dz22             = _mm_sub_ps(iz2,jz2);
1831
1832             /* Calculate squared distance and things based on it */
1833             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1834             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1835             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1836             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1837             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1838             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1839             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1840             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1841             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1842
1843             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1844             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1845             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1846             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1847             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1848             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1849             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1850             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1851             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1852
1853             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1854             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1855             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1856             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1857             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1858             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1859             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1860             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1861             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1862
1863             fjx0             = _mm_setzero_ps();
1864             fjy0             = _mm_setzero_ps();
1865             fjz0             = _mm_setzero_ps();
1866             fjx1             = _mm_setzero_ps();
1867             fjy1             = _mm_setzero_ps();
1868             fjz1             = _mm_setzero_ps();
1869             fjx2             = _mm_setzero_ps();
1870             fjy2             = _mm_setzero_ps();
1871             fjz2             = _mm_setzero_ps();
1872
1873             /**************************
1874              * CALCULATE INTERACTIONS *
1875              **************************/
1876
1877             if (gmx_mm_any_lt(rsq00,rcutoff2))
1878             {
1879
1880             r00              = _mm_mul_ps(rsq00,rinv00);
1881             r00              = _mm_andnot_ps(dummy_mask,r00);
1882
1883             /* REACTION-FIELD ELECTROSTATICS */
1884             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1885
1886             /* LENNARD-JONES DISPERSION/REPULSION */
1887
1888             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1889             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1890             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1891             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1892             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1893
1894             d                = _mm_sub_ps(r00,rswitch);
1895             d                = _mm_max_ps(d,_mm_setzero_ps());
1896             d2               = _mm_mul_ps(d,d);
1897             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1898
1899             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1900
1901             /* Evaluate switch function */
1902             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1903             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1904             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1905
1906             fscal            = _mm_add_ps(felec,fvdw);
1907
1908             fscal            = _mm_and_ps(fscal,cutoff_mask);
1909
1910             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1911
1912             /* Calculate temporary vectorial force */
1913             tx               = _mm_mul_ps(fscal,dx00);
1914             ty               = _mm_mul_ps(fscal,dy00);
1915             tz               = _mm_mul_ps(fscal,dz00);
1916
1917             /* Update vectorial force */
1918             fix0             = _mm_add_ps(fix0,tx);
1919             fiy0             = _mm_add_ps(fiy0,ty);
1920             fiz0             = _mm_add_ps(fiz0,tz);
1921
1922             fjx0             = _mm_add_ps(fjx0,tx);
1923             fjy0             = _mm_add_ps(fjy0,ty);
1924             fjz0             = _mm_add_ps(fjz0,tz);
1925
1926             }
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             if (gmx_mm_any_lt(rsq01,rcutoff2))
1933             {
1934
1935             /* REACTION-FIELD ELECTROSTATICS */
1936             felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1937
1938             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1939
1940             fscal            = felec;
1941
1942             fscal            = _mm_and_ps(fscal,cutoff_mask);
1943
1944             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1945
1946             /* Calculate temporary vectorial force */
1947             tx               = _mm_mul_ps(fscal,dx01);
1948             ty               = _mm_mul_ps(fscal,dy01);
1949             tz               = _mm_mul_ps(fscal,dz01);
1950
1951             /* Update vectorial force */
1952             fix0             = _mm_add_ps(fix0,tx);
1953             fiy0             = _mm_add_ps(fiy0,ty);
1954             fiz0             = _mm_add_ps(fiz0,tz);
1955
1956             fjx1             = _mm_add_ps(fjx1,tx);
1957             fjy1             = _mm_add_ps(fjy1,ty);
1958             fjz1             = _mm_add_ps(fjz1,tz);
1959
1960             }
1961
1962             /**************************
1963              * CALCULATE INTERACTIONS *
1964              **************************/
1965
1966             if (gmx_mm_any_lt(rsq02,rcutoff2))
1967             {
1968
1969             /* REACTION-FIELD ELECTROSTATICS */
1970             felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1971
1972             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1973
1974             fscal            = felec;
1975
1976             fscal            = _mm_and_ps(fscal,cutoff_mask);
1977
1978             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1979
1980             /* Calculate temporary vectorial force */
1981             tx               = _mm_mul_ps(fscal,dx02);
1982             ty               = _mm_mul_ps(fscal,dy02);
1983             tz               = _mm_mul_ps(fscal,dz02);
1984
1985             /* Update vectorial force */
1986             fix0             = _mm_add_ps(fix0,tx);
1987             fiy0             = _mm_add_ps(fiy0,ty);
1988             fiz0             = _mm_add_ps(fiz0,tz);
1989
1990             fjx2             = _mm_add_ps(fjx2,tx);
1991             fjy2             = _mm_add_ps(fjy2,ty);
1992             fjz2             = _mm_add_ps(fjz2,tz);
1993
1994             }
1995
1996             /**************************
1997              * CALCULATE INTERACTIONS *
1998              **************************/
1999
2000             if (gmx_mm_any_lt(rsq10,rcutoff2))
2001             {
2002
2003             /* REACTION-FIELD ELECTROSTATICS */
2004             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
2005
2006             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2007
2008             fscal            = felec;
2009
2010             fscal            = _mm_and_ps(fscal,cutoff_mask);
2011
2012             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2013
2014             /* Calculate temporary vectorial force */
2015             tx               = _mm_mul_ps(fscal,dx10);
2016             ty               = _mm_mul_ps(fscal,dy10);
2017             tz               = _mm_mul_ps(fscal,dz10);
2018
2019             /* Update vectorial force */
2020             fix1             = _mm_add_ps(fix1,tx);
2021             fiy1             = _mm_add_ps(fiy1,ty);
2022             fiz1             = _mm_add_ps(fiz1,tz);
2023
2024             fjx0             = _mm_add_ps(fjx0,tx);
2025             fjy0             = _mm_add_ps(fjy0,ty);
2026             fjz0             = _mm_add_ps(fjz0,tz);
2027
2028             }
2029
2030             /**************************
2031              * CALCULATE INTERACTIONS *
2032              **************************/
2033
2034             if (gmx_mm_any_lt(rsq11,rcutoff2))
2035             {
2036
2037             /* REACTION-FIELD ELECTROSTATICS */
2038             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2039
2040             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2041
2042             fscal            = felec;
2043
2044             fscal            = _mm_and_ps(fscal,cutoff_mask);
2045
2046             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2047
2048             /* Calculate temporary vectorial force */
2049             tx               = _mm_mul_ps(fscal,dx11);
2050             ty               = _mm_mul_ps(fscal,dy11);
2051             tz               = _mm_mul_ps(fscal,dz11);
2052
2053             /* Update vectorial force */
2054             fix1             = _mm_add_ps(fix1,tx);
2055             fiy1             = _mm_add_ps(fiy1,ty);
2056             fiz1             = _mm_add_ps(fiz1,tz);
2057
2058             fjx1             = _mm_add_ps(fjx1,tx);
2059             fjy1             = _mm_add_ps(fjy1,ty);
2060             fjz1             = _mm_add_ps(fjz1,tz);
2061
2062             }
2063
2064             /**************************
2065              * CALCULATE INTERACTIONS *
2066              **************************/
2067
2068             if (gmx_mm_any_lt(rsq12,rcutoff2))
2069             {
2070
2071             /* REACTION-FIELD ELECTROSTATICS */
2072             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2073
2074             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2075
2076             fscal            = felec;
2077
2078             fscal            = _mm_and_ps(fscal,cutoff_mask);
2079
2080             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2081
2082             /* Calculate temporary vectorial force */
2083             tx               = _mm_mul_ps(fscal,dx12);
2084             ty               = _mm_mul_ps(fscal,dy12);
2085             tz               = _mm_mul_ps(fscal,dz12);
2086
2087             /* Update vectorial force */
2088             fix1             = _mm_add_ps(fix1,tx);
2089             fiy1             = _mm_add_ps(fiy1,ty);
2090             fiz1             = _mm_add_ps(fiz1,tz);
2091
2092             fjx2             = _mm_add_ps(fjx2,tx);
2093             fjy2             = _mm_add_ps(fjy2,ty);
2094             fjz2             = _mm_add_ps(fjz2,tz);
2095
2096             }
2097
2098             /**************************
2099              * CALCULATE INTERACTIONS *
2100              **************************/
2101
2102             if (gmx_mm_any_lt(rsq20,rcutoff2))
2103             {
2104
2105             /* REACTION-FIELD ELECTROSTATICS */
2106             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2107
2108             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2109
2110             fscal            = felec;
2111
2112             fscal            = _mm_and_ps(fscal,cutoff_mask);
2113
2114             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2115
2116             /* Calculate temporary vectorial force */
2117             tx               = _mm_mul_ps(fscal,dx20);
2118             ty               = _mm_mul_ps(fscal,dy20);
2119             tz               = _mm_mul_ps(fscal,dz20);
2120
2121             /* Update vectorial force */
2122             fix2             = _mm_add_ps(fix2,tx);
2123             fiy2             = _mm_add_ps(fiy2,ty);
2124             fiz2             = _mm_add_ps(fiz2,tz);
2125
2126             fjx0             = _mm_add_ps(fjx0,tx);
2127             fjy0             = _mm_add_ps(fjy0,ty);
2128             fjz0             = _mm_add_ps(fjz0,tz);
2129
2130             }
2131
2132             /**************************
2133              * CALCULATE INTERACTIONS *
2134              **************************/
2135
2136             if (gmx_mm_any_lt(rsq21,rcutoff2))
2137             {
2138
2139             /* REACTION-FIELD ELECTROSTATICS */
2140             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2141
2142             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2143
2144             fscal            = felec;
2145
2146             fscal            = _mm_and_ps(fscal,cutoff_mask);
2147
2148             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2149
2150             /* Calculate temporary vectorial force */
2151             tx               = _mm_mul_ps(fscal,dx21);
2152             ty               = _mm_mul_ps(fscal,dy21);
2153             tz               = _mm_mul_ps(fscal,dz21);
2154
2155             /* Update vectorial force */
2156             fix2             = _mm_add_ps(fix2,tx);
2157             fiy2             = _mm_add_ps(fiy2,ty);
2158             fiz2             = _mm_add_ps(fiz2,tz);
2159
2160             fjx1             = _mm_add_ps(fjx1,tx);
2161             fjy1             = _mm_add_ps(fjy1,ty);
2162             fjz1             = _mm_add_ps(fjz1,tz);
2163
2164             }
2165
2166             /**************************
2167              * CALCULATE INTERACTIONS *
2168              **************************/
2169
2170             if (gmx_mm_any_lt(rsq22,rcutoff2))
2171             {
2172
2173             /* REACTION-FIELD ELECTROSTATICS */
2174             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2175
2176             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2177
2178             fscal            = felec;
2179
2180             fscal            = _mm_and_ps(fscal,cutoff_mask);
2181
2182             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2183
2184             /* Calculate temporary vectorial force */
2185             tx               = _mm_mul_ps(fscal,dx22);
2186             ty               = _mm_mul_ps(fscal,dy22);
2187             tz               = _mm_mul_ps(fscal,dz22);
2188
2189             /* Update vectorial force */
2190             fix2             = _mm_add_ps(fix2,tx);
2191             fiy2             = _mm_add_ps(fiy2,ty);
2192             fiz2             = _mm_add_ps(fiz2,tz);
2193
2194             fjx2             = _mm_add_ps(fjx2,tx);
2195             fjy2             = _mm_add_ps(fjy2,ty);
2196             fjz2             = _mm_add_ps(fjz2,tz);
2197
2198             }
2199
2200             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2201             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2202             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2203             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2204
2205             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2206                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2207
2208             /* Inner loop uses 302 flops */
2209         }
2210
2211         /* End of innermost loop */
2212
2213         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2214                                               f+i_coord_offset,fshift+i_shift_offset);
2215
2216         /* Increment number of inner iterations */
2217         inneriter                  += j_index_end - j_index_start;
2218
2219         /* Outer loop uses 18 flops */
2220     }
2221
2222     /* Increment number of outer iterations */
2223     outeriter        += nri;
2224
2225     /* Update outer/inner flops */
2226
2227     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);
2228 }