Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            None
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     __m128i          ewitab;
106     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107     real             *ewtab;
108     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
109     real             rswitch_scalar,d_scalar;
110     __m128           dummy_mask,cutoff_mask;
111     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112     __m128           one     = _mm_set1_ps(1.0);
113     __m128           two     = _mm_set1_ps(2.0);
114     x                = xx[0];
115     f                = ff[0];
116
117     nri              = nlist->nri;
118     iinr             = nlist->iinr;
119     jindex           = nlist->jindex;
120     jjnr             = nlist->jjnr;
121     shiftidx         = nlist->shift;
122     gid              = nlist->gid;
123     shiftvec         = fr->shift_vec[0];
124     fshift           = fr->fshift[0];
125     facel            = _mm_set1_ps(fr->ic->epsfac);
126     charge           = mdatoms->chargeA;
127
128     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
129     ewtab            = fr->ic->tabq_coul_FDV0;
130     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
131     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138
139     jq0              = _mm_set1_ps(charge[inr+0]);
140     jq1              = _mm_set1_ps(charge[inr+1]);
141     jq2              = _mm_set1_ps(charge[inr+2]);
142     qq00             = _mm_mul_ps(iq0,jq0);
143     qq01             = _mm_mul_ps(iq0,jq1);
144     qq02             = _mm_mul_ps(iq0,jq2);
145     qq10             = _mm_mul_ps(iq1,jq0);
146     qq11             = _mm_mul_ps(iq1,jq1);
147     qq12             = _mm_mul_ps(iq1,jq2);
148     qq20             = _mm_mul_ps(iq2,jq0);
149     qq21             = _mm_mul_ps(iq2,jq1);
150     qq22             = _mm_mul_ps(iq2,jq2);
151
152     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153     rcutoff_scalar   = fr->ic->rcoulomb;
154     rcutoff          = _mm_set1_ps(rcutoff_scalar);
155     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
156
157     rswitch_scalar   = fr->ic->rcoulomb_switch;
158     rswitch          = _mm_set1_ps(rswitch_scalar);
159     /* Setup switch parameters */
160     d_scalar         = rcutoff_scalar-rswitch_scalar;
161     d                = _mm_set1_ps(d_scalar);
162     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
163     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
166     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
167     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
168
169     /* Avoid stupid compiler warnings */
170     jnrA = jnrB = jnrC = jnrD = 0;
171     j_coord_offsetA = 0;
172     j_coord_offsetB = 0;
173     j_coord_offsetC = 0;
174     j_coord_offsetD = 0;
175
176     outeriter        = 0;
177     inneriter        = 0;
178
179     for(iidx=0;iidx<4*DIM;iidx++)
180     {
181         scratch[iidx] = 0.0;
182     }  
183
184     /* Start outer loop over neighborlists */
185     for(iidx=0; iidx<nri; iidx++)
186     {
187         /* Load shift vector for this list */
188         i_shift_offset   = DIM*shiftidx[iidx];
189
190         /* Load limits for loop over neighbors */
191         j_index_start    = jindex[iidx];
192         j_index_end      = jindex[iidx+1];
193
194         /* Get outer coordinate index */
195         inr              = iinr[iidx];
196         i_coord_offset   = DIM*inr;
197
198         /* Load i particle coords and add shift vector */
199         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
200                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
201         
202         fix0             = _mm_setzero_ps();
203         fiy0             = _mm_setzero_ps();
204         fiz0             = _mm_setzero_ps();
205         fix1             = _mm_setzero_ps();
206         fiy1             = _mm_setzero_ps();
207         fiz1             = _mm_setzero_ps();
208         fix2             = _mm_setzero_ps();
209         fiy2             = _mm_setzero_ps();
210         fiz2             = _mm_setzero_ps();
211
212         /* Reset potential sums */
213         velecsum         = _mm_setzero_ps();
214
215         /* Start inner kernel loop */
216         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
217         {
218
219             /* Get j neighbor index, and coordinate index */
220             jnrA             = jjnr[jidx];
221             jnrB             = jjnr[jidx+1];
222             jnrC             = jjnr[jidx+2];
223             jnrD             = jjnr[jidx+3];
224             j_coord_offsetA  = DIM*jnrA;
225             j_coord_offsetB  = DIM*jnrB;
226             j_coord_offsetC  = DIM*jnrC;
227             j_coord_offsetD  = DIM*jnrD;
228
229             /* load j atom coordinates */
230             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
231                                               x+j_coord_offsetC,x+j_coord_offsetD,
232                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233
234             /* Calculate displacement vector */
235             dx00             = _mm_sub_ps(ix0,jx0);
236             dy00             = _mm_sub_ps(iy0,jy0);
237             dz00             = _mm_sub_ps(iz0,jz0);
238             dx01             = _mm_sub_ps(ix0,jx1);
239             dy01             = _mm_sub_ps(iy0,jy1);
240             dz01             = _mm_sub_ps(iz0,jz1);
241             dx02             = _mm_sub_ps(ix0,jx2);
242             dy02             = _mm_sub_ps(iy0,jy2);
243             dz02             = _mm_sub_ps(iz0,jz2);
244             dx10             = _mm_sub_ps(ix1,jx0);
245             dy10             = _mm_sub_ps(iy1,jy0);
246             dz10             = _mm_sub_ps(iz1,jz0);
247             dx11             = _mm_sub_ps(ix1,jx1);
248             dy11             = _mm_sub_ps(iy1,jy1);
249             dz11             = _mm_sub_ps(iz1,jz1);
250             dx12             = _mm_sub_ps(ix1,jx2);
251             dy12             = _mm_sub_ps(iy1,jy2);
252             dz12             = _mm_sub_ps(iz1,jz2);
253             dx20             = _mm_sub_ps(ix2,jx0);
254             dy20             = _mm_sub_ps(iy2,jy0);
255             dz20             = _mm_sub_ps(iz2,jz0);
256             dx21             = _mm_sub_ps(ix2,jx1);
257             dy21             = _mm_sub_ps(iy2,jy1);
258             dz21             = _mm_sub_ps(iz2,jz1);
259             dx22             = _mm_sub_ps(ix2,jx2);
260             dy22             = _mm_sub_ps(iy2,jy2);
261             dz22             = _mm_sub_ps(iz2,jz2);
262
263             /* Calculate squared distance and things based on it */
264             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
265             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
266             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
267             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
268             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
269             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
270             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
271             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
272             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
273
274             rinv00           = sse2_invsqrt_f(rsq00);
275             rinv01           = sse2_invsqrt_f(rsq01);
276             rinv02           = sse2_invsqrt_f(rsq02);
277             rinv10           = sse2_invsqrt_f(rsq10);
278             rinv11           = sse2_invsqrt_f(rsq11);
279             rinv12           = sse2_invsqrt_f(rsq12);
280             rinv20           = sse2_invsqrt_f(rsq20);
281             rinv21           = sse2_invsqrt_f(rsq21);
282             rinv22           = sse2_invsqrt_f(rsq22);
283
284             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
285             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
286             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
287             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
288             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
289             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
290             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
291             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
292             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
293
294             fjx0             = _mm_setzero_ps();
295             fjy0             = _mm_setzero_ps();
296             fjz0             = _mm_setzero_ps();
297             fjx1             = _mm_setzero_ps();
298             fjy1             = _mm_setzero_ps();
299             fjz1             = _mm_setzero_ps();
300             fjx2             = _mm_setzero_ps();
301             fjy2             = _mm_setzero_ps();
302             fjz2             = _mm_setzero_ps();
303
304             /**************************
305              * CALCULATE INTERACTIONS *
306              **************************/
307
308             if (gmx_mm_any_lt(rsq00,rcutoff2))
309             {
310
311             r00              = _mm_mul_ps(rsq00,rinv00);
312
313             /* EWALD ELECTROSTATICS */
314
315             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
316             ewrt             = _mm_mul_ps(r00,ewtabscale);
317             ewitab           = _mm_cvttps_epi32(ewrt);
318             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
319             ewitab           = _mm_slli_epi32(ewitab,2);
320             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
321             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
322             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
323             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
324             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
325             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
326             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
327             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
328             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
329
330             d                = _mm_sub_ps(r00,rswitch);
331             d                = _mm_max_ps(d,_mm_setzero_ps());
332             d2               = _mm_mul_ps(d,d);
333             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
334
335             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
336
337             /* Evaluate switch function */
338             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
339             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
340             velec            = _mm_mul_ps(velec,sw);
341             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
342
343             /* Update potential sum for this i atom from the interaction with this j atom. */
344             velec            = _mm_and_ps(velec,cutoff_mask);
345             velecsum         = _mm_add_ps(velecsum,velec);
346
347             fscal            = felec;
348
349             fscal            = _mm_and_ps(fscal,cutoff_mask);
350
351             /* Calculate temporary vectorial force */
352             tx               = _mm_mul_ps(fscal,dx00);
353             ty               = _mm_mul_ps(fscal,dy00);
354             tz               = _mm_mul_ps(fscal,dz00);
355
356             /* Update vectorial force */
357             fix0             = _mm_add_ps(fix0,tx);
358             fiy0             = _mm_add_ps(fiy0,ty);
359             fiz0             = _mm_add_ps(fiz0,tz);
360
361             fjx0             = _mm_add_ps(fjx0,tx);
362             fjy0             = _mm_add_ps(fjy0,ty);
363             fjz0             = _mm_add_ps(fjz0,tz);
364             
365             }
366
367             /**************************
368              * CALCULATE INTERACTIONS *
369              **************************/
370
371             if (gmx_mm_any_lt(rsq01,rcutoff2))
372             {
373
374             r01              = _mm_mul_ps(rsq01,rinv01);
375
376             /* EWALD ELECTROSTATICS */
377
378             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379             ewrt             = _mm_mul_ps(r01,ewtabscale);
380             ewitab           = _mm_cvttps_epi32(ewrt);
381             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
382             ewitab           = _mm_slli_epi32(ewitab,2);
383             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
384             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
385             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
386             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
387             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
388             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
389             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
390             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
391             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
392
393             d                = _mm_sub_ps(r01,rswitch);
394             d                = _mm_max_ps(d,_mm_setzero_ps());
395             d2               = _mm_mul_ps(d,d);
396             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
397
398             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
399
400             /* Evaluate switch function */
401             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
402             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
403             velec            = _mm_mul_ps(velec,sw);
404             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
405
406             /* Update potential sum for this i atom from the interaction with this j atom. */
407             velec            = _mm_and_ps(velec,cutoff_mask);
408             velecsum         = _mm_add_ps(velecsum,velec);
409
410             fscal            = felec;
411
412             fscal            = _mm_and_ps(fscal,cutoff_mask);
413
414             /* Calculate temporary vectorial force */
415             tx               = _mm_mul_ps(fscal,dx01);
416             ty               = _mm_mul_ps(fscal,dy01);
417             tz               = _mm_mul_ps(fscal,dz01);
418
419             /* Update vectorial force */
420             fix0             = _mm_add_ps(fix0,tx);
421             fiy0             = _mm_add_ps(fiy0,ty);
422             fiz0             = _mm_add_ps(fiz0,tz);
423
424             fjx1             = _mm_add_ps(fjx1,tx);
425             fjy1             = _mm_add_ps(fjy1,ty);
426             fjz1             = _mm_add_ps(fjz1,tz);
427             
428             }
429
430             /**************************
431              * CALCULATE INTERACTIONS *
432              **************************/
433
434             if (gmx_mm_any_lt(rsq02,rcutoff2))
435             {
436
437             r02              = _mm_mul_ps(rsq02,rinv02);
438
439             /* EWALD ELECTROSTATICS */
440
441             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442             ewrt             = _mm_mul_ps(r02,ewtabscale);
443             ewitab           = _mm_cvttps_epi32(ewrt);
444             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
445             ewitab           = _mm_slli_epi32(ewitab,2);
446             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
447             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
448             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
449             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
450             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
451             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
452             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
453             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
454             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
455
456             d                = _mm_sub_ps(r02,rswitch);
457             d                = _mm_max_ps(d,_mm_setzero_ps());
458             d2               = _mm_mul_ps(d,d);
459             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
460
461             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
462
463             /* Evaluate switch function */
464             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
465             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
466             velec            = _mm_mul_ps(velec,sw);
467             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
468
469             /* Update potential sum for this i atom from the interaction with this j atom. */
470             velec            = _mm_and_ps(velec,cutoff_mask);
471             velecsum         = _mm_add_ps(velecsum,velec);
472
473             fscal            = felec;
474
475             fscal            = _mm_and_ps(fscal,cutoff_mask);
476
477             /* Calculate temporary vectorial force */
478             tx               = _mm_mul_ps(fscal,dx02);
479             ty               = _mm_mul_ps(fscal,dy02);
480             tz               = _mm_mul_ps(fscal,dz02);
481
482             /* Update vectorial force */
483             fix0             = _mm_add_ps(fix0,tx);
484             fiy0             = _mm_add_ps(fiy0,ty);
485             fiz0             = _mm_add_ps(fiz0,tz);
486
487             fjx2             = _mm_add_ps(fjx2,tx);
488             fjy2             = _mm_add_ps(fjy2,ty);
489             fjz2             = _mm_add_ps(fjz2,tz);
490             
491             }
492
493             /**************************
494              * CALCULATE INTERACTIONS *
495              **************************/
496
497             if (gmx_mm_any_lt(rsq10,rcutoff2))
498             {
499
500             r10              = _mm_mul_ps(rsq10,rinv10);
501
502             /* EWALD ELECTROSTATICS */
503
504             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
505             ewrt             = _mm_mul_ps(r10,ewtabscale);
506             ewitab           = _mm_cvttps_epi32(ewrt);
507             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
508             ewitab           = _mm_slli_epi32(ewitab,2);
509             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
510             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
511             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
512             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
513             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
514             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
515             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
516             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
517             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
518
519             d                = _mm_sub_ps(r10,rswitch);
520             d                = _mm_max_ps(d,_mm_setzero_ps());
521             d2               = _mm_mul_ps(d,d);
522             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
523
524             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
525
526             /* Evaluate switch function */
527             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
528             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
529             velec            = _mm_mul_ps(velec,sw);
530             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
531
532             /* Update potential sum for this i atom from the interaction with this j atom. */
533             velec            = _mm_and_ps(velec,cutoff_mask);
534             velecsum         = _mm_add_ps(velecsum,velec);
535
536             fscal            = felec;
537
538             fscal            = _mm_and_ps(fscal,cutoff_mask);
539
540             /* Calculate temporary vectorial force */
541             tx               = _mm_mul_ps(fscal,dx10);
542             ty               = _mm_mul_ps(fscal,dy10);
543             tz               = _mm_mul_ps(fscal,dz10);
544
545             /* Update vectorial force */
546             fix1             = _mm_add_ps(fix1,tx);
547             fiy1             = _mm_add_ps(fiy1,ty);
548             fiz1             = _mm_add_ps(fiz1,tz);
549
550             fjx0             = _mm_add_ps(fjx0,tx);
551             fjy0             = _mm_add_ps(fjy0,ty);
552             fjz0             = _mm_add_ps(fjz0,tz);
553             
554             }
555
556             /**************************
557              * CALCULATE INTERACTIONS *
558              **************************/
559
560             if (gmx_mm_any_lt(rsq11,rcutoff2))
561             {
562
563             r11              = _mm_mul_ps(rsq11,rinv11);
564
565             /* EWALD ELECTROSTATICS */
566
567             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
568             ewrt             = _mm_mul_ps(r11,ewtabscale);
569             ewitab           = _mm_cvttps_epi32(ewrt);
570             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
571             ewitab           = _mm_slli_epi32(ewitab,2);
572             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
573             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
574             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
575             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
576             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
577             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
578             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
579             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
580             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
581
582             d                = _mm_sub_ps(r11,rswitch);
583             d                = _mm_max_ps(d,_mm_setzero_ps());
584             d2               = _mm_mul_ps(d,d);
585             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
586
587             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
588
589             /* Evaluate switch function */
590             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
591             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
592             velec            = _mm_mul_ps(velec,sw);
593             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
594
595             /* Update potential sum for this i atom from the interaction with this j atom. */
596             velec            = _mm_and_ps(velec,cutoff_mask);
597             velecsum         = _mm_add_ps(velecsum,velec);
598
599             fscal            = felec;
600
601             fscal            = _mm_and_ps(fscal,cutoff_mask);
602
603             /* Calculate temporary vectorial force */
604             tx               = _mm_mul_ps(fscal,dx11);
605             ty               = _mm_mul_ps(fscal,dy11);
606             tz               = _mm_mul_ps(fscal,dz11);
607
608             /* Update vectorial force */
609             fix1             = _mm_add_ps(fix1,tx);
610             fiy1             = _mm_add_ps(fiy1,ty);
611             fiz1             = _mm_add_ps(fiz1,tz);
612
613             fjx1             = _mm_add_ps(fjx1,tx);
614             fjy1             = _mm_add_ps(fjy1,ty);
615             fjz1             = _mm_add_ps(fjz1,tz);
616             
617             }
618
619             /**************************
620              * CALCULATE INTERACTIONS *
621              **************************/
622
623             if (gmx_mm_any_lt(rsq12,rcutoff2))
624             {
625
626             r12              = _mm_mul_ps(rsq12,rinv12);
627
628             /* EWALD ELECTROSTATICS */
629
630             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
631             ewrt             = _mm_mul_ps(r12,ewtabscale);
632             ewitab           = _mm_cvttps_epi32(ewrt);
633             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
634             ewitab           = _mm_slli_epi32(ewitab,2);
635             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
636             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
637             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
638             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
639             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
640             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
641             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
642             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
643             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
644
645             d                = _mm_sub_ps(r12,rswitch);
646             d                = _mm_max_ps(d,_mm_setzero_ps());
647             d2               = _mm_mul_ps(d,d);
648             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
649
650             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
651
652             /* Evaluate switch function */
653             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
654             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
655             velec            = _mm_mul_ps(velec,sw);
656             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
657
658             /* Update potential sum for this i atom from the interaction with this j atom. */
659             velec            = _mm_and_ps(velec,cutoff_mask);
660             velecsum         = _mm_add_ps(velecsum,velec);
661
662             fscal            = felec;
663
664             fscal            = _mm_and_ps(fscal,cutoff_mask);
665
666             /* Calculate temporary vectorial force */
667             tx               = _mm_mul_ps(fscal,dx12);
668             ty               = _mm_mul_ps(fscal,dy12);
669             tz               = _mm_mul_ps(fscal,dz12);
670
671             /* Update vectorial force */
672             fix1             = _mm_add_ps(fix1,tx);
673             fiy1             = _mm_add_ps(fiy1,ty);
674             fiz1             = _mm_add_ps(fiz1,tz);
675
676             fjx2             = _mm_add_ps(fjx2,tx);
677             fjy2             = _mm_add_ps(fjy2,ty);
678             fjz2             = _mm_add_ps(fjz2,tz);
679             
680             }
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             if (gmx_mm_any_lt(rsq20,rcutoff2))
687             {
688
689             r20              = _mm_mul_ps(rsq20,rinv20);
690
691             /* EWALD ELECTROSTATICS */
692
693             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694             ewrt             = _mm_mul_ps(r20,ewtabscale);
695             ewitab           = _mm_cvttps_epi32(ewrt);
696             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
697             ewitab           = _mm_slli_epi32(ewitab,2);
698             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
699             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
700             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
701             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
702             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
703             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
704             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
705             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
706             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
707
708             d                = _mm_sub_ps(r20,rswitch);
709             d                = _mm_max_ps(d,_mm_setzero_ps());
710             d2               = _mm_mul_ps(d,d);
711             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
712
713             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
714
715             /* Evaluate switch function */
716             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
717             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
718             velec            = _mm_mul_ps(velec,sw);
719             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
720
721             /* Update potential sum for this i atom from the interaction with this j atom. */
722             velec            = _mm_and_ps(velec,cutoff_mask);
723             velecsum         = _mm_add_ps(velecsum,velec);
724
725             fscal            = felec;
726
727             fscal            = _mm_and_ps(fscal,cutoff_mask);
728
729             /* Calculate temporary vectorial force */
730             tx               = _mm_mul_ps(fscal,dx20);
731             ty               = _mm_mul_ps(fscal,dy20);
732             tz               = _mm_mul_ps(fscal,dz20);
733
734             /* Update vectorial force */
735             fix2             = _mm_add_ps(fix2,tx);
736             fiy2             = _mm_add_ps(fiy2,ty);
737             fiz2             = _mm_add_ps(fiz2,tz);
738
739             fjx0             = _mm_add_ps(fjx0,tx);
740             fjy0             = _mm_add_ps(fjy0,ty);
741             fjz0             = _mm_add_ps(fjz0,tz);
742             
743             }
744
745             /**************************
746              * CALCULATE INTERACTIONS *
747              **************************/
748
749             if (gmx_mm_any_lt(rsq21,rcutoff2))
750             {
751
752             r21              = _mm_mul_ps(rsq21,rinv21);
753
754             /* EWALD ELECTROSTATICS */
755
756             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
757             ewrt             = _mm_mul_ps(r21,ewtabscale);
758             ewitab           = _mm_cvttps_epi32(ewrt);
759             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
760             ewitab           = _mm_slli_epi32(ewitab,2);
761             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
762             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
763             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
764             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
765             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
766             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
767             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
768             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
769             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
770
771             d                = _mm_sub_ps(r21,rswitch);
772             d                = _mm_max_ps(d,_mm_setzero_ps());
773             d2               = _mm_mul_ps(d,d);
774             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
775
776             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
777
778             /* Evaluate switch function */
779             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
780             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
781             velec            = _mm_mul_ps(velec,sw);
782             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
783
784             /* Update potential sum for this i atom from the interaction with this j atom. */
785             velec            = _mm_and_ps(velec,cutoff_mask);
786             velecsum         = _mm_add_ps(velecsum,velec);
787
788             fscal            = felec;
789
790             fscal            = _mm_and_ps(fscal,cutoff_mask);
791
792             /* Calculate temporary vectorial force */
793             tx               = _mm_mul_ps(fscal,dx21);
794             ty               = _mm_mul_ps(fscal,dy21);
795             tz               = _mm_mul_ps(fscal,dz21);
796
797             /* Update vectorial force */
798             fix2             = _mm_add_ps(fix2,tx);
799             fiy2             = _mm_add_ps(fiy2,ty);
800             fiz2             = _mm_add_ps(fiz2,tz);
801
802             fjx1             = _mm_add_ps(fjx1,tx);
803             fjy1             = _mm_add_ps(fjy1,ty);
804             fjz1             = _mm_add_ps(fjz1,tz);
805             
806             }
807
808             /**************************
809              * CALCULATE INTERACTIONS *
810              **************************/
811
812             if (gmx_mm_any_lt(rsq22,rcutoff2))
813             {
814
815             r22              = _mm_mul_ps(rsq22,rinv22);
816
817             /* EWALD ELECTROSTATICS */
818
819             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
820             ewrt             = _mm_mul_ps(r22,ewtabscale);
821             ewitab           = _mm_cvttps_epi32(ewrt);
822             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
823             ewitab           = _mm_slli_epi32(ewitab,2);
824             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
825             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
826             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
827             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
828             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
829             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
830             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
831             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
832             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
833
834             d                = _mm_sub_ps(r22,rswitch);
835             d                = _mm_max_ps(d,_mm_setzero_ps());
836             d2               = _mm_mul_ps(d,d);
837             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
838
839             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
840
841             /* Evaluate switch function */
842             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
843             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
844             velec            = _mm_mul_ps(velec,sw);
845             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
846
847             /* Update potential sum for this i atom from the interaction with this j atom. */
848             velec            = _mm_and_ps(velec,cutoff_mask);
849             velecsum         = _mm_add_ps(velecsum,velec);
850
851             fscal            = felec;
852
853             fscal            = _mm_and_ps(fscal,cutoff_mask);
854
855             /* Calculate temporary vectorial force */
856             tx               = _mm_mul_ps(fscal,dx22);
857             ty               = _mm_mul_ps(fscal,dy22);
858             tz               = _mm_mul_ps(fscal,dz22);
859
860             /* Update vectorial force */
861             fix2             = _mm_add_ps(fix2,tx);
862             fiy2             = _mm_add_ps(fiy2,ty);
863             fiz2             = _mm_add_ps(fiz2,tz);
864
865             fjx2             = _mm_add_ps(fjx2,tx);
866             fjy2             = _mm_add_ps(fjy2,ty);
867             fjz2             = _mm_add_ps(fjz2,tz);
868             
869             }
870
871             fjptrA             = f+j_coord_offsetA;
872             fjptrB             = f+j_coord_offsetB;
873             fjptrC             = f+j_coord_offsetC;
874             fjptrD             = f+j_coord_offsetD;
875
876             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
877                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
878
879             /* Inner loop uses 585 flops */
880         }
881
882         if(jidx<j_index_end)
883         {
884
885             /* Get j neighbor index, and coordinate index */
886             jnrlistA         = jjnr[jidx];
887             jnrlistB         = jjnr[jidx+1];
888             jnrlistC         = jjnr[jidx+2];
889             jnrlistD         = jjnr[jidx+3];
890             /* Sign of each element will be negative for non-real atoms.
891              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
892              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
893              */
894             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
895             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
896             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
897             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
898             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
899             j_coord_offsetA  = DIM*jnrA;
900             j_coord_offsetB  = DIM*jnrB;
901             j_coord_offsetC  = DIM*jnrC;
902             j_coord_offsetD  = DIM*jnrD;
903
904             /* load j atom coordinates */
905             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
906                                               x+j_coord_offsetC,x+j_coord_offsetD,
907                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
908
909             /* Calculate displacement vector */
910             dx00             = _mm_sub_ps(ix0,jx0);
911             dy00             = _mm_sub_ps(iy0,jy0);
912             dz00             = _mm_sub_ps(iz0,jz0);
913             dx01             = _mm_sub_ps(ix0,jx1);
914             dy01             = _mm_sub_ps(iy0,jy1);
915             dz01             = _mm_sub_ps(iz0,jz1);
916             dx02             = _mm_sub_ps(ix0,jx2);
917             dy02             = _mm_sub_ps(iy0,jy2);
918             dz02             = _mm_sub_ps(iz0,jz2);
919             dx10             = _mm_sub_ps(ix1,jx0);
920             dy10             = _mm_sub_ps(iy1,jy0);
921             dz10             = _mm_sub_ps(iz1,jz0);
922             dx11             = _mm_sub_ps(ix1,jx1);
923             dy11             = _mm_sub_ps(iy1,jy1);
924             dz11             = _mm_sub_ps(iz1,jz1);
925             dx12             = _mm_sub_ps(ix1,jx2);
926             dy12             = _mm_sub_ps(iy1,jy2);
927             dz12             = _mm_sub_ps(iz1,jz2);
928             dx20             = _mm_sub_ps(ix2,jx0);
929             dy20             = _mm_sub_ps(iy2,jy0);
930             dz20             = _mm_sub_ps(iz2,jz0);
931             dx21             = _mm_sub_ps(ix2,jx1);
932             dy21             = _mm_sub_ps(iy2,jy1);
933             dz21             = _mm_sub_ps(iz2,jz1);
934             dx22             = _mm_sub_ps(ix2,jx2);
935             dy22             = _mm_sub_ps(iy2,jy2);
936             dz22             = _mm_sub_ps(iz2,jz2);
937
938             /* Calculate squared distance and things based on it */
939             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
940             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
941             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
942             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
943             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
944             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
945             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
946             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
947             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
948
949             rinv00           = sse2_invsqrt_f(rsq00);
950             rinv01           = sse2_invsqrt_f(rsq01);
951             rinv02           = sse2_invsqrt_f(rsq02);
952             rinv10           = sse2_invsqrt_f(rsq10);
953             rinv11           = sse2_invsqrt_f(rsq11);
954             rinv12           = sse2_invsqrt_f(rsq12);
955             rinv20           = sse2_invsqrt_f(rsq20);
956             rinv21           = sse2_invsqrt_f(rsq21);
957             rinv22           = sse2_invsqrt_f(rsq22);
958
959             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
960             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
961             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
962             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
963             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
964             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
965             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
966             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
967             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
968
969             fjx0             = _mm_setzero_ps();
970             fjy0             = _mm_setzero_ps();
971             fjz0             = _mm_setzero_ps();
972             fjx1             = _mm_setzero_ps();
973             fjy1             = _mm_setzero_ps();
974             fjz1             = _mm_setzero_ps();
975             fjx2             = _mm_setzero_ps();
976             fjy2             = _mm_setzero_ps();
977             fjz2             = _mm_setzero_ps();
978
979             /**************************
980              * CALCULATE INTERACTIONS *
981              **************************/
982
983             if (gmx_mm_any_lt(rsq00,rcutoff2))
984             {
985
986             r00              = _mm_mul_ps(rsq00,rinv00);
987             r00              = _mm_andnot_ps(dummy_mask,r00);
988
989             /* EWALD ELECTROSTATICS */
990
991             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
992             ewrt             = _mm_mul_ps(r00,ewtabscale);
993             ewitab           = _mm_cvttps_epi32(ewrt);
994             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
995             ewitab           = _mm_slli_epi32(ewitab,2);
996             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
997             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
998             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
999             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1000             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1001             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1002             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1003             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1004             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1005
1006             d                = _mm_sub_ps(r00,rswitch);
1007             d                = _mm_max_ps(d,_mm_setzero_ps());
1008             d2               = _mm_mul_ps(d,d);
1009             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1010
1011             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1012
1013             /* Evaluate switch function */
1014             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1015             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1016             velec            = _mm_mul_ps(velec,sw);
1017             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1018
1019             /* Update potential sum for this i atom from the interaction with this j atom. */
1020             velec            = _mm_and_ps(velec,cutoff_mask);
1021             velec            = _mm_andnot_ps(dummy_mask,velec);
1022             velecsum         = _mm_add_ps(velecsum,velec);
1023
1024             fscal            = felec;
1025
1026             fscal            = _mm_and_ps(fscal,cutoff_mask);
1027
1028             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1029
1030             /* Calculate temporary vectorial force */
1031             tx               = _mm_mul_ps(fscal,dx00);
1032             ty               = _mm_mul_ps(fscal,dy00);
1033             tz               = _mm_mul_ps(fscal,dz00);
1034
1035             /* Update vectorial force */
1036             fix0             = _mm_add_ps(fix0,tx);
1037             fiy0             = _mm_add_ps(fiy0,ty);
1038             fiz0             = _mm_add_ps(fiz0,tz);
1039
1040             fjx0             = _mm_add_ps(fjx0,tx);
1041             fjy0             = _mm_add_ps(fjy0,ty);
1042             fjz0             = _mm_add_ps(fjz0,tz);
1043             
1044             }
1045
1046             /**************************
1047              * CALCULATE INTERACTIONS *
1048              **************************/
1049
1050             if (gmx_mm_any_lt(rsq01,rcutoff2))
1051             {
1052
1053             r01              = _mm_mul_ps(rsq01,rinv01);
1054             r01              = _mm_andnot_ps(dummy_mask,r01);
1055
1056             /* EWALD ELECTROSTATICS */
1057
1058             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1059             ewrt             = _mm_mul_ps(r01,ewtabscale);
1060             ewitab           = _mm_cvttps_epi32(ewrt);
1061             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1062             ewitab           = _mm_slli_epi32(ewitab,2);
1063             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1064             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1065             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1066             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1067             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1068             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1069             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1070             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1071             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1072
1073             d                = _mm_sub_ps(r01,rswitch);
1074             d                = _mm_max_ps(d,_mm_setzero_ps());
1075             d2               = _mm_mul_ps(d,d);
1076             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1077
1078             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1079
1080             /* Evaluate switch function */
1081             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1082             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1083             velec            = _mm_mul_ps(velec,sw);
1084             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1085
1086             /* Update potential sum for this i atom from the interaction with this j atom. */
1087             velec            = _mm_and_ps(velec,cutoff_mask);
1088             velec            = _mm_andnot_ps(dummy_mask,velec);
1089             velecsum         = _mm_add_ps(velecsum,velec);
1090
1091             fscal            = felec;
1092
1093             fscal            = _mm_and_ps(fscal,cutoff_mask);
1094
1095             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1096
1097             /* Calculate temporary vectorial force */
1098             tx               = _mm_mul_ps(fscal,dx01);
1099             ty               = _mm_mul_ps(fscal,dy01);
1100             tz               = _mm_mul_ps(fscal,dz01);
1101
1102             /* Update vectorial force */
1103             fix0             = _mm_add_ps(fix0,tx);
1104             fiy0             = _mm_add_ps(fiy0,ty);
1105             fiz0             = _mm_add_ps(fiz0,tz);
1106
1107             fjx1             = _mm_add_ps(fjx1,tx);
1108             fjy1             = _mm_add_ps(fjy1,ty);
1109             fjz1             = _mm_add_ps(fjz1,tz);
1110             
1111             }
1112
1113             /**************************
1114              * CALCULATE INTERACTIONS *
1115              **************************/
1116
1117             if (gmx_mm_any_lt(rsq02,rcutoff2))
1118             {
1119
1120             r02              = _mm_mul_ps(rsq02,rinv02);
1121             r02              = _mm_andnot_ps(dummy_mask,r02);
1122
1123             /* EWALD ELECTROSTATICS */
1124
1125             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1126             ewrt             = _mm_mul_ps(r02,ewtabscale);
1127             ewitab           = _mm_cvttps_epi32(ewrt);
1128             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1129             ewitab           = _mm_slli_epi32(ewitab,2);
1130             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1131             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1132             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1133             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1134             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1135             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1136             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1137             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1138             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1139
1140             d                = _mm_sub_ps(r02,rswitch);
1141             d                = _mm_max_ps(d,_mm_setzero_ps());
1142             d2               = _mm_mul_ps(d,d);
1143             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1144
1145             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1146
1147             /* Evaluate switch function */
1148             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1149             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1150             velec            = _mm_mul_ps(velec,sw);
1151             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1152
1153             /* Update potential sum for this i atom from the interaction with this j atom. */
1154             velec            = _mm_and_ps(velec,cutoff_mask);
1155             velec            = _mm_andnot_ps(dummy_mask,velec);
1156             velecsum         = _mm_add_ps(velecsum,velec);
1157
1158             fscal            = felec;
1159
1160             fscal            = _mm_and_ps(fscal,cutoff_mask);
1161
1162             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1163
1164             /* Calculate temporary vectorial force */
1165             tx               = _mm_mul_ps(fscal,dx02);
1166             ty               = _mm_mul_ps(fscal,dy02);
1167             tz               = _mm_mul_ps(fscal,dz02);
1168
1169             /* Update vectorial force */
1170             fix0             = _mm_add_ps(fix0,tx);
1171             fiy0             = _mm_add_ps(fiy0,ty);
1172             fiz0             = _mm_add_ps(fiz0,tz);
1173
1174             fjx2             = _mm_add_ps(fjx2,tx);
1175             fjy2             = _mm_add_ps(fjy2,ty);
1176             fjz2             = _mm_add_ps(fjz2,tz);
1177             
1178             }
1179
1180             /**************************
1181              * CALCULATE INTERACTIONS *
1182              **************************/
1183
1184             if (gmx_mm_any_lt(rsq10,rcutoff2))
1185             {
1186
1187             r10              = _mm_mul_ps(rsq10,rinv10);
1188             r10              = _mm_andnot_ps(dummy_mask,r10);
1189
1190             /* EWALD ELECTROSTATICS */
1191
1192             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1193             ewrt             = _mm_mul_ps(r10,ewtabscale);
1194             ewitab           = _mm_cvttps_epi32(ewrt);
1195             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1196             ewitab           = _mm_slli_epi32(ewitab,2);
1197             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1198             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1199             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1200             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1201             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1202             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1203             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1204             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1205             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1206
1207             d                = _mm_sub_ps(r10,rswitch);
1208             d                = _mm_max_ps(d,_mm_setzero_ps());
1209             d2               = _mm_mul_ps(d,d);
1210             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1211
1212             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1213
1214             /* Evaluate switch function */
1215             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1216             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1217             velec            = _mm_mul_ps(velec,sw);
1218             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1219
1220             /* Update potential sum for this i atom from the interaction with this j atom. */
1221             velec            = _mm_and_ps(velec,cutoff_mask);
1222             velec            = _mm_andnot_ps(dummy_mask,velec);
1223             velecsum         = _mm_add_ps(velecsum,velec);
1224
1225             fscal            = felec;
1226
1227             fscal            = _mm_and_ps(fscal,cutoff_mask);
1228
1229             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1230
1231             /* Calculate temporary vectorial force */
1232             tx               = _mm_mul_ps(fscal,dx10);
1233             ty               = _mm_mul_ps(fscal,dy10);
1234             tz               = _mm_mul_ps(fscal,dz10);
1235
1236             /* Update vectorial force */
1237             fix1             = _mm_add_ps(fix1,tx);
1238             fiy1             = _mm_add_ps(fiy1,ty);
1239             fiz1             = _mm_add_ps(fiz1,tz);
1240
1241             fjx0             = _mm_add_ps(fjx0,tx);
1242             fjy0             = _mm_add_ps(fjy0,ty);
1243             fjz0             = _mm_add_ps(fjz0,tz);
1244             
1245             }
1246
1247             /**************************
1248              * CALCULATE INTERACTIONS *
1249              **************************/
1250
1251             if (gmx_mm_any_lt(rsq11,rcutoff2))
1252             {
1253
1254             r11              = _mm_mul_ps(rsq11,rinv11);
1255             r11              = _mm_andnot_ps(dummy_mask,r11);
1256
1257             /* EWALD ELECTROSTATICS */
1258
1259             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1260             ewrt             = _mm_mul_ps(r11,ewtabscale);
1261             ewitab           = _mm_cvttps_epi32(ewrt);
1262             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1263             ewitab           = _mm_slli_epi32(ewitab,2);
1264             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1265             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1266             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1267             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1268             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1269             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1270             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1271             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1272             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1273
1274             d                = _mm_sub_ps(r11,rswitch);
1275             d                = _mm_max_ps(d,_mm_setzero_ps());
1276             d2               = _mm_mul_ps(d,d);
1277             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1278
1279             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1280
1281             /* Evaluate switch function */
1282             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1283             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1284             velec            = _mm_mul_ps(velec,sw);
1285             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1286
1287             /* Update potential sum for this i atom from the interaction with this j atom. */
1288             velec            = _mm_and_ps(velec,cutoff_mask);
1289             velec            = _mm_andnot_ps(dummy_mask,velec);
1290             velecsum         = _mm_add_ps(velecsum,velec);
1291
1292             fscal            = felec;
1293
1294             fscal            = _mm_and_ps(fscal,cutoff_mask);
1295
1296             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1297
1298             /* Calculate temporary vectorial force */
1299             tx               = _mm_mul_ps(fscal,dx11);
1300             ty               = _mm_mul_ps(fscal,dy11);
1301             tz               = _mm_mul_ps(fscal,dz11);
1302
1303             /* Update vectorial force */
1304             fix1             = _mm_add_ps(fix1,tx);
1305             fiy1             = _mm_add_ps(fiy1,ty);
1306             fiz1             = _mm_add_ps(fiz1,tz);
1307
1308             fjx1             = _mm_add_ps(fjx1,tx);
1309             fjy1             = _mm_add_ps(fjy1,ty);
1310             fjz1             = _mm_add_ps(fjz1,tz);
1311             
1312             }
1313
1314             /**************************
1315              * CALCULATE INTERACTIONS *
1316              **************************/
1317
1318             if (gmx_mm_any_lt(rsq12,rcutoff2))
1319             {
1320
1321             r12              = _mm_mul_ps(rsq12,rinv12);
1322             r12              = _mm_andnot_ps(dummy_mask,r12);
1323
1324             /* EWALD ELECTROSTATICS */
1325
1326             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1327             ewrt             = _mm_mul_ps(r12,ewtabscale);
1328             ewitab           = _mm_cvttps_epi32(ewrt);
1329             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1330             ewitab           = _mm_slli_epi32(ewitab,2);
1331             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1332             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1333             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1334             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1335             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1336             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1337             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1338             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1339             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1340
1341             d                = _mm_sub_ps(r12,rswitch);
1342             d                = _mm_max_ps(d,_mm_setzero_ps());
1343             d2               = _mm_mul_ps(d,d);
1344             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1345
1346             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1347
1348             /* Evaluate switch function */
1349             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1350             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1351             velec            = _mm_mul_ps(velec,sw);
1352             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1353
1354             /* Update potential sum for this i atom from the interaction with this j atom. */
1355             velec            = _mm_and_ps(velec,cutoff_mask);
1356             velec            = _mm_andnot_ps(dummy_mask,velec);
1357             velecsum         = _mm_add_ps(velecsum,velec);
1358
1359             fscal            = felec;
1360
1361             fscal            = _mm_and_ps(fscal,cutoff_mask);
1362
1363             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1364
1365             /* Calculate temporary vectorial force */
1366             tx               = _mm_mul_ps(fscal,dx12);
1367             ty               = _mm_mul_ps(fscal,dy12);
1368             tz               = _mm_mul_ps(fscal,dz12);
1369
1370             /* Update vectorial force */
1371             fix1             = _mm_add_ps(fix1,tx);
1372             fiy1             = _mm_add_ps(fiy1,ty);
1373             fiz1             = _mm_add_ps(fiz1,tz);
1374
1375             fjx2             = _mm_add_ps(fjx2,tx);
1376             fjy2             = _mm_add_ps(fjy2,ty);
1377             fjz2             = _mm_add_ps(fjz2,tz);
1378             
1379             }
1380
1381             /**************************
1382              * CALCULATE INTERACTIONS *
1383              **************************/
1384
1385             if (gmx_mm_any_lt(rsq20,rcutoff2))
1386             {
1387
1388             r20              = _mm_mul_ps(rsq20,rinv20);
1389             r20              = _mm_andnot_ps(dummy_mask,r20);
1390
1391             /* EWALD ELECTROSTATICS */
1392
1393             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1394             ewrt             = _mm_mul_ps(r20,ewtabscale);
1395             ewitab           = _mm_cvttps_epi32(ewrt);
1396             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1397             ewitab           = _mm_slli_epi32(ewitab,2);
1398             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1399             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1400             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1401             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1402             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1403             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1404             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1405             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1406             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1407
1408             d                = _mm_sub_ps(r20,rswitch);
1409             d                = _mm_max_ps(d,_mm_setzero_ps());
1410             d2               = _mm_mul_ps(d,d);
1411             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1412
1413             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1414
1415             /* Evaluate switch function */
1416             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1417             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1418             velec            = _mm_mul_ps(velec,sw);
1419             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1420
1421             /* Update potential sum for this i atom from the interaction with this j atom. */
1422             velec            = _mm_and_ps(velec,cutoff_mask);
1423             velec            = _mm_andnot_ps(dummy_mask,velec);
1424             velecsum         = _mm_add_ps(velecsum,velec);
1425
1426             fscal            = felec;
1427
1428             fscal            = _mm_and_ps(fscal,cutoff_mask);
1429
1430             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1431
1432             /* Calculate temporary vectorial force */
1433             tx               = _mm_mul_ps(fscal,dx20);
1434             ty               = _mm_mul_ps(fscal,dy20);
1435             tz               = _mm_mul_ps(fscal,dz20);
1436
1437             /* Update vectorial force */
1438             fix2             = _mm_add_ps(fix2,tx);
1439             fiy2             = _mm_add_ps(fiy2,ty);
1440             fiz2             = _mm_add_ps(fiz2,tz);
1441
1442             fjx0             = _mm_add_ps(fjx0,tx);
1443             fjy0             = _mm_add_ps(fjy0,ty);
1444             fjz0             = _mm_add_ps(fjz0,tz);
1445             
1446             }
1447
1448             /**************************
1449              * CALCULATE INTERACTIONS *
1450              **************************/
1451
1452             if (gmx_mm_any_lt(rsq21,rcutoff2))
1453             {
1454
1455             r21              = _mm_mul_ps(rsq21,rinv21);
1456             r21              = _mm_andnot_ps(dummy_mask,r21);
1457
1458             /* EWALD ELECTROSTATICS */
1459
1460             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1461             ewrt             = _mm_mul_ps(r21,ewtabscale);
1462             ewitab           = _mm_cvttps_epi32(ewrt);
1463             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1464             ewitab           = _mm_slli_epi32(ewitab,2);
1465             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1466             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1467             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1468             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1469             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1470             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1471             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1472             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1473             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1474
1475             d                = _mm_sub_ps(r21,rswitch);
1476             d                = _mm_max_ps(d,_mm_setzero_ps());
1477             d2               = _mm_mul_ps(d,d);
1478             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1479
1480             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1481
1482             /* Evaluate switch function */
1483             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1484             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1485             velec            = _mm_mul_ps(velec,sw);
1486             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1487
1488             /* Update potential sum for this i atom from the interaction with this j atom. */
1489             velec            = _mm_and_ps(velec,cutoff_mask);
1490             velec            = _mm_andnot_ps(dummy_mask,velec);
1491             velecsum         = _mm_add_ps(velecsum,velec);
1492
1493             fscal            = felec;
1494
1495             fscal            = _mm_and_ps(fscal,cutoff_mask);
1496
1497             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1498
1499             /* Calculate temporary vectorial force */
1500             tx               = _mm_mul_ps(fscal,dx21);
1501             ty               = _mm_mul_ps(fscal,dy21);
1502             tz               = _mm_mul_ps(fscal,dz21);
1503
1504             /* Update vectorial force */
1505             fix2             = _mm_add_ps(fix2,tx);
1506             fiy2             = _mm_add_ps(fiy2,ty);
1507             fiz2             = _mm_add_ps(fiz2,tz);
1508
1509             fjx1             = _mm_add_ps(fjx1,tx);
1510             fjy1             = _mm_add_ps(fjy1,ty);
1511             fjz1             = _mm_add_ps(fjz1,tz);
1512             
1513             }
1514
1515             /**************************
1516              * CALCULATE INTERACTIONS *
1517              **************************/
1518
1519             if (gmx_mm_any_lt(rsq22,rcutoff2))
1520             {
1521
1522             r22              = _mm_mul_ps(rsq22,rinv22);
1523             r22              = _mm_andnot_ps(dummy_mask,r22);
1524
1525             /* EWALD ELECTROSTATICS */
1526
1527             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1528             ewrt             = _mm_mul_ps(r22,ewtabscale);
1529             ewitab           = _mm_cvttps_epi32(ewrt);
1530             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1531             ewitab           = _mm_slli_epi32(ewitab,2);
1532             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1533             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1534             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1535             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1536             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1537             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1538             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1539             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1540             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1541
1542             d                = _mm_sub_ps(r22,rswitch);
1543             d                = _mm_max_ps(d,_mm_setzero_ps());
1544             d2               = _mm_mul_ps(d,d);
1545             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1546
1547             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1548
1549             /* Evaluate switch function */
1550             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1551             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1552             velec            = _mm_mul_ps(velec,sw);
1553             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1554
1555             /* Update potential sum for this i atom from the interaction with this j atom. */
1556             velec            = _mm_and_ps(velec,cutoff_mask);
1557             velec            = _mm_andnot_ps(dummy_mask,velec);
1558             velecsum         = _mm_add_ps(velecsum,velec);
1559
1560             fscal            = felec;
1561
1562             fscal            = _mm_and_ps(fscal,cutoff_mask);
1563
1564             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1565
1566             /* Calculate temporary vectorial force */
1567             tx               = _mm_mul_ps(fscal,dx22);
1568             ty               = _mm_mul_ps(fscal,dy22);
1569             tz               = _mm_mul_ps(fscal,dz22);
1570
1571             /* Update vectorial force */
1572             fix2             = _mm_add_ps(fix2,tx);
1573             fiy2             = _mm_add_ps(fiy2,ty);
1574             fiz2             = _mm_add_ps(fiz2,tz);
1575
1576             fjx2             = _mm_add_ps(fjx2,tx);
1577             fjy2             = _mm_add_ps(fjy2,ty);
1578             fjz2             = _mm_add_ps(fjz2,tz);
1579             
1580             }
1581
1582             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1583             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1584             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1585             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1586
1587             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1588                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1589
1590             /* Inner loop uses 594 flops */
1591         }
1592
1593         /* End of innermost loop */
1594
1595         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1596                                               f+i_coord_offset,fshift+i_shift_offset);
1597
1598         ggid                        = gid[iidx];
1599         /* Update potential energies */
1600         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1601
1602         /* Increment number of inner iterations */
1603         inneriter                  += j_index_end - j_index_start;
1604
1605         /* Outer loop uses 19 flops */
1606     }
1607
1608     /* Increment number of outer iterations */
1609     outeriter        += nri;
1610
1611     /* Update outer/inner flops */
1612
1613     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*594);
1614 }
1615 /*
1616  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1617  * Electrostatics interaction: Ewald
1618  * VdW interaction:            None
1619  * Geometry:                   Water3-Water3
1620  * Calculate force/pot:        Force
1621  */
1622 void
1623 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1624                     (t_nblist                    * gmx_restrict       nlist,
1625                      rvec                        * gmx_restrict          xx,
1626                      rvec                        * gmx_restrict          ff,
1627                      struct t_forcerec           * gmx_restrict          fr,
1628                      t_mdatoms                   * gmx_restrict     mdatoms,
1629                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1630                      t_nrnb                      * gmx_restrict        nrnb)
1631 {
1632     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1633      * just 0 for non-waters.
1634      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1635      * jnr indices corresponding to data put in the four positions in the SIMD register.
1636      */
1637     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1638     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1639     int              jnrA,jnrB,jnrC,jnrD;
1640     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1641     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1642     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1643     real             rcutoff_scalar;
1644     real             *shiftvec,*fshift,*x,*f;
1645     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1646     real             scratch[4*DIM];
1647     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1648     int              vdwioffset0;
1649     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1650     int              vdwioffset1;
1651     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1652     int              vdwioffset2;
1653     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1654     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1655     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1656     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1657     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1658     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1659     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1660     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1661     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1662     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1663     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1664     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1665     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1666     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1667     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1668     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1669     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1670     real             *charge;
1671     __m128i          ewitab;
1672     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1673     real             *ewtab;
1674     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1675     real             rswitch_scalar,d_scalar;
1676     __m128           dummy_mask,cutoff_mask;
1677     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1678     __m128           one     = _mm_set1_ps(1.0);
1679     __m128           two     = _mm_set1_ps(2.0);
1680     x                = xx[0];
1681     f                = ff[0];
1682
1683     nri              = nlist->nri;
1684     iinr             = nlist->iinr;
1685     jindex           = nlist->jindex;
1686     jjnr             = nlist->jjnr;
1687     shiftidx         = nlist->shift;
1688     gid              = nlist->gid;
1689     shiftvec         = fr->shift_vec[0];
1690     fshift           = fr->fshift[0];
1691     facel            = _mm_set1_ps(fr->ic->epsfac);
1692     charge           = mdatoms->chargeA;
1693
1694     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1695     ewtab            = fr->ic->tabq_coul_FDV0;
1696     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1697     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1698
1699     /* Setup water-specific parameters */
1700     inr              = nlist->iinr[0];
1701     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1702     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1703     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1704
1705     jq0              = _mm_set1_ps(charge[inr+0]);
1706     jq1              = _mm_set1_ps(charge[inr+1]);
1707     jq2              = _mm_set1_ps(charge[inr+2]);
1708     qq00             = _mm_mul_ps(iq0,jq0);
1709     qq01             = _mm_mul_ps(iq0,jq1);
1710     qq02             = _mm_mul_ps(iq0,jq2);
1711     qq10             = _mm_mul_ps(iq1,jq0);
1712     qq11             = _mm_mul_ps(iq1,jq1);
1713     qq12             = _mm_mul_ps(iq1,jq2);
1714     qq20             = _mm_mul_ps(iq2,jq0);
1715     qq21             = _mm_mul_ps(iq2,jq1);
1716     qq22             = _mm_mul_ps(iq2,jq2);
1717
1718     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1719     rcutoff_scalar   = fr->ic->rcoulomb;
1720     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1721     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1722
1723     rswitch_scalar   = fr->ic->rcoulomb_switch;
1724     rswitch          = _mm_set1_ps(rswitch_scalar);
1725     /* Setup switch parameters */
1726     d_scalar         = rcutoff_scalar-rswitch_scalar;
1727     d                = _mm_set1_ps(d_scalar);
1728     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1729     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1730     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1731     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1732     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1733     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1734
1735     /* Avoid stupid compiler warnings */
1736     jnrA = jnrB = jnrC = jnrD = 0;
1737     j_coord_offsetA = 0;
1738     j_coord_offsetB = 0;
1739     j_coord_offsetC = 0;
1740     j_coord_offsetD = 0;
1741
1742     outeriter        = 0;
1743     inneriter        = 0;
1744
1745     for(iidx=0;iidx<4*DIM;iidx++)
1746     {
1747         scratch[iidx] = 0.0;
1748     }  
1749
1750     /* Start outer loop over neighborlists */
1751     for(iidx=0; iidx<nri; iidx++)
1752     {
1753         /* Load shift vector for this list */
1754         i_shift_offset   = DIM*shiftidx[iidx];
1755
1756         /* Load limits for loop over neighbors */
1757         j_index_start    = jindex[iidx];
1758         j_index_end      = jindex[iidx+1];
1759
1760         /* Get outer coordinate index */
1761         inr              = iinr[iidx];
1762         i_coord_offset   = DIM*inr;
1763
1764         /* Load i particle coords and add shift vector */
1765         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1766                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1767         
1768         fix0             = _mm_setzero_ps();
1769         fiy0             = _mm_setzero_ps();
1770         fiz0             = _mm_setzero_ps();
1771         fix1             = _mm_setzero_ps();
1772         fiy1             = _mm_setzero_ps();
1773         fiz1             = _mm_setzero_ps();
1774         fix2             = _mm_setzero_ps();
1775         fiy2             = _mm_setzero_ps();
1776         fiz2             = _mm_setzero_ps();
1777
1778         /* Start inner kernel loop */
1779         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1780         {
1781
1782             /* Get j neighbor index, and coordinate index */
1783             jnrA             = jjnr[jidx];
1784             jnrB             = jjnr[jidx+1];
1785             jnrC             = jjnr[jidx+2];
1786             jnrD             = jjnr[jidx+3];
1787             j_coord_offsetA  = DIM*jnrA;
1788             j_coord_offsetB  = DIM*jnrB;
1789             j_coord_offsetC  = DIM*jnrC;
1790             j_coord_offsetD  = DIM*jnrD;
1791
1792             /* load j atom coordinates */
1793             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1794                                               x+j_coord_offsetC,x+j_coord_offsetD,
1795                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1796
1797             /* Calculate displacement vector */
1798             dx00             = _mm_sub_ps(ix0,jx0);
1799             dy00             = _mm_sub_ps(iy0,jy0);
1800             dz00             = _mm_sub_ps(iz0,jz0);
1801             dx01             = _mm_sub_ps(ix0,jx1);
1802             dy01             = _mm_sub_ps(iy0,jy1);
1803             dz01             = _mm_sub_ps(iz0,jz1);
1804             dx02             = _mm_sub_ps(ix0,jx2);
1805             dy02             = _mm_sub_ps(iy0,jy2);
1806             dz02             = _mm_sub_ps(iz0,jz2);
1807             dx10             = _mm_sub_ps(ix1,jx0);
1808             dy10             = _mm_sub_ps(iy1,jy0);
1809             dz10             = _mm_sub_ps(iz1,jz0);
1810             dx11             = _mm_sub_ps(ix1,jx1);
1811             dy11             = _mm_sub_ps(iy1,jy1);
1812             dz11             = _mm_sub_ps(iz1,jz1);
1813             dx12             = _mm_sub_ps(ix1,jx2);
1814             dy12             = _mm_sub_ps(iy1,jy2);
1815             dz12             = _mm_sub_ps(iz1,jz2);
1816             dx20             = _mm_sub_ps(ix2,jx0);
1817             dy20             = _mm_sub_ps(iy2,jy0);
1818             dz20             = _mm_sub_ps(iz2,jz0);
1819             dx21             = _mm_sub_ps(ix2,jx1);
1820             dy21             = _mm_sub_ps(iy2,jy1);
1821             dz21             = _mm_sub_ps(iz2,jz1);
1822             dx22             = _mm_sub_ps(ix2,jx2);
1823             dy22             = _mm_sub_ps(iy2,jy2);
1824             dz22             = _mm_sub_ps(iz2,jz2);
1825
1826             /* Calculate squared distance and things based on it */
1827             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1828             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1829             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1830             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1831             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1832             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1833             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1834             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1835             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1836
1837             rinv00           = sse2_invsqrt_f(rsq00);
1838             rinv01           = sse2_invsqrt_f(rsq01);
1839             rinv02           = sse2_invsqrt_f(rsq02);
1840             rinv10           = sse2_invsqrt_f(rsq10);
1841             rinv11           = sse2_invsqrt_f(rsq11);
1842             rinv12           = sse2_invsqrt_f(rsq12);
1843             rinv20           = sse2_invsqrt_f(rsq20);
1844             rinv21           = sse2_invsqrt_f(rsq21);
1845             rinv22           = sse2_invsqrt_f(rsq22);
1846
1847             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1848             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1849             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1850             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1851             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1852             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1853             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1854             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1855             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1856
1857             fjx0             = _mm_setzero_ps();
1858             fjy0             = _mm_setzero_ps();
1859             fjz0             = _mm_setzero_ps();
1860             fjx1             = _mm_setzero_ps();
1861             fjy1             = _mm_setzero_ps();
1862             fjz1             = _mm_setzero_ps();
1863             fjx2             = _mm_setzero_ps();
1864             fjy2             = _mm_setzero_ps();
1865             fjz2             = _mm_setzero_ps();
1866
1867             /**************************
1868              * CALCULATE INTERACTIONS *
1869              **************************/
1870
1871             if (gmx_mm_any_lt(rsq00,rcutoff2))
1872             {
1873
1874             r00              = _mm_mul_ps(rsq00,rinv00);
1875
1876             /* EWALD ELECTROSTATICS */
1877
1878             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1879             ewrt             = _mm_mul_ps(r00,ewtabscale);
1880             ewitab           = _mm_cvttps_epi32(ewrt);
1881             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1882             ewitab           = _mm_slli_epi32(ewitab,2);
1883             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1884             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1885             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1886             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1887             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1888             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1889             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1890             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1891             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1892
1893             d                = _mm_sub_ps(r00,rswitch);
1894             d                = _mm_max_ps(d,_mm_setzero_ps());
1895             d2               = _mm_mul_ps(d,d);
1896             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1897
1898             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1899
1900             /* Evaluate switch function */
1901             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1902             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1903             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1904
1905             fscal            = felec;
1906
1907             fscal            = _mm_and_ps(fscal,cutoff_mask);
1908
1909             /* Calculate temporary vectorial force */
1910             tx               = _mm_mul_ps(fscal,dx00);
1911             ty               = _mm_mul_ps(fscal,dy00);
1912             tz               = _mm_mul_ps(fscal,dz00);
1913
1914             /* Update vectorial force */
1915             fix0             = _mm_add_ps(fix0,tx);
1916             fiy0             = _mm_add_ps(fiy0,ty);
1917             fiz0             = _mm_add_ps(fiz0,tz);
1918
1919             fjx0             = _mm_add_ps(fjx0,tx);
1920             fjy0             = _mm_add_ps(fjy0,ty);
1921             fjz0             = _mm_add_ps(fjz0,tz);
1922             
1923             }
1924
1925             /**************************
1926              * CALCULATE INTERACTIONS *
1927              **************************/
1928
1929             if (gmx_mm_any_lt(rsq01,rcutoff2))
1930             {
1931
1932             r01              = _mm_mul_ps(rsq01,rinv01);
1933
1934             /* EWALD ELECTROSTATICS */
1935
1936             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1937             ewrt             = _mm_mul_ps(r01,ewtabscale);
1938             ewitab           = _mm_cvttps_epi32(ewrt);
1939             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1940             ewitab           = _mm_slli_epi32(ewitab,2);
1941             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1942             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1943             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1944             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1945             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1946             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1947             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1948             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1949             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1950
1951             d                = _mm_sub_ps(r01,rswitch);
1952             d                = _mm_max_ps(d,_mm_setzero_ps());
1953             d2               = _mm_mul_ps(d,d);
1954             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1955
1956             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1957
1958             /* Evaluate switch function */
1959             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1960             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1961             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1962
1963             fscal            = felec;
1964
1965             fscal            = _mm_and_ps(fscal,cutoff_mask);
1966
1967             /* Calculate temporary vectorial force */
1968             tx               = _mm_mul_ps(fscal,dx01);
1969             ty               = _mm_mul_ps(fscal,dy01);
1970             tz               = _mm_mul_ps(fscal,dz01);
1971
1972             /* Update vectorial force */
1973             fix0             = _mm_add_ps(fix0,tx);
1974             fiy0             = _mm_add_ps(fiy0,ty);
1975             fiz0             = _mm_add_ps(fiz0,tz);
1976
1977             fjx1             = _mm_add_ps(fjx1,tx);
1978             fjy1             = _mm_add_ps(fjy1,ty);
1979             fjz1             = _mm_add_ps(fjz1,tz);
1980             
1981             }
1982
1983             /**************************
1984              * CALCULATE INTERACTIONS *
1985              **************************/
1986
1987             if (gmx_mm_any_lt(rsq02,rcutoff2))
1988             {
1989
1990             r02              = _mm_mul_ps(rsq02,rinv02);
1991
1992             /* EWALD ELECTROSTATICS */
1993
1994             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1995             ewrt             = _mm_mul_ps(r02,ewtabscale);
1996             ewitab           = _mm_cvttps_epi32(ewrt);
1997             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1998             ewitab           = _mm_slli_epi32(ewitab,2);
1999             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2000             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2001             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2002             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2003             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2004             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2005             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2006             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2007             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2008
2009             d                = _mm_sub_ps(r02,rswitch);
2010             d                = _mm_max_ps(d,_mm_setzero_ps());
2011             d2               = _mm_mul_ps(d,d);
2012             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2013
2014             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2015
2016             /* Evaluate switch function */
2017             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2018             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2019             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2020
2021             fscal            = felec;
2022
2023             fscal            = _mm_and_ps(fscal,cutoff_mask);
2024
2025             /* Calculate temporary vectorial force */
2026             tx               = _mm_mul_ps(fscal,dx02);
2027             ty               = _mm_mul_ps(fscal,dy02);
2028             tz               = _mm_mul_ps(fscal,dz02);
2029
2030             /* Update vectorial force */
2031             fix0             = _mm_add_ps(fix0,tx);
2032             fiy0             = _mm_add_ps(fiy0,ty);
2033             fiz0             = _mm_add_ps(fiz0,tz);
2034
2035             fjx2             = _mm_add_ps(fjx2,tx);
2036             fjy2             = _mm_add_ps(fjy2,ty);
2037             fjz2             = _mm_add_ps(fjz2,tz);
2038             
2039             }
2040
2041             /**************************
2042              * CALCULATE INTERACTIONS *
2043              **************************/
2044
2045             if (gmx_mm_any_lt(rsq10,rcutoff2))
2046             {
2047
2048             r10              = _mm_mul_ps(rsq10,rinv10);
2049
2050             /* EWALD ELECTROSTATICS */
2051
2052             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2053             ewrt             = _mm_mul_ps(r10,ewtabscale);
2054             ewitab           = _mm_cvttps_epi32(ewrt);
2055             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2056             ewitab           = _mm_slli_epi32(ewitab,2);
2057             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2058             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2059             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2060             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2061             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2062             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2063             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2064             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2065             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2066
2067             d                = _mm_sub_ps(r10,rswitch);
2068             d                = _mm_max_ps(d,_mm_setzero_ps());
2069             d2               = _mm_mul_ps(d,d);
2070             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2071
2072             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2073
2074             /* Evaluate switch function */
2075             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2076             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2077             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2078
2079             fscal            = felec;
2080
2081             fscal            = _mm_and_ps(fscal,cutoff_mask);
2082
2083             /* Calculate temporary vectorial force */
2084             tx               = _mm_mul_ps(fscal,dx10);
2085             ty               = _mm_mul_ps(fscal,dy10);
2086             tz               = _mm_mul_ps(fscal,dz10);
2087
2088             /* Update vectorial force */
2089             fix1             = _mm_add_ps(fix1,tx);
2090             fiy1             = _mm_add_ps(fiy1,ty);
2091             fiz1             = _mm_add_ps(fiz1,tz);
2092
2093             fjx0             = _mm_add_ps(fjx0,tx);
2094             fjy0             = _mm_add_ps(fjy0,ty);
2095             fjz0             = _mm_add_ps(fjz0,tz);
2096             
2097             }
2098
2099             /**************************
2100              * CALCULATE INTERACTIONS *
2101              **************************/
2102
2103             if (gmx_mm_any_lt(rsq11,rcutoff2))
2104             {
2105
2106             r11              = _mm_mul_ps(rsq11,rinv11);
2107
2108             /* EWALD ELECTROSTATICS */
2109
2110             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111             ewrt             = _mm_mul_ps(r11,ewtabscale);
2112             ewitab           = _mm_cvttps_epi32(ewrt);
2113             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2114             ewitab           = _mm_slli_epi32(ewitab,2);
2115             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2116             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2117             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2118             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2119             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2120             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2121             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2122             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2123             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2124
2125             d                = _mm_sub_ps(r11,rswitch);
2126             d                = _mm_max_ps(d,_mm_setzero_ps());
2127             d2               = _mm_mul_ps(d,d);
2128             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2129
2130             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2131
2132             /* Evaluate switch function */
2133             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2134             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2135             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2136
2137             fscal            = felec;
2138
2139             fscal            = _mm_and_ps(fscal,cutoff_mask);
2140
2141             /* Calculate temporary vectorial force */
2142             tx               = _mm_mul_ps(fscal,dx11);
2143             ty               = _mm_mul_ps(fscal,dy11);
2144             tz               = _mm_mul_ps(fscal,dz11);
2145
2146             /* Update vectorial force */
2147             fix1             = _mm_add_ps(fix1,tx);
2148             fiy1             = _mm_add_ps(fiy1,ty);
2149             fiz1             = _mm_add_ps(fiz1,tz);
2150
2151             fjx1             = _mm_add_ps(fjx1,tx);
2152             fjy1             = _mm_add_ps(fjy1,ty);
2153             fjz1             = _mm_add_ps(fjz1,tz);
2154             
2155             }
2156
2157             /**************************
2158              * CALCULATE INTERACTIONS *
2159              **************************/
2160
2161             if (gmx_mm_any_lt(rsq12,rcutoff2))
2162             {
2163
2164             r12              = _mm_mul_ps(rsq12,rinv12);
2165
2166             /* EWALD ELECTROSTATICS */
2167
2168             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2169             ewrt             = _mm_mul_ps(r12,ewtabscale);
2170             ewitab           = _mm_cvttps_epi32(ewrt);
2171             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2172             ewitab           = _mm_slli_epi32(ewitab,2);
2173             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2174             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2175             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2176             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2177             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2178             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2179             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2180             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2181             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2182
2183             d                = _mm_sub_ps(r12,rswitch);
2184             d                = _mm_max_ps(d,_mm_setzero_ps());
2185             d2               = _mm_mul_ps(d,d);
2186             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2187
2188             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2189
2190             /* Evaluate switch function */
2191             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2192             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2193             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2194
2195             fscal            = felec;
2196
2197             fscal            = _mm_and_ps(fscal,cutoff_mask);
2198
2199             /* Calculate temporary vectorial force */
2200             tx               = _mm_mul_ps(fscal,dx12);
2201             ty               = _mm_mul_ps(fscal,dy12);
2202             tz               = _mm_mul_ps(fscal,dz12);
2203
2204             /* Update vectorial force */
2205             fix1             = _mm_add_ps(fix1,tx);
2206             fiy1             = _mm_add_ps(fiy1,ty);
2207             fiz1             = _mm_add_ps(fiz1,tz);
2208
2209             fjx2             = _mm_add_ps(fjx2,tx);
2210             fjy2             = _mm_add_ps(fjy2,ty);
2211             fjz2             = _mm_add_ps(fjz2,tz);
2212             
2213             }
2214
2215             /**************************
2216              * CALCULATE INTERACTIONS *
2217              **************************/
2218
2219             if (gmx_mm_any_lt(rsq20,rcutoff2))
2220             {
2221
2222             r20              = _mm_mul_ps(rsq20,rinv20);
2223
2224             /* EWALD ELECTROSTATICS */
2225
2226             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2227             ewrt             = _mm_mul_ps(r20,ewtabscale);
2228             ewitab           = _mm_cvttps_epi32(ewrt);
2229             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2230             ewitab           = _mm_slli_epi32(ewitab,2);
2231             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2232             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2233             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2234             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2235             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2236             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2237             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2238             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2239             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2240
2241             d                = _mm_sub_ps(r20,rswitch);
2242             d                = _mm_max_ps(d,_mm_setzero_ps());
2243             d2               = _mm_mul_ps(d,d);
2244             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2245
2246             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2247
2248             /* Evaluate switch function */
2249             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2250             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2251             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2252
2253             fscal            = felec;
2254
2255             fscal            = _mm_and_ps(fscal,cutoff_mask);
2256
2257             /* Calculate temporary vectorial force */
2258             tx               = _mm_mul_ps(fscal,dx20);
2259             ty               = _mm_mul_ps(fscal,dy20);
2260             tz               = _mm_mul_ps(fscal,dz20);
2261
2262             /* Update vectorial force */
2263             fix2             = _mm_add_ps(fix2,tx);
2264             fiy2             = _mm_add_ps(fiy2,ty);
2265             fiz2             = _mm_add_ps(fiz2,tz);
2266
2267             fjx0             = _mm_add_ps(fjx0,tx);
2268             fjy0             = _mm_add_ps(fjy0,ty);
2269             fjz0             = _mm_add_ps(fjz0,tz);
2270             
2271             }
2272
2273             /**************************
2274              * CALCULATE INTERACTIONS *
2275              **************************/
2276
2277             if (gmx_mm_any_lt(rsq21,rcutoff2))
2278             {
2279
2280             r21              = _mm_mul_ps(rsq21,rinv21);
2281
2282             /* EWALD ELECTROSTATICS */
2283
2284             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2285             ewrt             = _mm_mul_ps(r21,ewtabscale);
2286             ewitab           = _mm_cvttps_epi32(ewrt);
2287             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2288             ewitab           = _mm_slli_epi32(ewitab,2);
2289             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2290             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2291             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2292             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2293             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2294             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2295             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2296             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2297             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2298
2299             d                = _mm_sub_ps(r21,rswitch);
2300             d                = _mm_max_ps(d,_mm_setzero_ps());
2301             d2               = _mm_mul_ps(d,d);
2302             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2303
2304             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2305
2306             /* Evaluate switch function */
2307             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2308             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2309             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2310
2311             fscal            = felec;
2312
2313             fscal            = _mm_and_ps(fscal,cutoff_mask);
2314
2315             /* Calculate temporary vectorial force */
2316             tx               = _mm_mul_ps(fscal,dx21);
2317             ty               = _mm_mul_ps(fscal,dy21);
2318             tz               = _mm_mul_ps(fscal,dz21);
2319
2320             /* Update vectorial force */
2321             fix2             = _mm_add_ps(fix2,tx);
2322             fiy2             = _mm_add_ps(fiy2,ty);
2323             fiz2             = _mm_add_ps(fiz2,tz);
2324
2325             fjx1             = _mm_add_ps(fjx1,tx);
2326             fjy1             = _mm_add_ps(fjy1,ty);
2327             fjz1             = _mm_add_ps(fjz1,tz);
2328             
2329             }
2330
2331             /**************************
2332              * CALCULATE INTERACTIONS *
2333              **************************/
2334
2335             if (gmx_mm_any_lt(rsq22,rcutoff2))
2336             {
2337
2338             r22              = _mm_mul_ps(rsq22,rinv22);
2339
2340             /* EWALD ELECTROSTATICS */
2341
2342             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2343             ewrt             = _mm_mul_ps(r22,ewtabscale);
2344             ewitab           = _mm_cvttps_epi32(ewrt);
2345             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2346             ewitab           = _mm_slli_epi32(ewitab,2);
2347             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2348             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2349             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2350             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2351             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2352             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2353             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2354             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2355             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2356
2357             d                = _mm_sub_ps(r22,rswitch);
2358             d                = _mm_max_ps(d,_mm_setzero_ps());
2359             d2               = _mm_mul_ps(d,d);
2360             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2361
2362             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2363
2364             /* Evaluate switch function */
2365             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2366             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2367             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2368
2369             fscal            = felec;
2370
2371             fscal            = _mm_and_ps(fscal,cutoff_mask);
2372
2373             /* Calculate temporary vectorial force */
2374             tx               = _mm_mul_ps(fscal,dx22);
2375             ty               = _mm_mul_ps(fscal,dy22);
2376             tz               = _mm_mul_ps(fscal,dz22);
2377
2378             /* Update vectorial force */
2379             fix2             = _mm_add_ps(fix2,tx);
2380             fiy2             = _mm_add_ps(fiy2,ty);
2381             fiz2             = _mm_add_ps(fiz2,tz);
2382
2383             fjx2             = _mm_add_ps(fjx2,tx);
2384             fjy2             = _mm_add_ps(fjy2,ty);
2385             fjz2             = _mm_add_ps(fjz2,tz);
2386             
2387             }
2388
2389             fjptrA             = f+j_coord_offsetA;
2390             fjptrB             = f+j_coord_offsetB;
2391             fjptrC             = f+j_coord_offsetC;
2392             fjptrD             = f+j_coord_offsetD;
2393
2394             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2395                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2396
2397             /* Inner loop uses 558 flops */
2398         }
2399
2400         if(jidx<j_index_end)
2401         {
2402
2403             /* Get j neighbor index, and coordinate index */
2404             jnrlistA         = jjnr[jidx];
2405             jnrlistB         = jjnr[jidx+1];
2406             jnrlistC         = jjnr[jidx+2];
2407             jnrlistD         = jjnr[jidx+3];
2408             /* Sign of each element will be negative for non-real atoms.
2409              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2410              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2411              */
2412             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2413             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2414             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2415             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2416             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2417             j_coord_offsetA  = DIM*jnrA;
2418             j_coord_offsetB  = DIM*jnrB;
2419             j_coord_offsetC  = DIM*jnrC;
2420             j_coord_offsetD  = DIM*jnrD;
2421
2422             /* load j atom coordinates */
2423             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2424                                               x+j_coord_offsetC,x+j_coord_offsetD,
2425                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2426
2427             /* Calculate displacement vector */
2428             dx00             = _mm_sub_ps(ix0,jx0);
2429             dy00             = _mm_sub_ps(iy0,jy0);
2430             dz00             = _mm_sub_ps(iz0,jz0);
2431             dx01             = _mm_sub_ps(ix0,jx1);
2432             dy01             = _mm_sub_ps(iy0,jy1);
2433             dz01             = _mm_sub_ps(iz0,jz1);
2434             dx02             = _mm_sub_ps(ix0,jx2);
2435             dy02             = _mm_sub_ps(iy0,jy2);
2436             dz02             = _mm_sub_ps(iz0,jz2);
2437             dx10             = _mm_sub_ps(ix1,jx0);
2438             dy10             = _mm_sub_ps(iy1,jy0);
2439             dz10             = _mm_sub_ps(iz1,jz0);
2440             dx11             = _mm_sub_ps(ix1,jx1);
2441             dy11             = _mm_sub_ps(iy1,jy1);
2442             dz11             = _mm_sub_ps(iz1,jz1);
2443             dx12             = _mm_sub_ps(ix1,jx2);
2444             dy12             = _mm_sub_ps(iy1,jy2);
2445             dz12             = _mm_sub_ps(iz1,jz2);
2446             dx20             = _mm_sub_ps(ix2,jx0);
2447             dy20             = _mm_sub_ps(iy2,jy0);
2448             dz20             = _mm_sub_ps(iz2,jz0);
2449             dx21             = _mm_sub_ps(ix2,jx1);
2450             dy21             = _mm_sub_ps(iy2,jy1);
2451             dz21             = _mm_sub_ps(iz2,jz1);
2452             dx22             = _mm_sub_ps(ix2,jx2);
2453             dy22             = _mm_sub_ps(iy2,jy2);
2454             dz22             = _mm_sub_ps(iz2,jz2);
2455
2456             /* Calculate squared distance and things based on it */
2457             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2458             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2459             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2460             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2461             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2462             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2463             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2464             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2465             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2466
2467             rinv00           = sse2_invsqrt_f(rsq00);
2468             rinv01           = sse2_invsqrt_f(rsq01);
2469             rinv02           = sse2_invsqrt_f(rsq02);
2470             rinv10           = sse2_invsqrt_f(rsq10);
2471             rinv11           = sse2_invsqrt_f(rsq11);
2472             rinv12           = sse2_invsqrt_f(rsq12);
2473             rinv20           = sse2_invsqrt_f(rsq20);
2474             rinv21           = sse2_invsqrt_f(rsq21);
2475             rinv22           = sse2_invsqrt_f(rsq22);
2476
2477             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2478             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2479             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2480             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2481             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2482             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2483             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2484             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2485             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2486
2487             fjx0             = _mm_setzero_ps();
2488             fjy0             = _mm_setzero_ps();
2489             fjz0             = _mm_setzero_ps();
2490             fjx1             = _mm_setzero_ps();
2491             fjy1             = _mm_setzero_ps();
2492             fjz1             = _mm_setzero_ps();
2493             fjx2             = _mm_setzero_ps();
2494             fjy2             = _mm_setzero_ps();
2495             fjz2             = _mm_setzero_ps();
2496
2497             /**************************
2498              * CALCULATE INTERACTIONS *
2499              **************************/
2500
2501             if (gmx_mm_any_lt(rsq00,rcutoff2))
2502             {
2503
2504             r00              = _mm_mul_ps(rsq00,rinv00);
2505             r00              = _mm_andnot_ps(dummy_mask,r00);
2506
2507             /* EWALD ELECTROSTATICS */
2508
2509             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2510             ewrt             = _mm_mul_ps(r00,ewtabscale);
2511             ewitab           = _mm_cvttps_epi32(ewrt);
2512             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2513             ewitab           = _mm_slli_epi32(ewitab,2);
2514             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2515             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2516             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2517             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2518             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2519             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2520             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2521             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2522             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2523
2524             d                = _mm_sub_ps(r00,rswitch);
2525             d                = _mm_max_ps(d,_mm_setzero_ps());
2526             d2               = _mm_mul_ps(d,d);
2527             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2528
2529             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2530
2531             /* Evaluate switch function */
2532             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2533             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2534             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2535
2536             fscal            = felec;
2537
2538             fscal            = _mm_and_ps(fscal,cutoff_mask);
2539
2540             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2541
2542             /* Calculate temporary vectorial force */
2543             tx               = _mm_mul_ps(fscal,dx00);
2544             ty               = _mm_mul_ps(fscal,dy00);
2545             tz               = _mm_mul_ps(fscal,dz00);
2546
2547             /* Update vectorial force */
2548             fix0             = _mm_add_ps(fix0,tx);
2549             fiy0             = _mm_add_ps(fiy0,ty);
2550             fiz0             = _mm_add_ps(fiz0,tz);
2551
2552             fjx0             = _mm_add_ps(fjx0,tx);
2553             fjy0             = _mm_add_ps(fjy0,ty);
2554             fjz0             = _mm_add_ps(fjz0,tz);
2555             
2556             }
2557
2558             /**************************
2559              * CALCULATE INTERACTIONS *
2560              **************************/
2561
2562             if (gmx_mm_any_lt(rsq01,rcutoff2))
2563             {
2564
2565             r01              = _mm_mul_ps(rsq01,rinv01);
2566             r01              = _mm_andnot_ps(dummy_mask,r01);
2567
2568             /* EWALD ELECTROSTATICS */
2569
2570             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2571             ewrt             = _mm_mul_ps(r01,ewtabscale);
2572             ewitab           = _mm_cvttps_epi32(ewrt);
2573             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2574             ewitab           = _mm_slli_epi32(ewitab,2);
2575             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2576             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2577             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2578             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2579             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2580             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2581             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2582             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2583             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2584
2585             d                = _mm_sub_ps(r01,rswitch);
2586             d                = _mm_max_ps(d,_mm_setzero_ps());
2587             d2               = _mm_mul_ps(d,d);
2588             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2589
2590             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2591
2592             /* Evaluate switch function */
2593             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2594             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2595             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2596
2597             fscal            = felec;
2598
2599             fscal            = _mm_and_ps(fscal,cutoff_mask);
2600
2601             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2602
2603             /* Calculate temporary vectorial force */
2604             tx               = _mm_mul_ps(fscal,dx01);
2605             ty               = _mm_mul_ps(fscal,dy01);
2606             tz               = _mm_mul_ps(fscal,dz01);
2607
2608             /* Update vectorial force */
2609             fix0             = _mm_add_ps(fix0,tx);
2610             fiy0             = _mm_add_ps(fiy0,ty);
2611             fiz0             = _mm_add_ps(fiz0,tz);
2612
2613             fjx1             = _mm_add_ps(fjx1,tx);
2614             fjy1             = _mm_add_ps(fjy1,ty);
2615             fjz1             = _mm_add_ps(fjz1,tz);
2616             
2617             }
2618
2619             /**************************
2620              * CALCULATE INTERACTIONS *
2621              **************************/
2622
2623             if (gmx_mm_any_lt(rsq02,rcutoff2))
2624             {
2625
2626             r02              = _mm_mul_ps(rsq02,rinv02);
2627             r02              = _mm_andnot_ps(dummy_mask,r02);
2628
2629             /* EWALD ELECTROSTATICS */
2630
2631             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2632             ewrt             = _mm_mul_ps(r02,ewtabscale);
2633             ewitab           = _mm_cvttps_epi32(ewrt);
2634             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2635             ewitab           = _mm_slli_epi32(ewitab,2);
2636             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2637             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2638             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2639             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2640             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2641             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2642             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2643             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2644             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2645
2646             d                = _mm_sub_ps(r02,rswitch);
2647             d                = _mm_max_ps(d,_mm_setzero_ps());
2648             d2               = _mm_mul_ps(d,d);
2649             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2650
2651             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2652
2653             /* Evaluate switch function */
2654             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2655             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2656             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2657
2658             fscal            = felec;
2659
2660             fscal            = _mm_and_ps(fscal,cutoff_mask);
2661
2662             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2663
2664             /* Calculate temporary vectorial force */
2665             tx               = _mm_mul_ps(fscal,dx02);
2666             ty               = _mm_mul_ps(fscal,dy02);
2667             tz               = _mm_mul_ps(fscal,dz02);
2668
2669             /* Update vectorial force */
2670             fix0             = _mm_add_ps(fix0,tx);
2671             fiy0             = _mm_add_ps(fiy0,ty);
2672             fiz0             = _mm_add_ps(fiz0,tz);
2673
2674             fjx2             = _mm_add_ps(fjx2,tx);
2675             fjy2             = _mm_add_ps(fjy2,ty);
2676             fjz2             = _mm_add_ps(fjz2,tz);
2677             
2678             }
2679
2680             /**************************
2681              * CALCULATE INTERACTIONS *
2682              **************************/
2683
2684             if (gmx_mm_any_lt(rsq10,rcutoff2))
2685             {
2686
2687             r10              = _mm_mul_ps(rsq10,rinv10);
2688             r10              = _mm_andnot_ps(dummy_mask,r10);
2689
2690             /* EWALD ELECTROSTATICS */
2691
2692             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2693             ewrt             = _mm_mul_ps(r10,ewtabscale);
2694             ewitab           = _mm_cvttps_epi32(ewrt);
2695             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2696             ewitab           = _mm_slli_epi32(ewitab,2);
2697             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2698             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2699             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2700             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2701             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2702             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2703             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2704             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2705             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2706
2707             d                = _mm_sub_ps(r10,rswitch);
2708             d                = _mm_max_ps(d,_mm_setzero_ps());
2709             d2               = _mm_mul_ps(d,d);
2710             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2711
2712             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2713
2714             /* Evaluate switch function */
2715             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2716             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2717             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2718
2719             fscal            = felec;
2720
2721             fscal            = _mm_and_ps(fscal,cutoff_mask);
2722
2723             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2724
2725             /* Calculate temporary vectorial force */
2726             tx               = _mm_mul_ps(fscal,dx10);
2727             ty               = _mm_mul_ps(fscal,dy10);
2728             tz               = _mm_mul_ps(fscal,dz10);
2729
2730             /* Update vectorial force */
2731             fix1             = _mm_add_ps(fix1,tx);
2732             fiy1             = _mm_add_ps(fiy1,ty);
2733             fiz1             = _mm_add_ps(fiz1,tz);
2734
2735             fjx0             = _mm_add_ps(fjx0,tx);
2736             fjy0             = _mm_add_ps(fjy0,ty);
2737             fjz0             = _mm_add_ps(fjz0,tz);
2738             
2739             }
2740
2741             /**************************
2742              * CALCULATE INTERACTIONS *
2743              **************************/
2744
2745             if (gmx_mm_any_lt(rsq11,rcutoff2))
2746             {
2747
2748             r11              = _mm_mul_ps(rsq11,rinv11);
2749             r11              = _mm_andnot_ps(dummy_mask,r11);
2750
2751             /* EWALD ELECTROSTATICS */
2752
2753             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2754             ewrt             = _mm_mul_ps(r11,ewtabscale);
2755             ewitab           = _mm_cvttps_epi32(ewrt);
2756             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2757             ewitab           = _mm_slli_epi32(ewitab,2);
2758             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2759             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2760             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2761             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2762             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2763             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2764             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2765             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2766             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2767
2768             d                = _mm_sub_ps(r11,rswitch);
2769             d                = _mm_max_ps(d,_mm_setzero_ps());
2770             d2               = _mm_mul_ps(d,d);
2771             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2772
2773             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2774
2775             /* Evaluate switch function */
2776             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2777             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2778             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2779
2780             fscal            = felec;
2781
2782             fscal            = _mm_and_ps(fscal,cutoff_mask);
2783
2784             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2785
2786             /* Calculate temporary vectorial force */
2787             tx               = _mm_mul_ps(fscal,dx11);
2788             ty               = _mm_mul_ps(fscal,dy11);
2789             tz               = _mm_mul_ps(fscal,dz11);
2790
2791             /* Update vectorial force */
2792             fix1             = _mm_add_ps(fix1,tx);
2793             fiy1             = _mm_add_ps(fiy1,ty);
2794             fiz1             = _mm_add_ps(fiz1,tz);
2795
2796             fjx1             = _mm_add_ps(fjx1,tx);
2797             fjy1             = _mm_add_ps(fjy1,ty);
2798             fjz1             = _mm_add_ps(fjz1,tz);
2799             
2800             }
2801
2802             /**************************
2803              * CALCULATE INTERACTIONS *
2804              **************************/
2805
2806             if (gmx_mm_any_lt(rsq12,rcutoff2))
2807             {
2808
2809             r12              = _mm_mul_ps(rsq12,rinv12);
2810             r12              = _mm_andnot_ps(dummy_mask,r12);
2811
2812             /* EWALD ELECTROSTATICS */
2813
2814             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2815             ewrt             = _mm_mul_ps(r12,ewtabscale);
2816             ewitab           = _mm_cvttps_epi32(ewrt);
2817             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2818             ewitab           = _mm_slli_epi32(ewitab,2);
2819             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2820             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2821             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2822             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2823             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2824             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2825             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2826             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2827             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2828
2829             d                = _mm_sub_ps(r12,rswitch);
2830             d                = _mm_max_ps(d,_mm_setzero_ps());
2831             d2               = _mm_mul_ps(d,d);
2832             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2833
2834             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2835
2836             /* Evaluate switch function */
2837             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2838             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2839             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2840
2841             fscal            = felec;
2842
2843             fscal            = _mm_and_ps(fscal,cutoff_mask);
2844
2845             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2846
2847             /* Calculate temporary vectorial force */
2848             tx               = _mm_mul_ps(fscal,dx12);
2849             ty               = _mm_mul_ps(fscal,dy12);
2850             tz               = _mm_mul_ps(fscal,dz12);
2851
2852             /* Update vectorial force */
2853             fix1             = _mm_add_ps(fix1,tx);
2854             fiy1             = _mm_add_ps(fiy1,ty);
2855             fiz1             = _mm_add_ps(fiz1,tz);
2856
2857             fjx2             = _mm_add_ps(fjx2,tx);
2858             fjy2             = _mm_add_ps(fjy2,ty);
2859             fjz2             = _mm_add_ps(fjz2,tz);
2860             
2861             }
2862
2863             /**************************
2864              * CALCULATE INTERACTIONS *
2865              **************************/
2866
2867             if (gmx_mm_any_lt(rsq20,rcutoff2))
2868             {
2869
2870             r20              = _mm_mul_ps(rsq20,rinv20);
2871             r20              = _mm_andnot_ps(dummy_mask,r20);
2872
2873             /* EWALD ELECTROSTATICS */
2874
2875             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2876             ewrt             = _mm_mul_ps(r20,ewtabscale);
2877             ewitab           = _mm_cvttps_epi32(ewrt);
2878             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2879             ewitab           = _mm_slli_epi32(ewitab,2);
2880             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2881             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2882             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2883             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2884             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2885             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2886             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2887             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2888             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2889
2890             d                = _mm_sub_ps(r20,rswitch);
2891             d                = _mm_max_ps(d,_mm_setzero_ps());
2892             d2               = _mm_mul_ps(d,d);
2893             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2894
2895             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2896
2897             /* Evaluate switch function */
2898             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2899             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2900             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2901
2902             fscal            = felec;
2903
2904             fscal            = _mm_and_ps(fscal,cutoff_mask);
2905
2906             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2907
2908             /* Calculate temporary vectorial force */
2909             tx               = _mm_mul_ps(fscal,dx20);
2910             ty               = _mm_mul_ps(fscal,dy20);
2911             tz               = _mm_mul_ps(fscal,dz20);
2912
2913             /* Update vectorial force */
2914             fix2             = _mm_add_ps(fix2,tx);
2915             fiy2             = _mm_add_ps(fiy2,ty);
2916             fiz2             = _mm_add_ps(fiz2,tz);
2917
2918             fjx0             = _mm_add_ps(fjx0,tx);
2919             fjy0             = _mm_add_ps(fjy0,ty);
2920             fjz0             = _mm_add_ps(fjz0,tz);
2921             
2922             }
2923
2924             /**************************
2925              * CALCULATE INTERACTIONS *
2926              **************************/
2927
2928             if (gmx_mm_any_lt(rsq21,rcutoff2))
2929             {
2930
2931             r21              = _mm_mul_ps(rsq21,rinv21);
2932             r21              = _mm_andnot_ps(dummy_mask,r21);
2933
2934             /* EWALD ELECTROSTATICS */
2935
2936             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2937             ewrt             = _mm_mul_ps(r21,ewtabscale);
2938             ewitab           = _mm_cvttps_epi32(ewrt);
2939             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2940             ewitab           = _mm_slli_epi32(ewitab,2);
2941             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2942             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2943             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2944             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2945             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2946             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2947             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2948             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2949             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2950
2951             d                = _mm_sub_ps(r21,rswitch);
2952             d                = _mm_max_ps(d,_mm_setzero_ps());
2953             d2               = _mm_mul_ps(d,d);
2954             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2955
2956             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2957
2958             /* Evaluate switch function */
2959             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2960             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2961             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2962
2963             fscal            = felec;
2964
2965             fscal            = _mm_and_ps(fscal,cutoff_mask);
2966
2967             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2968
2969             /* Calculate temporary vectorial force */
2970             tx               = _mm_mul_ps(fscal,dx21);
2971             ty               = _mm_mul_ps(fscal,dy21);
2972             tz               = _mm_mul_ps(fscal,dz21);
2973
2974             /* Update vectorial force */
2975             fix2             = _mm_add_ps(fix2,tx);
2976             fiy2             = _mm_add_ps(fiy2,ty);
2977             fiz2             = _mm_add_ps(fiz2,tz);
2978
2979             fjx1             = _mm_add_ps(fjx1,tx);
2980             fjy1             = _mm_add_ps(fjy1,ty);
2981             fjz1             = _mm_add_ps(fjz1,tz);
2982             
2983             }
2984
2985             /**************************
2986              * CALCULATE INTERACTIONS *
2987              **************************/
2988
2989             if (gmx_mm_any_lt(rsq22,rcutoff2))
2990             {
2991
2992             r22              = _mm_mul_ps(rsq22,rinv22);
2993             r22              = _mm_andnot_ps(dummy_mask,r22);
2994
2995             /* EWALD ELECTROSTATICS */
2996
2997             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2998             ewrt             = _mm_mul_ps(r22,ewtabscale);
2999             ewitab           = _mm_cvttps_epi32(ewrt);
3000             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3001             ewitab           = _mm_slli_epi32(ewitab,2);
3002             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3003             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3004             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3005             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3006             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3007             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3008             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3009             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3010             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3011
3012             d                = _mm_sub_ps(r22,rswitch);
3013             d                = _mm_max_ps(d,_mm_setzero_ps());
3014             d2               = _mm_mul_ps(d,d);
3015             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3016
3017             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3018
3019             /* Evaluate switch function */
3020             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3021             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3022             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3023
3024             fscal            = felec;
3025
3026             fscal            = _mm_and_ps(fscal,cutoff_mask);
3027
3028             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3029
3030             /* Calculate temporary vectorial force */
3031             tx               = _mm_mul_ps(fscal,dx22);
3032             ty               = _mm_mul_ps(fscal,dy22);
3033             tz               = _mm_mul_ps(fscal,dz22);
3034
3035             /* Update vectorial force */
3036             fix2             = _mm_add_ps(fix2,tx);
3037             fiy2             = _mm_add_ps(fiy2,ty);
3038             fiz2             = _mm_add_ps(fiz2,tz);
3039
3040             fjx2             = _mm_add_ps(fjx2,tx);
3041             fjy2             = _mm_add_ps(fjy2,ty);
3042             fjz2             = _mm_add_ps(fjz2,tz);
3043             
3044             }
3045
3046             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3047             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3048             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3049             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3050
3051             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3052                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3053
3054             /* Inner loop uses 567 flops */
3055         }
3056
3057         /* End of innermost loop */
3058
3059         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3060                                               f+i_coord_offset,fshift+i_shift_offset);
3061
3062         /* Increment number of inner iterations */
3063         inneriter                  += j_index_end - j_index_start;
3064
3065         /* Outer loop uses 18 flops */
3066     }
3067
3068     /* Increment number of outer iterations */
3069     outeriter        += nri;
3070
3071     /* Update outer/inner flops */
3072
3073     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*567);
3074 }