Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            None
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
96     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128i          ewitab;
109     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
110     real             *ewtab;
111     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
112     real             rswitch_scalar,d_scalar;
113     __m128           dummy_mask,cutoff_mask;
114     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
115     __m128           one     = _mm_set1_ps(1.0);
116     __m128           two     = _mm_set1_ps(2.0);
117     x                = xx[0];
118     f                = ff[0];
119
120     nri              = nlist->nri;
121     iinr             = nlist->iinr;
122     jindex           = nlist->jindex;
123     jjnr             = nlist->jjnr;
124     shiftidx         = nlist->shift;
125     gid              = nlist->gid;
126     shiftvec         = fr->shift_vec[0];
127     fshift           = fr->fshift[0];
128     facel            = _mm_set1_ps(fr->epsfac);
129     charge           = mdatoms->chargeA;
130
131     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
132     ewtab            = fr->ic->tabq_coul_FDV0;
133     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
134     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
141
142     jq1              = _mm_set1_ps(charge[inr+1]);
143     jq2              = _mm_set1_ps(charge[inr+2]);
144     jq3              = _mm_set1_ps(charge[inr+3]);
145     qq11             = _mm_mul_ps(iq1,jq1);
146     qq12             = _mm_mul_ps(iq1,jq2);
147     qq13             = _mm_mul_ps(iq1,jq3);
148     qq21             = _mm_mul_ps(iq2,jq1);
149     qq22             = _mm_mul_ps(iq2,jq2);
150     qq23             = _mm_mul_ps(iq2,jq3);
151     qq31             = _mm_mul_ps(iq3,jq1);
152     qq32             = _mm_mul_ps(iq3,jq2);
153     qq33             = _mm_mul_ps(iq3,jq3);
154
155     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
156     rcutoff_scalar   = fr->rcoulomb;
157     rcutoff          = _mm_set1_ps(rcutoff_scalar);
158     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
159
160     rswitch_scalar   = fr->rcoulomb_switch;
161     rswitch          = _mm_set1_ps(rswitch_scalar);
162     /* Setup switch parameters */
163     d_scalar         = rcutoff_scalar-rswitch_scalar;
164     d                = _mm_set1_ps(d_scalar);
165     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
166     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
167     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
168     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
169     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
170     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
171
172     /* Avoid stupid compiler warnings */
173     jnrA = jnrB = jnrC = jnrD = 0;
174     j_coord_offsetA = 0;
175     j_coord_offsetB = 0;
176     j_coord_offsetC = 0;
177     j_coord_offsetD = 0;
178
179     outeriter        = 0;
180     inneriter        = 0;
181
182     for(iidx=0;iidx<4*DIM;iidx++)
183     {
184         scratch[iidx] = 0.0;
185     }
186
187     /* Start outer loop over neighborlists */
188     for(iidx=0; iidx<nri; iidx++)
189     {
190         /* Load shift vector for this list */
191         i_shift_offset   = DIM*shiftidx[iidx];
192
193         /* Load limits for loop over neighbors */
194         j_index_start    = jindex[iidx];
195         j_index_end      = jindex[iidx+1];
196
197         /* Get outer coordinate index */
198         inr              = iinr[iidx];
199         i_coord_offset   = DIM*inr;
200
201         /* Load i particle coords and add shift vector */
202         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
203                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
204
205         fix1             = _mm_setzero_ps();
206         fiy1             = _mm_setzero_ps();
207         fiz1             = _mm_setzero_ps();
208         fix2             = _mm_setzero_ps();
209         fiy2             = _mm_setzero_ps();
210         fiz2             = _mm_setzero_ps();
211         fix3             = _mm_setzero_ps();
212         fiy3             = _mm_setzero_ps();
213         fiz3             = _mm_setzero_ps();
214
215         /* Reset potential sums */
216         velecsum         = _mm_setzero_ps();
217
218         /* Start inner kernel loop */
219         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
220         {
221
222             /* Get j neighbor index, and coordinate index */
223             jnrA             = jjnr[jidx];
224             jnrB             = jjnr[jidx+1];
225             jnrC             = jjnr[jidx+2];
226             jnrD             = jjnr[jidx+3];
227             j_coord_offsetA  = DIM*jnrA;
228             j_coord_offsetB  = DIM*jnrB;
229             j_coord_offsetC  = DIM*jnrC;
230             j_coord_offsetD  = DIM*jnrD;
231
232             /* load j atom coordinates */
233             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
234                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
235                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
236
237             /* Calculate displacement vector */
238             dx11             = _mm_sub_ps(ix1,jx1);
239             dy11             = _mm_sub_ps(iy1,jy1);
240             dz11             = _mm_sub_ps(iz1,jz1);
241             dx12             = _mm_sub_ps(ix1,jx2);
242             dy12             = _mm_sub_ps(iy1,jy2);
243             dz12             = _mm_sub_ps(iz1,jz2);
244             dx13             = _mm_sub_ps(ix1,jx3);
245             dy13             = _mm_sub_ps(iy1,jy3);
246             dz13             = _mm_sub_ps(iz1,jz3);
247             dx21             = _mm_sub_ps(ix2,jx1);
248             dy21             = _mm_sub_ps(iy2,jy1);
249             dz21             = _mm_sub_ps(iz2,jz1);
250             dx22             = _mm_sub_ps(ix2,jx2);
251             dy22             = _mm_sub_ps(iy2,jy2);
252             dz22             = _mm_sub_ps(iz2,jz2);
253             dx23             = _mm_sub_ps(ix2,jx3);
254             dy23             = _mm_sub_ps(iy2,jy3);
255             dz23             = _mm_sub_ps(iz2,jz3);
256             dx31             = _mm_sub_ps(ix3,jx1);
257             dy31             = _mm_sub_ps(iy3,jy1);
258             dz31             = _mm_sub_ps(iz3,jz1);
259             dx32             = _mm_sub_ps(ix3,jx2);
260             dy32             = _mm_sub_ps(iy3,jy2);
261             dz32             = _mm_sub_ps(iz3,jz2);
262             dx33             = _mm_sub_ps(ix3,jx3);
263             dy33             = _mm_sub_ps(iy3,jy3);
264             dz33             = _mm_sub_ps(iz3,jz3);
265
266             /* Calculate squared distance and things based on it */
267             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
270             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
273             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
274             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
275             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
276
277             rinv11           = gmx_mm_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm_invsqrt_ps(rsq12);
279             rinv13           = gmx_mm_invsqrt_ps(rsq13);
280             rinv21           = gmx_mm_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm_invsqrt_ps(rsq22);
282             rinv23           = gmx_mm_invsqrt_ps(rsq23);
283             rinv31           = gmx_mm_invsqrt_ps(rsq31);
284             rinv32           = gmx_mm_invsqrt_ps(rsq32);
285             rinv33           = gmx_mm_invsqrt_ps(rsq33);
286
287             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
288             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
289             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
290             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
291             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
292             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
293             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
294             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
295             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
296
297             fjx1             = _mm_setzero_ps();
298             fjy1             = _mm_setzero_ps();
299             fjz1             = _mm_setzero_ps();
300             fjx2             = _mm_setzero_ps();
301             fjy2             = _mm_setzero_ps();
302             fjz2             = _mm_setzero_ps();
303             fjx3             = _mm_setzero_ps();
304             fjy3             = _mm_setzero_ps();
305             fjz3             = _mm_setzero_ps();
306
307             /**************************
308              * CALCULATE INTERACTIONS *
309              **************************/
310
311             if (gmx_mm_any_lt(rsq11,rcutoff2))
312             {
313
314             r11              = _mm_mul_ps(rsq11,rinv11);
315
316             /* EWALD ELECTROSTATICS */
317
318             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
319             ewrt             = _mm_mul_ps(r11,ewtabscale);
320             ewitab           = _mm_cvttps_epi32(ewrt);
321             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
322             ewitab           = _mm_slli_epi32(ewitab,2);
323             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
324             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
325             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
326             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
327             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
328             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
329             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
330             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
331             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
332
333             d                = _mm_sub_ps(r11,rswitch);
334             d                = _mm_max_ps(d,_mm_setzero_ps());
335             d2               = _mm_mul_ps(d,d);
336             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
337
338             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
339
340             /* Evaluate switch function */
341             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
342             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
343             velec            = _mm_mul_ps(velec,sw);
344             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
345
346             /* Update potential sum for this i atom from the interaction with this j atom. */
347             velec            = _mm_and_ps(velec,cutoff_mask);
348             velecsum         = _mm_add_ps(velecsum,velec);
349
350             fscal            = felec;
351
352             fscal            = _mm_and_ps(fscal,cutoff_mask);
353
354             /* Calculate temporary vectorial force */
355             tx               = _mm_mul_ps(fscal,dx11);
356             ty               = _mm_mul_ps(fscal,dy11);
357             tz               = _mm_mul_ps(fscal,dz11);
358
359             /* Update vectorial force */
360             fix1             = _mm_add_ps(fix1,tx);
361             fiy1             = _mm_add_ps(fiy1,ty);
362             fiz1             = _mm_add_ps(fiz1,tz);
363
364             fjx1             = _mm_add_ps(fjx1,tx);
365             fjy1             = _mm_add_ps(fjy1,ty);
366             fjz1             = _mm_add_ps(fjz1,tz);
367
368             }
369
370             /**************************
371              * CALCULATE INTERACTIONS *
372              **************************/
373
374             if (gmx_mm_any_lt(rsq12,rcutoff2))
375             {
376
377             r12              = _mm_mul_ps(rsq12,rinv12);
378
379             /* EWALD ELECTROSTATICS */
380
381             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
382             ewrt             = _mm_mul_ps(r12,ewtabscale);
383             ewitab           = _mm_cvttps_epi32(ewrt);
384             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
385             ewitab           = _mm_slli_epi32(ewitab,2);
386             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
387             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
388             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
389             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
390             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
391             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
392             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
393             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
394             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
395
396             d                = _mm_sub_ps(r12,rswitch);
397             d                = _mm_max_ps(d,_mm_setzero_ps());
398             d2               = _mm_mul_ps(d,d);
399             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
400
401             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
402
403             /* Evaluate switch function */
404             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
405             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
406             velec            = _mm_mul_ps(velec,sw);
407             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velec            = _mm_and_ps(velec,cutoff_mask);
411             velecsum         = _mm_add_ps(velecsum,velec);
412
413             fscal            = felec;
414
415             fscal            = _mm_and_ps(fscal,cutoff_mask);
416
417             /* Calculate temporary vectorial force */
418             tx               = _mm_mul_ps(fscal,dx12);
419             ty               = _mm_mul_ps(fscal,dy12);
420             tz               = _mm_mul_ps(fscal,dz12);
421
422             /* Update vectorial force */
423             fix1             = _mm_add_ps(fix1,tx);
424             fiy1             = _mm_add_ps(fiy1,ty);
425             fiz1             = _mm_add_ps(fiz1,tz);
426
427             fjx2             = _mm_add_ps(fjx2,tx);
428             fjy2             = _mm_add_ps(fjy2,ty);
429             fjz2             = _mm_add_ps(fjz2,tz);
430
431             }
432
433             /**************************
434              * CALCULATE INTERACTIONS *
435              **************************/
436
437             if (gmx_mm_any_lt(rsq13,rcutoff2))
438             {
439
440             r13              = _mm_mul_ps(rsq13,rinv13);
441
442             /* EWALD ELECTROSTATICS */
443
444             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
445             ewrt             = _mm_mul_ps(r13,ewtabscale);
446             ewitab           = _mm_cvttps_epi32(ewrt);
447             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
448             ewitab           = _mm_slli_epi32(ewitab,2);
449             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
450             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
451             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
452             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
453             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
454             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
455             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
456             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
457             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
458
459             d                = _mm_sub_ps(r13,rswitch);
460             d                = _mm_max_ps(d,_mm_setzero_ps());
461             d2               = _mm_mul_ps(d,d);
462             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
463
464             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
465
466             /* Evaluate switch function */
467             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
468             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
469             velec            = _mm_mul_ps(velec,sw);
470             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
471
472             /* Update potential sum for this i atom from the interaction with this j atom. */
473             velec            = _mm_and_ps(velec,cutoff_mask);
474             velecsum         = _mm_add_ps(velecsum,velec);
475
476             fscal            = felec;
477
478             fscal            = _mm_and_ps(fscal,cutoff_mask);
479
480             /* Calculate temporary vectorial force */
481             tx               = _mm_mul_ps(fscal,dx13);
482             ty               = _mm_mul_ps(fscal,dy13);
483             tz               = _mm_mul_ps(fscal,dz13);
484
485             /* Update vectorial force */
486             fix1             = _mm_add_ps(fix1,tx);
487             fiy1             = _mm_add_ps(fiy1,ty);
488             fiz1             = _mm_add_ps(fiz1,tz);
489
490             fjx3             = _mm_add_ps(fjx3,tx);
491             fjy3             = _mm_add_ps(fjy3,ty);
492             fjz3             = _mm_add_ps(fjz3,tz);
493
494             }
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             if (gmx_mm_any_lt(rsq21,rcutoff2))
501             {
502
503             r21              = _mm_mul_ps(rsq21,rinv21);
504
505             /* EWALD ELECTROSTATICS */
506
507             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
508             ewrt             = _mm_mul_ps(r21,ewtabscale);
509             ewitab           = _mm_cvttps_epi32(ewrt);
510             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
511             ewitab           = _mm_slli_epi32(ewitab,2);
512             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
513             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
514             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
515             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
516             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
517             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
518             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
519             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
520             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
521
522             d                = _mm_sub_ps(r21,rswitch);
523             d                = _mm_max_ps(d,_mm_setzero_ps());
524             d2               = _mm_mul_ps(d,d);
525             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
526
527             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
528
529             /* Evaluate switch function */
530             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
531             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
532             velec            = _mm_mul_ps(velec,sw);
533             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
534
535             /* Update potential sum for this i atom from the interaction with this j atom. */
536             velec            = _mm_and_ps(velec,cutoff_mask);
537             velecsum         = _mm_add_ps(velecsum,velec);
538
539             fscal            = felec;
540
541             fscal            = _mm_and_ps(fscal,cutoff_mask);
542
543             /* Calculate temporary vectorial force */
544             tx               = _mm_mul_ps(fscal,dx21);
545             ty               = _mm_mul_ps(fscal,dy21);
546             tz               = _mm_mul_ps(fscal,dz21);
547
548             /* Update vectorial force */
549             fix2             = _mm_add_ps(fix2,tx);
550             fiy2             = _mm_add_ps(fiy2,ty);
551             fiz2             = _mm_add_ps(fiz2,tz);
552
553             fjx1             = _mm_add_ps(fjx1,tx);
554             fjy1             = _mm_add_ps(fjy1,ty);
555             fjz1             = _mm_add_ps(fjz1,tz);
556
557             }
558
559             /**************************
560              * CALCULATE INTERACTIONS *
561              **************************/
562
563             if (gmx_mm_any_lt(rsq22,rcutoff2))
564             {
565
566             r22              = _mm_mul_ps(rsq22,rinv22);
567
568             /* EWALD ELECTROSTATICS */
569
570             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
571             ewrt             = _mm_mul_ps(r22,ewtabscale);
572             ewitab           = _mm_cvttps_epi32(ewrt);
573             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
574             ewitab           = _mm_slli_epi32(ewitab,2);
575             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
576             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
577             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
578             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
579             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
580             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
581             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
582             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
583             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
584
585             d                = _mm_sub_ps(r22,rswitch);
586             d                = _mm_max_ps(d,_mm_setzero_ps());
587             d2               = _mm_mul_ps(d,d);
588             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
589
590             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
591
592             /* Evaluate switch function */
593             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
594             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
595             velec            = _mm_mul_ps(velec,sw);
596             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
597
598             /* Update potential sum for this i atom from the interaction with this j atom. */
599             velec            = _mm_and_ps(velec,cutoff_mask);
600             velecsum         = _mm_add_ps(velecsum,velec);
601
602             fscal            = felec;
603
604             fscal            = _mm_and_ps(fscal,cutoff_mask);
605
606             /* Calculate temporary vectorial force */
607             tx               = _mm_mul_ps(fscal,dx22);
608             ty               = _mm_mul_ps(fscal,dy22);
609             tz               = _mm_mul_ps(fscal,dz22);
610
611             /* Update vectorial force */
612             fix2             = _mm_add_ps(fix2,tx);
613             fiy2             = _mm_add_ps(fiy2,ty);
614             fiz2             = _mm_add_ps(fiz2,tz);
615
616             fjx2             = _mm_add_ps(fjx2,tx);
617             fjy2             = _mm_add_ps(fjy2,ty);
618             fjz2             = _mm_add_ps(fjz2,tz);
619
620             }
621
622             /**************************
623              * CALCULATE INTERACTIONS *
624              **************************/
625
626             if (gmx_mm_any_lt(rsq23,rcutoff2))
627             {
628
629             r23              = _mm_mul_ps(rsq23,rinv23);
630
631             /* EWALD ELECTROSTATICS */
632
633             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
634             ewrt             = _mm_mul_ps(r23,ewtabscale);
635             ewitab           = _mm_cvttps_epi32(ewrt);
636             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
637             ewitab           = _mm_slli_epi32(ewitab,2);
638             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
639             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
640             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
641             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
642             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
643             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
644             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
645             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
646             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
647
648             d                = _mm_sub_ps(r23,rswitch);
649             d                = _mm_max_ps(d,_mm_setzero_ps());
650             d2               = _mm_mul_ps(d,d);
651             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
652
653             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
654
655             /* Evaluate switch function */
656             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
657             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
658             velec            = _mm_mul_ps(velec,sw);
659             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
660
661             /* Update potential sum for this i atom from the interaction with this j atom. */
662             velec            = _mm_and_ps(velec,cutoff_mask);
663             velecsum         = _mm_add_ps(velecsum,velec);
664
665             fscal            = felec;
666
667             fscal            = _mm_and_ps(fscal,cutoff_mask);
668
669             /* Calculate temporary vectorial force */
670             tx               = _mm_mul_ps(fscal,dx23);
671             ty               = _mm_mul_ps(fscal,dy23);
672             tz               = _mm_mul_ps(fscal,dz23);
673
674             /* Update vectorial force */
675             fix2             = _mm_add_ps(fix2,tx);
676             fiy2             = _mm_add_ps(fiy2,ty);
677             fiz2             = _mm_add_ps(fiz2,tz);
678
679             fjx3             = _mm_add_ps(fjx3,tx);
680             fjy3             = _mm_add_ps(fjy3,ty);
681             fjz3             = _mm_add_ps(fjz3,tz);
682
683             }
684
685             /**************************
686              * CALCULATE INTERACTIONS *
687              **************************/
688
689             if (gmx_mm_any_lt(rsq31,rcutoff2))
690             {
691
692             r31              = _mm_mul_ps(rsq31,rinv31);
693
694             /* EWALD ELECTROSTATICS */
695
696             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
697             ewrt             = _mm_mul_ps(r31,ewtabscale);
698             ewitab           = _mm_cvttps_epi32(ewrt);
699             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
700             ewitab           = _mm_slli_epi32(ewitab,2);
701             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
702             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
703             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
704             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
705             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
706             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
707             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
708             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
709             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
710
711             d                = _mm_sub_ps(r31,rswitch);
712             d                = _mm_max_ps(d,_mm_setzero_ps());
713             d2               = _mm_mul_ps(d,d);
714             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
715
716             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
717
718             /* Evaluate switch function */
719             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
720             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
721             velec            = _mm_mul_ps(velec,sw);
722             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
723
724             /* Update potential sum for this i atom from the interaction with this j atom. */
725             velec            = _mm_and_ps(velec,cutoff_mask);
726             velecsum         = _mm_add_ps(velecsum,velec);
727
728             fscal            = felec;
729
730             fscal            = _mm_and_ps(fscal,cutoff_mask);
731
732             /* Calculate temporary vectorial force */
733             tx               = _mm_mul_ps(fscal,dx31);
734             ty               = _mm_mul_ps(fscal,dy31);
735             tz               = _mm_mul_ps(fscal,dz31);
736
737             /* Update vectorial force */
738             fix3             = _mm_add_ps(fix3,tx);
739             fiy3             = _mm_add_ps(fiy3,ty);
740             fiz3             = _mm_add_ps(fiz3,tz);
741
742             fjx1             = _mm_add_ps(fjx1,tx);
743             fjy1             = _mm_add_ps(fjy1,ty);
744             fjz1             = _mm_add_ps(fjz1,tz);
745
746             }
747
748             /**************************
749              * CALCULATE INTERACTIONS *
750              **************************/
751
752             if (gmx_mm_any_lt(rsq32,rcutoff2))
753             {
754
755             r32              = _mm_mul_ps(rsq32,rinv32);
756
757             /* EWALD ELECTROSTATICS */
758
759             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
760             ewrt             = _mm_mul_ps(r32,ewtabscale);
761             ewitab           = _mm_cvttps_epi32(ewrt);
762             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
763             ewitab           = _mm_slli_epi32(ewitab,2);
764             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
765             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
766             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
767             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
768             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
769             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
770             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
771             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
772             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
773
774             d                = _mm_sub_ps(r32,rswitch);
775             d                = _mm_max_ps(d,_mm_setzero_ps());
776             d2               = _mm_mul_ps(d,d);
777             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
778
779             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
780
781             /* Evaluate switch function */
782             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
783             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
784             velec            = _mm_mul_ps(velec,sw);
785             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
786
787             /* Update potential sum for this i atom from the interaction with this j atom. */
788             velec            = _mm_and_ps(velec,cutoff_mask);
789             velecsum         = _mm_add_ps(velecsum,velec);
790
791             fscal            = felec;
792
793             fscal            = _mm_and_ps(fscal,cutoff_mask);
794
795             /* Calculate temporary vectorial force */
796             tx               = _mm_mul_ps(fscal,dx32);
797             ty               = _mm_mul_ps(fscal,dy32);
798             tz               = _mm_mul_ps(fscal,dz32);
799
800             /* Update vectorial force */
801             fix3             = _mm_add_ps(fix3,tx);
802             fiy3             = _mm_add_ps(fiy3,ty);
803             fiz3             = _mm_add_ps(fiz3,tz);
804
805             fjx2             = _mm_add_ps(fjx2,tx);
806             fjy2             = _mm_add_ps(fjy2,ty);
807             fjz2             = _mm_add_ps(fjz2,tz);
808
809             }
810
811             /**************************
812              * CALCULATE INTERACTIONS *
813              **************************/
814
815             if (gmx_mm_any_lt(rsq33,rcutoff2))
816             {
817
818             r33              = _mm_mul_ps(rsq33,rinv33);
819
820             /* EWALD ELECTROSTATICS */
821
822             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
823             ewrt             = _mm_mul_ps(r33,ewtabscale);
824             ewitab           = _mm_cvttps_epi32(ewrt);
825             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
826             ewitab           = _mm_slli_epi32(ewitab,2);
827             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
828             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
829             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
830             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
831             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
832             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
833             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
834             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
835             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
836
837             d                = _mm_sub_ps(r33,rswitch);
838             d                = _mm_max_ps(d,_mm_setzero_ps());
839             d2               = _mm_mul_ps(d,d);
840             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
841
842             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
843
844             /* Evaluate switch function */
845             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
846             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
847             velec            = _mm_mul_ps(velec,sw);
848             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
849
850             /* Update potential sum for this i atom from the interaction with this j atom. */
851             velec            = _mm_and_ps(velec,cutoff_mask);
852             velecsum         = _mm_add_ps(velecsum,velec);
853
854             fscal            = felec;
855
856             fscal            = _mm_and_ps(fscal,cutoff_mask);
857
858             /* Calculate temporary vectorial force */
859             tx               = _mm_mul_ps(fscal,dx33);
860             ty               = _mm_mul_ps(fscal,dy33);
861             tz               = _mm_mul_ps(fscal,dz33);
862
863             /* Update vectorial force */
864             fix3             = _mm_add_ps(fix3,tx);
865             fiy3             = _mm_add_ps(fiy3,ty);
866             fiz3             = _mm_add_ps(fiz3,tz);
867
868             fjx3             = _mm_add_ps(fjx3,tx);
869             fjy3             = _mm_add_ps(fjy3,ty);
870             fjz3             = _mm_add_ps(fjz3,tz);
871
872             }
873
874             fjptrA             = f+j_coord_offsetA;
875             fjptrB             = f+j_coord_offsetB;
876             fjptrC             = f+j_coord_offsetC;
877             fjptrD             = f+j_coord_offsetD;
878
879             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
880                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
881
882             /* Inner loop uses 585 flops */
883         }
884
885         if(jidx<j_index_end)
886         {
887
888             /* Get j neighbor index, and coordinate index */
889             jnrlistA         = jjnr[jidx];
890             jnrlistB         = jjnr[jidx+1];
891             jnrlistC         = jjnr[jidx+2];
892             jnrlistD         = jjnr[jidx+3];
893             /* Sign of each element will be negative for non-real atoms.
894              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
895              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
896              */
897             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
898             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
899             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
900             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
901             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
902             j_coord_offsetA  = DIM*jnrA;
903             j_coord_offsetB  = DIM*jnrB;
904             j_coord_offsetC  = DIM*jnrC;
905             j_coord_offsetD  = DIM*jnrD;
906
907             /* load j atom coordinates */
908             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
909                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
910                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
911
912             /* Calculate displacement vector */
913             dx11             = _mm_sub_ps(ix1,jx1);
914             dy11             = _mm_sub_ps(iy1,jy1);
915             dz11             = _mm_sub_ps(iz1,jz1);
916             dx12             = _mm_sub_ps(ix1,jx2);
917             dy12             = _mm_sub_ps(iy1,jy2);
918             dz12             = _mm_sub_ps(iz1,jz2);
919             dx13             = _mm_sub_ps(ix1,jx3);
920             dy13             = _mm_sub_ps(iy1,jy3);
921             dz13             = _mm_sub_ps(iz1,jz3);
922             dx21             = _mm_sub_ps(ix2,jx1);
923             dy21             = _mm_sub_ps(iy2,jy1);
924             dz21             = _mm_sub_ps(iz2,jz1);
925             dx22             = _mm_sub_ps(ix2,jx2);
926             dy22             = _mm_sub_ps(iy2,jy2);
927             dz22             = _mm_sub_ps(iz2,jz2);
928             dx23             = _mm_sub_ps(ix2,jx3);
929             dy23             = _mm_sub_ps(iy2,jy3);
930             dz23             = _mm_sub_ps(iz2,jz3);
931             dx31             = _mm_sub_ps(ix3,jx1);
932             dy31             = _mm_sub_ps(iy3,jy1);
933             dz31             = _mm_sub_ps(iz3,jz1);
934             dx32             = _mm_sub_ps(ix3,jx2);
935             dy32             = _mm_sub_ps(iy3,jy2);
936             dz32             = _mm_sub_ps(iz3,jz2);
937             dx33             = _mm_sub_ps(ix3,jx3);
938             dy33             = _mm_sub_ps(iy3,jy3);
939             dz33             = _mm_sub_ps(iz3,jz3);
940
941             /* Calculate squared distance and things based on it */
942             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
943             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
944             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
945             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
946             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
947             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
948             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
949             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
950             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
951
952             rinv11           = gmx_mm_invsqrt_ps(rsq11);
953             rinv12           = gmx_mm_invsqrt_ps(rsq12);
954             rinv13           = gmx_mm_invsqrt_ps(rsq13);
955             rinv21           = gmx_mm_invsqrt_ps(rsq21);
956             rinv22           = gmx_mm_invsqrt_ps(rsq22);
957             rinv23           = gmx_mm_invsqrt_ps(rsq23);
958             rinv31           = gmx_mm_invsqrt_ps(rsq31);
959             rinv32           = gmx_mm_invsqrt_ps(rsq32);
960             rinv33           = gmx_mm_invsqrt_ps(rsq33);
961
962             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
963             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
964             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
965             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
966             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
967             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
968             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
969             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
970             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
971
972             fjx1             = _mm_setzero_ps();
973             fjy1             = _mm_setzero_ps();
974             fjz1             = _mm_setzero_ps();
975             fjx2             = _mm_setzero_ps();
976             fjy2             = _mm_setzero_ps();
977             fjz2             = _mm_setzero_ps();
978             fjx3             = _mm_setzero_ps();
979             fjy3             = _mm_setzero_ps();
980             fjz3             = _mm_setzero_ps();
981
982             /**************************
983              * CALCULATE INTERACTIONS *
984              **************************/
985
986             if (gmx_mm_any_lt(rsq11,rcutoff2))
987             {
988
989             r11              = _mm_mul_ps(rsq11,rinv11);
990             r11              = _mm_andnot_ps(dummy_mask,r11);
991
992             /* EWALD ELECTROSTATICS */
993
994             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
995             ewrt             = _mm_mul_ps(r11,ewtabscale);
996             ewitab           = _mm_cvttps_epi32(ewrt);
997             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
998             ewitab           = _mm_slli_epi32(ewitab,2);
999             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1000             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1001             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1002             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1003             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1004             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1005             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1006             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1007             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1008
1009             d                = _mm_sub_ps(r11,rswitch);
1010             d                = _mm_max_ps(d,_mm_setzero_ps());
1011             d2               = _mm_mul_ps(d,d);
1012             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1013
1014             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1015
1016             /* Evaluate switch function */
1017             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1018             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1019             velec            = _mm_mul_ps(velec,sw);
1020             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1021
1022             /* Update potential sum for this i atom from the interaction with this j atom. */
1023             velec            = _mm_and_ps(velec,cutoff_mask);
1024             velec            = _mm_andnot_ps(dummy_mask,velec);
1025             velecsum         = _mm_add_ps(velecsum,velec);
1026
1027             fscal            = felec;
1028
1029             fscal            = _mm_and_ps(fscal,cutoff_mask);
1030
1031             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1032
1033             /* Calculate temporary vectorial force */
1034             tx               = _mm_mul_ps(fscal,dx11);
1035             ty               = _mm_mul_ps(fscal,dy11);
1036             tz               = _mm_mul_ps(fscal,dz11);
1037
1038             /* Update vectorial force */
1039             fix1             = _mm_add_ps(fix1,tx);
1040             fiy1             = _mm_add_ps(fiy1,ty);
1041             fiz1             = _mm_add_ps(fiz1,tz);
1042
1043             fjx1             = _mm_add_ps(fjx1,tx);
1044             fjy1             = _mm_add_ps(fjy1,ty);
1045             fjz1             = _mm_add_ps(fjz1,tz);
1046
1047             }
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             if (gmx_mm_any_lt(rsq12,rcutoff2))
1054             {
1055
1056             r12              = _mm_mul_ps(rsq12,rinv12);
1057             r12              = _mm_andnot_ps(dummy_mask,r12);
1058
1059             /* EWALD ELECTROSTATICS */
1060
1061             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1062             ewrt             = _mm_mul_ps(r12,ewtabscale);
1063             ewitab           = _mm_cvttps_epi32(ewrt);
1064             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1065             ewitab           = _mm_slli_epi32(ewitab,2);
1066             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1067             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1068             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1069             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1070             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1071             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1072             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1073             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1074             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1075
1076             d                = _mm_sub_ps(r12,rswitch);
1077             d                = _mm_max_ps(d,_mm_setzero_ps());
1078             d2               = _mm_mul_ps(d,d);
1079             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1080
1081             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1082
1083             /* Evaluate switch function */
1084             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1085             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1086             velec            = _mm_mul_ps(velec,sw);
1087             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1088
1089             /* Update potential sum for this i atom from the interaction with this j atom. */
1090             velec            = _mm_and_ps(velec,cutoff_mask);
1091             velec            = _mm_andnot_ps(dummy_mask,velec);
1092             velecsum         = _mm_add_ps(velecsum,velec);
1093
1094             fscal            = felec;
1095
1096             fscal            = _mm_and_ps(fscal,cutoff_mask);
1097
1098             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1099
1100             /* Calculate temporary vectorial force */
1101             tx               = _mm_mul_ps(fscal,dx12);
1102             ty               = _mm_mul_ps(fscal,dy12);
1103             tz               = _mm_mul_ps(fscal,dz12);
1104
1105             /* Update vectorial force */
1106             fix1             = _mm_add_ps(fix1,tx);
1107             fiy1             = _mm_add_ps(fiy1,ty);
1108             fiz1             = _mm_add_ps(fiz1,tz);
1109
1110             fjx2             = _mm_add_ps(fjx2,tx);
1111             fjy2             = _mm_add_ps(fjy2,ty);
1112             fjz2             = _mm_add_ps(fjz2,tz);
1113
1114             }
1115
1116             /**************************
1117              * CALCULATE INTERACTIONS *
1118              **************************/
1119
1120             if (gmx_mm_any_lt(rsq13,rcutoff2))
1121             {
1122
1123             r13              = _mm_mul_ps(rsq13,rinv13);
1124             r13              = _mm_andnot_ps(dummy_mask,r13);
1125
1126             /* EWALD ELECTROSTATICS */
1127
1128             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1129             ewrt             = _mm_mul_ps(r13,ewtabscale);
1130             ewitab           = _mm_cvttps_epi32(ewrt);
1131             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1132             ewitab           = _mm_slli_epi32(ewitab,2);
1133             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1134             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1135             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1136             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1137             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1138             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1139             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1140             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1141             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1142
1143             d                = _mm_sub_ps(r13,rswitch);
1144             d                = _mm_max_ps(d,_mm_setzero_ps());
1145             d2               = _mm_mul_ps(d,d);
1146             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1147
1148             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1149
1150             /* Evaluate switch function */
1151             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1152             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1153             velec            = _mm_mul_ps(velec,sw);
1154             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1155
1156             /* Update potential sum for this i atom from the interaction with this j atom. */
1157             velec            = _mm_and_ps(velec,cutoff_mask);
1158             velec            = _mm_andnot_ps(dummy_mask,velec);
1159             velecsum         = _mm_add_ps(velecsum,velec);
1160
1161             fscal            = felec;
1162
1163             fscal            = _mm_and_ps(fscal,cutoff_mask);
1164
1165             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1166
1167             /* Calculate temporary vectorial force */
1168             tx               = _mm_mul_ps(fscal,dx13);
1169             ty               = _mm_mul_ps(fscal,dy13);
1170             tz               = _mm_mul_ps(fscal,dz13);
1171
1172             /* Update vectorial force */
1173             fix1             = _mm_add_ps(fix1,tx);
1174             fiy1             = _mm_add_ps(fiy1,ty);
1175             fiz1             = _mm_add_ps(fiz1,tz);
1176
1177             fjx3             = _mm_add_ps(fjx3,tx);
1178             fjy3             = _mm_add_ps(fjy3,ty);
1179             fjz3             = _mm_add_ps(fjz3,tz);
1180
1181             }
1182
1183             /**************************
1184              * CALCULATE INTERACTIONS *
1185              **************************/
1186
1187             if (gmx_mm_any_lt(rsq21,rcutoff2))
1188             {
1189
1190             r21              = _mm_mul_ps(rsq21,rinv21);
1191             r21              = _mm_andnot_ps(dummy_mask,r21);
1192
1193             /* EWALD ELECTROSTATICS */
1194
1195             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196             ewrt             = _mm_mul_ps(r21,ewtabscale);
1197             ewitab           = _mm_cvttps_epi32(ewrt);
1198             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1199             ewitab           = _mm_slli_epi32(ewitab,2);
1200             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1201             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1202             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1203             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1204             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1205             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1206             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1207             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1208             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1209
1210             d                = _mm_sub_ps(r21,rswitch);
1211             d                = _mm_max_ps(d,_mm_setzero_ps());
1212             d2               = _mm_mul_ps(d,d);
1213             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1214
1215             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1216
1217             /* Evaluate switch function */
1218             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1219             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1220             velec            = _mm_mul_ps(velec,sw);
1221             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1222
1223             /* Update potential sum for this i atom from the interaction with this j atom. */
1224             velec            = _mm_and_ps(velec,cutoff_mask);
1225             velec            = _mm_andnot_ps(dummy_mask,velec);
1226             velecsum         = _mm_add_ps(velecsum,velec);
1227
1228             fscal            = felec;
1229
1230             fscal            = _mm_and_ps(fscal,cutoff_mask);
1231
1232             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1233
1234             /* Calculate temporary vectorial force */
1235             tx               = _mm_mul_ps(fscal,dx21);
1236             ty               = _mm_mul_ps(fscal,dy21);
1237             tz               = _mm_mul_ps(fscal,dz21);
1238
1239             /* Update vectorial force */
1240             fix2             = _mm_add_ps(fix2,tx);
1241             fiy2             = _mm_add_ps(fiy2,ty);
1242             fiz2             = _mm_add_ps(fiz2,tz);
1243
1244             fjx1             = _mm_add_ps(fjx1,tx);
1245             fjy1             = _mm_add_ps(fjy1,ty);
1246             fjz1             = _mm_add_ps(fjz1,tz);
1247
1248             }
1249
1250             /**************************
1251              * CALCULATE INTERACTIONS *
1252              **************************/
1253
1254             if (gmx_mm_any_lt(rsq22,rcutoff2))
1255             {
1256
1257             r22              = _mm_mul_ps(rsq22,rinv22);
1258             r22              = _mm_andnot_ps(dummy_mask,r22);
1259
1260             /* EWALD ELECTROSTATICS */
1261
1262             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1263             ewrt             = _mm_mul_ps(r22,ewtabscale);
1264             ewitab           = _mm_cvttps_epi32(ewrt);
1265             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1266             ewitab           = _mm_slli_epi32(ewitab,2);
1267             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1268             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1269             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1270             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1271             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1272             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1273             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1274             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1275             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1276
1277             d                = _mm_sub_ps(r22,rswitch);
1278             d                = _mm_max_ps(d,_mm_setzero_ps());
1279             d2               = _mm_mul_ps(d,d);
1280             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1281
1282             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1283
1284             /* Evaluate switch function */
1285             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1286             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1287             velec            = _mm_mul_ps(velec,sw);
1288             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1289
1290             /* Update potential sum for this i atom from the interaction with this j atom. */
1291             velec            = _mm_and_ps(velec,cutoff_mask);
1292             velec            = _mm_andnot_ps(dummy_mask,velec);
1293             velecsum         = _mm_add_ps(velecsum,velec);
1294
1295             fscal            = felec;
1296
1297             fscal            = _mm_and_ps(fscal,cutoff_mask);
1298
1299             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1300
1301             /* Calculate temporary vectorial force */
1302             tx               = _mm_mul_ps(fscal,dx22);
1303             ty               = _mm_mul_ps(fscal,dy22);
1304             tz               = _mm_mul_ps(fscal,dz22);
1305
1306             /* Update vectorial force */
1307             fix2             = _mm_add_ps(fix2,tx);
1308             fiy2             = _mm_add_ps(fiy2,ty);
1309             fiz2             = _mm_add_ps(fiz2,tz);
1310
1311             fjx2             = _mm_add_ps(fjx2,tx);
1312             fjy2             = _mm_add_ps(fjy2,ty);
1313             fjz2             = _mm_add_ps(fjz2,tz);
1314
1315             }
1316
1317             /**************************
1318              * CALCULATE INTERACTIONS *
1319              **************************/
1320
1321             if (gmx_mm_any_lt(rsq23,rcutoff2))
1322             {
1323
1324             r23              = _mm_mul_ps(rsq23,rinv23);
1325             r23              = _mm_andnot_ps(dummy_mask,r23);
1326
1327             /* EWALD ELECTROSTATICS */
1328
1329             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1330             ewrt             = _mm_mul_ps(r23,ewtabscale);
1331             ewitab           = _mm_cvttps_epi32(ewrt);
1332             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1333             ewitab           = _mm_slli_epi32(ewitab,2);
1334             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1335             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1336             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1337             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1338             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1339             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1340             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1341             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1342             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1343
1344             d                = _mm_sub_ps(r23,rswitch);
1345             d                = _mm_max_ps(d,_mm_setzero_ps());
1346             d2               = _mm_mul_ps(d,d);
1347             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1348
1349             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1350
1351             /* Evaluate switch function */
1352             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1353             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1354             velec            = _mm_mul_ps(velec,sw);
1355             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1356
1357             /* Update potential sum for this i atom from the interaction with this j atom. */
1358             velec            = _mm_and_ps(velec,cutoff_mask);
1359             velec            = _mm_andnot_ps(dummy_mask,velec);
1360             velecsum         = _mm_add_ps(velecsum,velec);
1361
1362             fscal            = felec;
1363
1364             fscal            = _mm_and_ps(fscal,cutoff_mask);
1365
1366             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1367
1368             /* Calculate temporary vectorial force */
1369             tx               = _mm_mul_ps(fscal,dx23);
1370             ty               = _mm_mul_ps(fscal,dy23);
1371             tz               = _mm_mul_ps(fscal,dz23);
1372
1373             /* Update vectorial force */
1374             fix2             = _mm_add_ps(fix2,tx);
1375             fiy2             = _mm_add_ps(fiy2,ty);
1376             fiz2             = _mm_add_ps(fiz2,tz);
1377
1378             fjx3             = _mm_add_ps(fjx3,tx);
1379             fjy3             = _mm_add_ps(fjy3,ty);
1380             fjz3             = _mm_add_ps(fjz3,tz);
1381
1382             }
1383
1384             /**************************
1385              * CALCULATE INTERACTIONS *
1386              **************************/
1387
1388             if (gmx_mm_any_lt(rsq31,rcutoff2))
1389             {
1390
1391             r31              = _mm_mul_ps(rsq31,rinv31);
1392             r31              = _mm_andnot_ps(dummy_mask,r31);
1393
1394             /* EWALD ELECTROSTATICS */
1395
1396             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1397             ewrt             = _mm_mul_ps(r31,ewtabscale);
1398             ewitab           = _mm_cvttps_epi32(ewrt);
1399             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1400             ewitab           = _mm_slli_epi32(ewitab,2);
1401             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1402             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1403             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1404             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1405             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1406             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1407             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1408             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1409             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1410
1411             d                = _mm_sub_ps(r31,rswitch);
1412             d                = _mm_max_ps(d,_mm_setzero_ps());
1413             d2               = _mm_mul_ps(d,d);
1414             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1415
1416             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1417
1418             /* Evaluate switch function */
1419             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1420             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1421             velec            = _mm_mul_ps(velec,sw);
1422             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1423
1424             /* Update potential sum for this i atom from the interaction with this j atom. */
1425             velec            = _mm_and_ps(velec,cutoff_mask);
1426             velec            = _mm_andnot_ps(dummy_mask,velec);
1427             velecsum         = _mm_add_ps(velecsum,velec);
1428
1429             fscal            = felec;
1430
1431             fscal            = _mm_and_ps(fscal,cutoff_mask);
1432
1433             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1434
1435             /* Calculate temporary vectorial force */
1436             tx               = _mm_mul_ps(fscal,dx31);
1437             ty               = _mm_mul_ps(fscal,dy31);
1438             tz               = _mm_mul_ps(fscal,dz31);
1439
1440             /* Update vectorial force */
1441             fix3             = _mm_add_ps(fix3,tx);
1442             fiy3             = _mm_add_ps(fiy3,ty);
1443             fiz3             = _mm_add_ps(fiz3,tz);
1444
1445             fjx1             = _mm_add_ps(fjx1,tx);
1446             fjy1             = _mm_add_ps(fjy1,ty);
1447             fjz1             = _mm_add_ps(fjz1,tz);
1448
1449             }
1450
1451             /**************************
1452              * CALCULATE INTERACTIONS *
1453              **************************/
1454
1455             if (gmx_mm_any_lt(rsq32,rcutoff2))
1456             {
1457
1458             r32              = _mm_mul_ps(rsq32,rinv32);
1459             r32              = _mm_andnot_ps(dummy_mask,r32);
1460
1461             /* EWALD ELECTROSTATICS */
1462
1463             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1464             ewrt             = _mm_mul_ps(r32,ewtabscale);
1465             ewitab           = _mm_cvttps_epi32(ewrt);
1466             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1467             ewitab           = _mm_slli_epi32(ewitab,2);
1468             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1469             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1470             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1471             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1472             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1473             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1474             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1475             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1476             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1477
1478             d                = _mm_sub_ps(r32,rswitch);
1479             d                = _mm_max_ps(d,_mm_setzero_ps());
1480             d2               = _mm_mul_ps(d,d);
1481             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1482
1483             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1484
1485             /* Evaluate switch function */
1486             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1487             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1488             velec            = _mm_mul_ps(velec,sw);
1489             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1490
1491             /* Update potential sum for this i atom from the interaction with this j atom. */
1492             velec            = _mm_and_ps(velec,cutoff_mask);
1493             velec            = _mm_andnot_ps(dummy_mask,velec);
1494             velecsum         = _mm_add_ps(velecsum,velec);
1495
1496             fscal            = felec;
1497
1498             fscal            = _mm_and_ps(fscal,cutoff_mask);
1499
1500             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1501
1502             /* Calculate temporary vectorial force */
1503             tx               = _mm_mul_ps(fscal,dx32);
1504             ty               = _mm_mul_ps(fscal,dy32);
1505             tz               = _mm_mul_ps(fscal,dz32);
1506
1507             /* Update vectorial force */
1508             fix3             = _mm_add_ps(fix3,tx);
1509             fiy3             = _mm_add_ps(fiy3,ty);
1510             fiz3             = _mm_add_ps(fiz3,tz);
1511
1512             fjx2             = _mm_add_ps(fjx2,tx);
1513             fjy2             = _mm_add_ps(fjy2,ty);
1514             fjz2             = _mm_add_ps(fjz2,tz);
1515
1516             }
1517
1518             /**************************
1519              * CALCULATE INTERACTIONS *
1520              **************************/
1521
1522             if (gmx_mm_any_lt(rsq33,rcutoff2))
1523             {
1524
1525             r33              = _mm_mul_ps(rsq33,rinv33);
1526             r33              = _mm_andnot_ps(dummy_mask,r33);
1527
1528             /* EWALD ELECTROSTATICS */
1529
1530             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1531             ewrt             = _mm_mul_ps(r33,ewtabscale);
1532             ewitab           = _mm_cvttps_epi32(ewrt);
1533             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1534             ewitab           = _mm_slli_epi32(ewitab,2);
1535             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1536             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1537             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1538             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1539             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1540             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1541             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1542             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1543             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1544
1545             d                = _mm_sub_ps(r33,rswitch);
1546             d                = _mm_max_ps(d,_mm_setzero_ps());
1547             d2               = _mm_mul_ps(d,d);
1548             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1549
1550             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1551
1552             /* Evaluate switch function */
1553             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1554             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1555             velec            = _mm_mul_ps(velec,sw);
1556             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1557
1558             /* Update potential sum for this i atom from the interaction with this j atom. */
1559             velec            = _mm_and_ps(velec,cutoff_mask);
1560             velec            = _mm_andnot_ps(dummy_mask,velec);
1561             velecsum         = _mm_add_ps(velecsum,velec);
1562
1563             fscal            = felec;
1564
1565             fscal            = _mm_and_ps(fscal,cutoff_mask);
1566
1567             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1568
1569             /* Calculate temporary vectorial force */
1570             tx               = _mm_mul_ps(fscal,dx33);
1571             ty               = _mm_mul_ps(fscal,dy33);
1572             tz               = _mm_mul_ps(fscal,dz33);
1573
1574             /* Update vectorial force */
1575             fix3             = _mm_add_ps(fix3,tx);
1576             fiy3             = _mm_add_ps(fiy3,ty);
1577             fiz3             = _mm_add_ps(fiz3,tz);
1578
1579             fjx3             = _mm_add_ps(fjx3,tx);
1580             fjy3             = _mm_add_ps(fjy3,ty);
1581             fjz3             = _mm_add_ps(fjz3,tz);
1582
1583             }
1584
1585             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1586             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1587             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1588             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1589
1590             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1591                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1592
1593             /* Inner loop uses 594 flops */
1594         }
1595
1596         /* End of innermost loop */
1597
1598         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1599                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1600
1601         ggid                        = gid[iidx];
1602         /* Update potential energies */
1603         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1604
1605         /* Increment number of inner iterations */
1606         inneriter                  += j_index_end - j_index_start;
1607
1608         /* Outer loop uses 19 flops */
1609     }
1610
1611     /* Increment number of outer iterations */
1612     outeriter        += nri;
1613
1614     /* Update outer/inner flops */
1615
1616     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
1617 }
1618 /*
1619  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1620  * Electrostatics interaction: Ewald
1621  * VdW interaction:            None
1622  * Geometry:                   Water4-Water4
1623  * Calculate force/pot:        Force
1624  */
1625 void
1626 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1627                     (t_nblist                    * gmx_restrict       nlist,
1628                      rvec                        * gmx_restrict          xx,
1629                      rvec                        * gmx_restrict          ff,
1630                      t_forcerec                  * gmx_restrict          fr,
1631                      t_mdatoms                   * gmx_restrict     mdatoms,
1632                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1633                      t_nrnb                      * gmx_restrict        nrnb)
1634 {
1635     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1636      * just 0 for non-waters.
1637      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1638      * jnr indices corresponding to data put in the four positions in the SIMD register.
1639      */
1640     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1641     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1642     int              jnrA,jnrB,jnrC,jnrD;
1643     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1644     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1645     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1646     real             rcutoff_scalar;
1647     real             *shiftvec,*fshift,*x,*f;
1648     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1649     real             scratch[4*DIM];
1650     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1651     int              vdwioffset1;
1652     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1653     int              vdwioffset2;
1654     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1655     int              vdwioffset3;
1656     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1657     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1658     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1659     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1660     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1661     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1662     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1663     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1664     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1665     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1666     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1667     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1668     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1669     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1670     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1671     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1672     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1673     real             *charge;
1674     __m128i          ewitab;
1675     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1676     real             *ewtab;
1677     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1678     real             rswitch_scalar,d_scalar;
1679     __m128           dummy_mask,cutoff_mask;
1680     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1681     __m128           one     = _mm_set1_ps(1.0);
1682     __m128           two     = _mm_set1_ps(2.0);
1683     x                = xx[0];
1684     f                = ff[0];
1685
1686     nri              = nlist->nri;
1687     iinr             = nlist->iinr;
1688     jindex           = nlist->jindex;
1689     jjnr             = nlist->jjnr;
1690     shiftidx         = nlist->shift;
1691     gid              = nlist->gid;
1692     shiftvec         = fr->shift_vec[0];
1693     fshift           = fr->fshift[0];
1694     facel            = _mm_set1_ps(fr->epsfac);
1695     charge           = mdatoms->chargeA;
1696
1697     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1698     ewtab            = fr->ic->tabq_coul_FDV0;
1699     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1700     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1701
1702     /* Setup water-specific parameters */
1703     inr              = nlist->iinr[0];
1704     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1705     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1706     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1707
1708     jq1              = _mm_set1_ps(charge[inr+1]);
1709     jq2              = _mm_set1_ps(charge[inr+2]);
1710     jq3              = _mm_set1_ps(charge[inr+3]);
1711     qq11             = _mm_mul_ps(iq1,jq1);
1712     qq12             = _mm_mul_ps(iq1,jq2);
1713     qq13             = _mm_mul_ps(iq1,jq3);
1714     qq21             = _mm_mul_ps(iq2,jq1);
1715     qq22             = _mm_mul_ps(iq2,jq2);
1716     qq23             = _mm_mul_ps(iq2,jq3);
1717     qq31             = _mm_mul_ps(iq3,jq1);
1718     qq32             = _mm_mul_ps(iq3,jq2);
1719     qq33             = _mm_mul_ps(iq3,jq3);
1720
1721     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1722     rcutoff_scalar   = fr->rcoulomb;
1723     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1724     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1725
1726     rswitch_scalar   = fr->rcoulomb_switch;
1727     rswitch          = _mm_set1_ps(rswitch_scalar);
1728     /* Setup switch parameters */
1729     d_scalar         = rcutoff_scalar-rswitch_scalar;
1730     d                = _mm_set1_ps(d_scalar);
1731     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1732     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1733     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1734     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1735     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1736     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1737
1738     /* Avoid stupid compiler warnings */
1739     jnrA = jnrB = jnrC = jnrD = 0;
1740     j_coord_offsetA = 0;
1741     j_coord_offsetB = 0;
1742     j_coord_offsetC = 0;
1743     j_coord_offsetD = 0;
1744
1745     outeriter        = 0;
1746     inneriter        = 0;
1747
1748     for(iidx=0;iidx<4*DIM;iidx++)
1749     {
1750         scratch[iidx] = 0.0;
1751     }
1752
1753     /* Start outer loop over neighborlists */
1754     for(iidx=0; iidx<nri; iidx++)
1755     {
1756         /* Load shift vector for this list */
1757         i_shift_offset   = DIM*shiftidx[iidx];
1758
1759         /* Load limits for loop over neighbors */
1760         j_index_start    = jindex[iidx];
1761         j_index_end      = jindex[iidx+1];
1762
1763         /* Get outer coordinate index */
1764         inr              = iinr[iidx];
1765         i_coord_offset   = DIM*inr;
1766
1767         /* Load i particle coords and add shift vector */
1768         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1769                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1770
1771         fix1             = _mm_setzero_ps();
1772         fiy1             = _mm_setzero_ps();
1773         fiz1             = _mm_setzero_ps();
1774         fix2             = _mm_setzero_ps();
1775         fiy2             = _mm_setzero_ps();
1776         fiz2             = _mm_setzero_ps();
1777         fix3             = _mm_setzero_ps();
1778         fiy3             = _mm_setzero_ps();
1779         fiz3             = _mm_setzero_ps();
1780
1781         /* Start inner kernel loop */
1782         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1783         {
1784
1785             /* Get j neighbor index, and coordinate index */
1786             jnrA             = jjnr[jidx];
1787             jnrB             = jjnr[jidx+1];
1788             jnrC             = jjnr[jidx+2];
1789             jnrD             = jjnr[jidx+3];
1790             j_coord_offsetA  = DIM*jnrA;
1791             j_coord_offsetB  = DIM*jnrB;
1792             j_coord_offsetC  = DIM*jnrC;
1793             j_coord_offsetD  = DIM*jnrD;
1794
1795             /* load j atom coordinates */
1796             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1797                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1798                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1799
1800             /* Calculate displacement vector */
1801             dx11             = _mm_sub_ps(ix1,jx1);
1802             dy11             = _mm_sub_ps(iy1,jy1);
1803             dz11             = _mm_sub_ps(iz1,jz1);
1804             dx12             = _mm_sub_ps(ix1,jx2);
1805             dy12             = _mm_sub_ps(iy1,jy2);
1806             dz12             = _mm_sub_ps(iz1,jz2);
1807             dx13             = _mm_sub_ps(ix1,jx3);
1808             dy13             = _mm_sub_ps(iy1,jy3);
1809             dz13             = _mm_sub_ps(iz1,jz3);
1810             dx21             = _mm_sub_ps(ix2,jx1);
1811             dy21             = _mm_sub_ps(iy2,jy1);
1812             dz21             = _mm_sub_ps(iz2,jz1);
1813             dx22             = _mm_sub_ps(ix2,jx2);
1814             dy22             = _mm_sub_ps(iy2,jy2);
1815             dz22             = _mm_sub_ps(iz2,jz2);
1816             dx23             = _mm_sub_ps(ix2,jx3);
1817             dy23             = _mm_sub_ps(iy2,jy3);
1818             dz23             = _mm_sub_ps(iz2,jz3);
1819             dx31             = _mm_sub_ps(ix3,jx1);
1820             dy31             = _mm_sub_ps(iy3,jy1);
1821             dz31             = _mm_sub_ps(iz3,jz1);
1822             dx32             = _mm_sub_ps(ix3,jx2);
1823             dy32             = _mm_sub_ps(iy3,jy2);
1824             dz32             = _mm_sub_ps(iz3,jz2);
1825             dx33             = _mm_sub_ps(ix3,jx3);
1826             dy33             = _mm_sub_ps(iy3,jy3);
1827             dz33             = _mm_sub_ps(iz3,jz3);
1828
1829             /* Calculate squared distance and things based on it */
1830             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1831             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1832             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1833             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1834             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1835             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1836             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1837             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1838             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1839
1840             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1841             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1842             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1843             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1844             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1845             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1846             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1847             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1848             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1849
1850             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1851             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1852             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1853             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1854             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1855             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1856             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1857             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1858             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1859
1860             fjx1             = _mm_setzero_ps();
1861             fjy1             = _mm_setzero_ps();
1862             fjz1             = _mm_setzero_ps();
1863             fjx2             = _mm_setzero_ps();
1864             fjy2             = _mm_setzero_ps();
1865             fjz2             = _mm_setzero_ps();
1866             fjx3             = _mm_setzero_ps();
1867             fjy3             = _mm_setzero_ps();
1868             fjz3             = _mm_setzero_ps();
1869
1870             /**************************
1871              * CALCULATE INTERACTIONS *
1872              **************************/
1873
1874             if (gmx_mm_any_lt(rsq11,rcutoff2))
1875             {
1876
1877             r11              = _mm_mul_ps(rsq11,rinv11);
1878
1879             /* EWALD ELECTROSTATICS */
1880
1881             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1882             ewrt             = _mm_mul_ps(r11,ewtabscale);
1883             ewitab           = _mm_cvttps_epi32(ewrt);
1884             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1885             ewitab           = _mm_slli_epi32(ewitab,2);
1886             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1887             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1888             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1889             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1890             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1891             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1892             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1893             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1894             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1895
1896             d                = _mm_sub_ps(r11,rswitch);
1897             d                = _mm_max_ps(d,_mm_setzero_ps());
1898             d2               = _mm_mul_ps(d,d);
1899             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1900
1901             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1902
1903             /* Evaluate switch function */
1904             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1905             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1906             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1907
1908             fscal            = felec;
1909
1910             fscal            = _mm_and_ps(fscal,cutoff_mask);
1911
1912             /* Calculate temporary vectorial force */
1913             tx               = _mm_mul_ps(fscal,dx11);
1914             ty               = _mm_mul_ps(fscal,dy11);
1915             tz               = _mm_mul_ps(fscal,dz11);
1916
1917             /* Update vectorial force */
1918             fix1             = _mm_add_ps(fix1,tx);
1919             fiy1             = _mm_add_ps(fiy1,ty);
1920             fiz1             = _mm_add_ps(fiz1,tz);
1921
1922             fjx1             = _mm_add_ps(fjx1,tx);
1923             fjy1             = _mm_add_ps(fjy1,ty);
1924             fjz1             = _mm_add_ps(fjz1,tz);
1925
1926             }
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             if (gmx_mm_any_lt(rsq12,rcutoff2))
1933             {
1934
1935             r12              = _mm_mul_ps(rsq12,rinv12);
1936
1937             /* EWALD ELECTROSTATICS */
1938
1939             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1940             ewrt             = _mm_mul_ps(r12,ewtabscale);
1941             ewitab           = _mm_cvttps_epi32(ewrt);
1942             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1943             ewitab           = _mm_slli_epi32(ewitab,2);
1944             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1945             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1946             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1947             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1948             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1949             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1950             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1951             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1952             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1953
1954             d                = _mm_sub_ps(r12,rswitch);
1955             d                = _mm_max_ps(d,_mm_setzero_ps());
1956             d2               = _mm_mul_ps(d,d);
1957             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1958
1959             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1960
1961             /* Evaluate switch function */
1962             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1963             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1964             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1965
1966             fscal            = felec;
1967
1968             fscal            = _mm_and_ps(fscal,cutoff_mask);
1969
1970             /* Calculate temporary vectorial force */
1971             tx               = _mm_mul_ps(fscal,dx12);
1972             ty               = _mm_mul_ps(fscal,dy12);
1973             tz               = _mm_mul_ps(fscal,dz12);
1974
1975             /* Update vectorial force */
1976             fix1             = _mm_add_ps(fix1,tx);
1977             fiy1             = _mm_add_ps(fiy1,ty);
1978             fiz1             = _mm_add_ps(fiz1,tz);
1979
1980             fjx2             = _mm_add_ps(fjx2,tx);
1981             fjy2             = _mm_add_ps(fjy2,ty);
1982             fjz2             = _mm_add_ps(fjz2,tz);
1983
1984             }
1985
1986             /**************************
1987              * CALCULATE INTERACTIONS *
1988              **************************/
1989
1990             if (gmx_mm_any_lt(rsq13,rcutoff2))
1991             {
1992
1993             r13              = _mm_mul_ps(rsq13,rinv13);
1994
1995             /* EWALD ELECTROSTATICS */
1996
1997             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1998             ewrt             = _mm_mul_ps(r13,ewtabscale);
1999             ewitab           = _mm_cvttps_epi32(ewrt);
2000             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2001             ewitab           = _mm_slli_epi32(ewitab,2);
2002             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2003             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2004             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2005             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2006             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2007             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2008             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2009             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2010             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2011
2012             d                = _mm_sub_ps(r13,rswitch);
2013             d                = _mm_max_ps(d,_mm_setzero_ps());
2014             d2               = _mm_mul_ps(d,d);
2015             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2016
2017             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2018
2019             /* Evaluate switch function */
2020             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2021             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2022             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2023
2024             fscal            = felec;
2025
2026             fscal            = _mm_and_ps(fscal,cutoff_mask);
2027
2028             /* Calculate temporary vectorial force */
2029             tx               = _mm_mul_ps(fscal,dx13);
2030             ty               = _mm_mul_ps(fscal,dy13);
2031             tz               = _mm_mul_ps(fscal,dz13);
2032
2033             /* Update vectorial force */
2034             fix1             = _mm_add_ps(fix1,tx);
2035             fiy1             = _mm_add_ps(fiy1,ty);
2036             fiz1             = _mm_add_ps(fiz1,tz);
2037
2038             fjx3             = _mm_add_ps(fjx3,tx);
2039             fjy3             = _mm_add_ps(fjy3,ty);
2040             fjz3             = _mm_add_ps(fjz3,tz);
2041
2042             }
2043
2044             /**************************
2045              * CALCULATE INTERACTIONS *
2046              **************************/
2047
2048             if (gmx_mm_any_lt(rsq21,rcutoff2))
2049             {
2050
2051             r21              = _mm_mul_ps(rsq21,rinv21);
2052
2053             /* EWALD ELECTROSTATICS */
2054
2055             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2056             ewrt             = _mm_mul_ps(r21,ewtabscale);
2057             ewitab           = _mm_cvttps_epi32(ewrt);
2058             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2059             ewitab           = _mm_slli_epi32(ewitab,2);
2060             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2061             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2062             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2063             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2064             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2065             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2066             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2067             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2068             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2069
2070             d                = _mm_sub_ps(r21,rswitch);
2071             d                = _mm_max_ps(d,_mm_setzero_ps());
2072             d2               = _mm_mul_ps(d,d);
2073             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2074
2075             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2076
2077             /* Evaluate switch function */
2078             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2079             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2080             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2081
2082             fscal            = felec;
2083
2084             fscal            = _mm_and_ps(fscal,cutoff_mask);
2085
2086             /* Calculate temporary vectorial force */
2087             tx               = _mm_mul_ps(fscal,dx21);
2088             ty               = _mm_mul_ps(fscal,dy21);
2089             tz               = _mm_mul_ps(fscal,dz21);
2090
2091             /* Update vectorial force */
2092             fix2             = _mm_add_ps(fix2,tx);
2093             fiy2             = _mm_add_ps(fiy2,ty);
2094             fiz2             = _mm_add_ps(fiz2,tz);
2095
2096             fjx1             = _mm_add_ps(fjx1,tx);
2097             fjy1             = _mm_add_ps(fjy1,ty);
2098             fjz1             = _mm_add_ps(fjz1,tz);
2099
2100             }
2101
2102             /**************************
2103              * CALCULATE INTERACTIONS *
2104              **************************/
2105
2106             if (gmx_mm_any_lt(rsq22,rcutoff2))
2107             {
2108
2109             r22              = _mm_mul_ps(rsq22,rinv22);
2110
2111             /* EWALD ELECTROSTATICS */
2112
2113             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2114             ewrt             = _mm_mul_ps(r22,ewtabscale);
2115             ewitab           = _mm_cvttps_epi32(ewrt);
2116             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2117             ewitab           = _mm_slli_epi32(ewitab,2);
2118             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2119             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2120             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2121             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2122             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2123             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2124             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2125             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2126             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2127
2128             d                = _mm_sub_ps(r22,rswitch);
2129             d                = _mm_max_ps(d,_mm_setzero_ps());
2130             d2               = _mm_mul_ps(d,d);
2131             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2132
2133             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2134
2135             /* Evaluate switch function */
2136             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2137             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2138             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2139
2140             fscal            = felec;
2141
2142             fscal            = _mm_and_ps(fscal,cutoff_mask);
2143
2144             /* Calculate temporary vectorial force */
2145             tx               = _mm_mul_ps(fscal,dx22);
2146             ty               = _mm_mul_ps(fscal,dy22);
2147             tz               = _mm_mul_ps(fscal,dz22);
2148
2149             /* Update vectorial force */
2150             fix2             = _mm_add_ps(fix2,tx);
2151             fiy2             = _mm_add_ps(fiy2,ty);
2152             fiz2             = _mm_add_ps(fiz2,tz);
2153
2154             fjx2             = _mm_add_ps(fjx2,tx);
2155             fjy2             = _mm_add_ps(fjy2,ty);
2156             fjz2             = _mm_add_ps(fjz2,tz);
2157
2158             }
2159
2160             /**************************
2161              * CALCULATE INTERACTIONS *
2162              **************************/
2163
2164             if (gmx_mm_any_lt(rsq23,rcutoff2))
2165             {
2166
2167             r23              = _mm_mul_ps(rsq23,rinv23);
2168
2169             /* EWALD ELECTROSTATICS */
2170
2171             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2172             ewrt             = _mm_mul_ps(r23,ewtabscale);
2173             ewitab           = _mm_cvttps_epi32(ewrt);
2174             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2175             ewitab           = _mm_slli_epi32(ewitab,2);
2176             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2177             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2178             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2179             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2180             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2181             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2182             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2183             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2184             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2185
2186             d                = _mm_sub_ps(r23,rswitch);
2187             d                = _mm_max_ps(d,_mm_setzero_ps());
2188             d2               = _mm_mul_ps(d,d);
2189             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2190
2191             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2192
2193             /* Evaluate switch function */
2194             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2195             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2196             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2197
2198             fscal            = felec;
2199
2200             fscal            = _mm_and_ps(fscal,cutoff_mask);
2201
2202             /* Calculate temporary vectorial force */
2203             tx               = _mm_mul_ps(fscal,dx23);
2204             ty               = _mm_mul_ps(fscal,dy23);
2205             tz               = _mm_mul_ps(fscal,dz23);
2206
2207             /* Update vectorial force */
2208             fix2             = _mm_add_ps(fix2,tx);
2209             fiy2             = _mm_add_ps(fiy2,ty);
2210             fiz2             = _mm_add_ps(fiz2,tz);
2211
2212             fjx3             = _mm_add_ps(fjx3,tx);
2213             fjy3             = _mm_add_ps(fjy3,ty);
2214             fjz3             = _mm_add_ps(fjz3,tz);
2215
2216             }
2217
2218             /**************************
2219              * CALCULATE INTERACTIONS *
2220              **************************/
2221
2222             if (gmx_mm_any_lt(rsq31,rcutoff2))
2223             {
2224
2225             r31              = _mm_mul_ps(rsq31,rinv31);
2226
2227             /* EWALD ELECTROSTATICS */
2228
2229             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2230             ewrt             = _mm_mul_ps(r31,ewtabscale);
2231             ewitab           = _mm_cvttps_epi32(ewrt);
2232             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2233             ewitab           = _mm_slli_epi32(ewitab,2);
2234             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2235             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2236             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2237             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2238             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2239             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2240             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2241             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2242             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2243
2244             d                = _mm_sub_ps(r31,rswitch);
2245             d                = _mm_max_ps(d,_mm_setzero_ps());
2246             d2               = _mm_mul_ps(d,d);
2247             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2248
2249             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2250
2251             /* Evaluate switch function */
2252             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2253             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2254             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2255
2256             fscal            = felec;
2257
2258             fscal            = _mm_and_ps(fscal,cutoff_mask);
2259
2260             /* Calculate temporary vectorial force */
2261             tx               = _mm_mul_ps(fscal,dx31);
2262             ty               = _mm_mul_ps(fscal,dy31);
2263             tz               = _mm_mul_ps(fscal,dz31);
2264
2265             /* Update vectorial force */
2266             fix3             = _mm_add_ps(fix3,tx);
2267             fiy3             = _mm_add_ps(fiy3,ty);
2268             fiz3             = _mm_add_ps(fiz3,tz);
2269
2270             fjx1             = _mm_add_ps(fjx1,tx);
2271             fjy1             = _mm_add_ps(fjy1,ty);
2272             fjz1             = _mm_add_ps(fjz1,tz);
2273
2274             }
2275
2276             /**************************
2277              * CALCULATE INTERACTIONS *
2278              **************************/
2279
2280             if (gmx_mm_any_lt(rsq32,rcutoff2))
2281             {
2282
2283             r32              = _mm_mul_ps(rsq32,rinv32);
2284
2285             /* EWALD ELECTROSTATICS */
2286
2287             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288             ewrt             = _mm_mul_ps(r32,ewtabscale);
2289             ewitab           = _mm_cvttps_epi32(ewrt);
2290             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2291             ewitab           = _mm_slli_epi32(ewitab,2);
2292             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2293             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2294             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2295             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2296             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2297             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2298             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2299             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2300             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2301
2302             d                = _mm_sub_ps(r32,rswitch);
2303             d                = _mm_max_ps(d,_mm_setzero_ps());
2304             d2               = _mm_mul_ps(d,d);
2305             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2306
2307             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2308
2309             /* Evaluate switch function */
2310             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2311             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2312             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2313
2314             fscal            = felec;
2315
2316             fscal            = _mm_and_ps(fscal,cutoff_mask);
2317
2318             /* Calculate temporary vectorial force */
2319             tx               = _mm_mul_ps(fscal,dx32);
2320             ty               = _mm_mul_ps(fscal,dy32);
2321             tz               = _mm_mul_ps(fscal,dz32);
2322
2323             /* Update vectorial force */
2324             fix3             = _mm_add_ps(fix3,tx);
2325             fiy3             = _mm_add_ps(fiy3,ty);
2326             fiz3             = _mm_add_ps(fiz3,tz);
2327
2328             fjx2             = _mm_add_ps(fjx2,tx);
2329             fjy2             = _mm_add_ps(fjy2,ty);
2330             fjz2             = _mm_add_ps(fjz2,tz);
2331
2332             }
2333
2334             /**************************
2335              * CALCULATE INTERACTIONS *
2336              **************************/
2337
2338             if (gmx_mm_any_lt(rsq33,rcutoff2))
2339             {
2340
2341             r33              = _mm_mul_ps(rsq33,rinv33);
2342
2343             /* EWALD ELECTROSTATICS */
2344
2345             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2346             ewrt             = _mm_mul_ps(r33,ewtabscale);
2347             ewitab           = _mm_cvttps_epi32(ewrt);
2348             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2349             ewitab           = _mm_slli_epi32(ewitab,2);
2350             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2351             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2352             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2353             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2354             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2355             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2356             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2357             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2358             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2359
2360             d                = _mm_sub_ps(r33,rswitch);
2361             d                = _mm_max_ps(d,_mm_setzero_ps());
2362             d2               = _mm_mul_ps(d,d);
2363             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2364
2365             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2366
2367             /* Evaluate switch function */
2368             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2369             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2370             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2371
2372             fscal            = felec;
2373
2374             fscal            = _mm_and_ps(fscal,cutoff_mask);
2375
2376             /* Calculate temporary vectorial force */
2377             tx               = _mm_mul_ps(fscal,dx33);
2378             ty               = _mm_mul_ps(fscal,dy33);
2379             tz               = _mm_mul_ps(fscal,dz33);
2380
2381             /* Update vectorial force */
2382             fix3             = _mm_add_ps(fix3,tx);
2383             fiy3             = _mm_add_ps(fiy3,ty);
2384             fiz3             = _mm_add_ps(fiz3,tz);
2385
2386             fjx3             = _mm_add_ps(fjx3,tx);
2387             fjy3             = _mm_add_ps(fjy3,ty);
2388             fjz3             = _mm_add_ps(fjz3,tz);
2389
2390             }
2391
2392             fjptrA             = f+j_coord_offsetA;
2393             fjptrB             = f+j_coord_offsetB;
2394             fjptrC             = f+j_coord_offsetC;
2395             fjptrD             = f+j_coord_offsetD;
2396
2397             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2398                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2399
2400             /* Inner loop uses 558 flops */
2401         }
2402
2403         if(jidx<j_index_end)
2404         {
2405
2406             /* Get j neighbor index, and coordinate index */
2407             jnrlistA         = jjnr[jidx];
2408             jnrlistB         = jjnr[jidx+1];
2409             jnrlistC         = jjnr[jidx+2];
2410             jnrlistD         = jjnr[jidx+3];
2411             /* Sign of each element will be negative for non-real atoms.
2412              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2413              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2414              */
2415             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2416             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2417             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2418             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2419             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2420             j_coord_offsetA  = DIM*jnrA;
2421             j_coord_offsetB  = DIM*jnrB;
2422             j_coord_offsetC  = DIM*jnrC;
2423             j_coord_offsetD  = DIM*jnrD;
2424
2425             /* load j atom coordinates */
2426             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2427                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2428                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2429
2430             /* Calculate displacement vector */
2431             dx11             = _mm_sub_ps(ix1,jx1);
2432             dy11             = _mm_sub_ps(iy1,jy1);
2433             dz11             = _mm_sub_ps(iz1,jz1);
2434             dx12             = _mm_sub_ps(ix1,jx2);
2435             dy12             = _mm_sub_ps(iy1,jy2);
2436             dz12             = _mm_sub_ps(iz1,jz2);
2437             dx13             = _mm_sub_ps(ix1,jx3);
2438             dy13             = _mm_sub_ps(iy1,jy3);
2439             dz13             = _mm_sub_ps(iz1,jz3);
2440             dx21             = _mm_sub_ps(ix2,jx1);
2441             dy21             = _mm_sub_ps(iy2,jy1);
2442             dz21             = _mm_sub_ps(iz2,jz1);
2443             dx22             = _mm_sub_ps(ix2,jx2);
2444             dy22             = _mm_sub_ps(iy2,jy2);
2445             dz22             = _mm_sub_ps(iz2,jz2);
2446             dx23             = _mm_sub_ps(ix2,jx3);
2447             dy23             = _mm_sub_ps(iy2,jy3);
2448             dz23             = _mm_sub_ps(iz2,jz3);
2449             dx31             = _mm_sub_ps(ix3,jx1);
2450             dy31             = _mm_sub_ps(iy3,jy1);
2451             dz31             = _mm_sub_ps(iz3,jz1);
2452             dx32             = _mm_sub_ps(ix3,jx2);
2453             dy32             = _mm_sub_ps(iy3,jy2);
2454             dz32             = _mm_sub_ps(iz3,jz2);
2455             dx33             = _mm_sub_ps(ix3,jx3);
2456             dy33             = _mm_sub_ps(iy3,jy3);
2457             dz33             = _mm_sub_ps(iz3,jz3);
2458
2459             /* Calculate squared distance and things based on it */
2460             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2461             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2462             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2463             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2464             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2465             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2466             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2467             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2468             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2469
2470             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2471             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2472             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2473             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2474             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2475             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2476             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2477             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2478             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2479
2480             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2481             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2482             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2483             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2484             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2485             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2486             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2487             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2488             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2489
2490             fjx1             = _mm_setzero_ps();
2491             fjy1             = _mm_setzero_ps();
2492             fjz1             = _mm_setzero_ps();
2493             fjx2             = _mm_setzero_ps();
2494             fjy2             = _mm_setzero_ps();
2495             fjz2             = _mm_setzero_ps();
2496             fjx3             = _mm_setzero_ps();
2497             fjy3             = _mm_setzero_ps();
2498             fjz3             = _mm_setzero_ps();
2499
2500             /**************************
2501              * CALCULATE INTERACTIONS *
2502              **************************/
2503
2504             if (gmx_mm_any_lt(rsq11,rcutoff2))
2505             {
2506
2507             r11              = _mm_mul_ps(rsq11,rinv11);
2508             r11              = _mm_andnot_ps(dummy_mask,r11);
2509
2510             /* EWALD ELECTROSTATICS */
2511
2512             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2513             ewrt             = _mm_mul_ps(r11,ewtabscale);
2514             ewitab           = _mm_cvttps_epi32(ewrt);
2515             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2516             ewitab           = _mm_slli_epi32(ewitab,2);
2517             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2518             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2519             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2520             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2521             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2522             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2523             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2524             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2525             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2526
2527             d                = _mm_sub_ps(r11,rswitch);
2528             d                = _mm_max_ps(d,_mm_setzero_ps());
2529             d2               = _mm_mul_ps(d,d);
2530             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2531
2532             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2533
2534             /* Evaluate switch function */
2535             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2536             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2537             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2538
2539             fscal            = felec;
2540
2541             fscal            = _mm_and_ps(fscal,cutoff_mask);
2542
2543             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2544
2545             /* Calculate temporary vectorial force */
2546             tx               = _mm_mul_ps(fscal,dx11);
2547             ty               = _mm_mul_ps(fscal,dy11);
2548             tz               = _mm_mul_ps(fscal,dz11);
2549
2550             /* Update vectorial force */
2551             fix1             = _mm_add_ps(fix1,tx);
2552             fiy1             = _mm_add_ps(fiy1,ty);
2553             fiz1             = _mm_add_ps(fiz1,tz);
2554
2555             fjx1             = _mm_add_ps(fjx1,tx);
2556             fjy1             = _mm_add_ps(fjy1,ty);
2557             fjz1             = _mm_add_ps(fjz1,tz);
2558
2559             }
2560
2561             /**************************
2562              * CALCULATE INTERACTIONS *
2563              **************************/
2564
2565             if (gmx_mm_any_lt(rsq12,rcutoff2))
2566             {
2567
2568             r12              = _mm_mul_ps(rsq12,rinv12);
2569             r12              = _mm_andnot_ps(dummy_mask,r12);
2570
2571             /* EWALD ELECTROSTATICS */
2572
2573             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2574             ewrt             = _mm_mul_ps(r12,ewtabscale);
2575             ewitab           = _mm_cvttps_epi32(ewrt);
2576             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2577             ewitab           = _mm_slli_epi32(ewitab,2);
2578             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2579             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2580             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2581             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2582             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2583             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2584             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2585             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2586             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2587
2588             d                = _mm_sub_ps(r12,rswitch);
2589             d                = _mm_max_ps(d,_mm_setzero_ps());
2590             d2               = _mm_mul_ps(d,d);
2591             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2592
2593             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2594
2595             /* Evaluate switch function */
2596             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2597             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2598             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2599
2600             fscal            = felec;
2601
2602             fscal            = _mm_and_ps(fscal,cutoff_mask);
2603
2604             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2605
2606             /* Calculate temporary vectorial force */
2607             tx               = _mm_mul_ps(fscal,dx12);
2608             ty               = _mm_mul_ps(fscal,dy12);
2609             tz               = _mm_mul_ps(fscal,dz12);
2610
2611             /* Update vectorial force */
2612             fix1             = _mm_add_ps(fix1,tx);
2613             fiy1             = _mm_add_ps(fiy1,ty);
2614             fiz1             = _mm_add_ps(fiz1,tz);
2615
2616             fjx2             = _mm_add_ps(fjx2,tx);
2617             fjy2             = _mm_add_ps(fjy2,ty);
2618             fjz2             = _mm_add_ps(fjz2,tz);
2619
2620             }
2621
2622             /**************************
2623              * CALCULATE INTERACTIONS *
2624              **************************/
2625
2626             if (gmx_mm_any_lt(rsq13,rcutoff2))
2627             {
2628
2629             r13              = _mm_mul_ps(rsq13,rinv13);
2630             r13              = _mm_andnot_ps(dummy_mask,r13);
2631
2632             /* EWALD ELECTROSTATICS */
2633
2634             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2635             ewrt             = _mm_mul_ps(r13,ewtabscale);
2636             ewitab           = _mm_cvttps_epi32(ewrt);
2637             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2638             ewitab           = _mm_slli_epi32(ewitab,2);
2639             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2640             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2641             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2642             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2643             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2644             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2645             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2646             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2647             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2648
2649             d                = _mm_sub_ps(r13,rswitch);
2650             d                = _mm_max_ps(d,_mm_setzero_ps());
2651             d2               = _mm_mul_ps(d,d);
2652             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2653
2654             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2655
2656             /* Evaluate switch function */
2657             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2658             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2659             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2660
2661             fscal            = felec;
2662
2663             fscal            = _mm_and_ps(fscal,cutoff_mask);
2664
2665             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2666
2667             /* Calculate temporary vectorial force */
2668             tx               = _mm_mul_ps(fscal,dx13);
2669             ty               = _mm_mul_ps(fscal,dy13);
2670             tz               = _mm_mul_ps(fscal,dz13);
2671
2672             /* Update vectorial force */
2673             fix1             = _mm_add_ps(fix1,tx);
2674             fiy1             = _mm_add_ps(fiy1,ty);
2675             fiz1             = _mm_add_ps(fiz1,tz);
2676
2677             fjx3             = _mm_add_ps(fjx3,tx);
2678             fjy3             = _mm_add_ps(fjy3,ty);
2679             fjz3             = _mm_add_ps(fjz3,tz);
2680
2681             }
2682
2683             /**************************
2684              * CALCULATE INTERACTIONS *
2685              **************************/
2686
2687             if (gmx_mm_any_lt(rsq21,rcutoff2))
2688             {
2689
2690             r21              = _mm_mul_ps(rsq21,rinv21);
2691             r21              = _mm_andnot_ps(dummy_mask,r21);
2692
2693             /* EWALD ELECTROSTATICS */
2694
2695             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2696             ewrt             = _mm_mul_ps(r21,ewtabscale);
2697             ewitab           = _mm_cvttps_epi32(ewrt);
2698             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2699             ewitab           = _mm_slli_epi32(ewitab,2);
2700             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2701             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2702             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2703             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2704             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2705             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2706             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2707             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2708             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2709
2710             d                = _mm_sub_ps(r21,rswitch);
2711             d                = _mm_max_ps(d,_mm_setzero_ps());
2712             d2               = _mm_mul_ps(d,d);
2713             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2714
2715             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2716
2717             /* Evaluate switch function */
2718             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2719             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2720             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2721
2722             fscal            = felec;
2723
2724             fscal            = _mm_and_ps(fscal,cutoff_mask);
2725
2726             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2727
2728             /* Calculate temporary vectorial force */
2729             tx               = _mm_mul_ps(fscal,dx21);
2730             ty               = _mm_mul_ps(fscal,dy21);
2731             tz               = _mm_mul_ps(fscal,dz21);
2732
2733             /* Update vectorial force */
2734             fix2             = _mm_add_ps(fix2,tx);
2735             fiy2             = _mm_add_ps(fiy2,ty);
2736             fiz2             = _mm_add_ps(fiz2,tz);
2737
2738             fjx1             = _mm_add_ps(fjx1,tx);
2739             fjy1             = _mm_add_ps(fjy1,ty);
2740             fjz1             = _mm_add_ps(fjz1,tz);
2741
2742             }
2743
2744             /**************************
2745              * CALCULATE INTERACTIONS *
2746              **************************/
2747
2748             if (gmx_mm_any_lt(rsq22,rcutoff2))
2749             {
2750
2751             r22              = _mm_mul_ps(rsq22,rinv22);
2752             r22              = _mm_andnot_ps(dummy_mask,r22);
2753
2754             /* EWALD ELECTROSTATICS */
2755
2756             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2757             ewrt             = _mm_mul_ps(r22,ewtabscale);
2758             ewitab           = _mm_cvttps_epi32(ewrt);
2759             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2760             ewitab           = _mm_slli_epi32(ewitab,2);
2761             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2762             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2763             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2764             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2765             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2766             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2767             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2768             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2769             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2770
2771             d                = _mm_sub_ps(r22,rswitch);
2772             d                = _mm_max_ps(d,_mm_setzero_ps());
2773             d2               = _mm_mul_ps(d,d);
2774             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2775
2776             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2777
2778             /* Evaluate switch function */
2779             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2780             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2781             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2782
2783             fscal            = felec;
2784
2785             fscal            = _mm_and_ps(fscal,cutoff_mask);
2786
2787             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2788
2789             /* Calculate temporary vectorial force */
2790             tx               = _mm_mul_ps(fscal,dx22);
2791             ty               = _mm_mul_ps(fscal,dy22);
2792             tz               = _mm_mul_ps(fscal,dz22);
2793
2794             /* Update vectorial force */
2795             fix2             = _mm_add_ps(fix2,tx);
2796             fiy2             = _mm_add_ps(fiy2,ty);
2797             fiz2             = _mm_add_ps(fiz2,tz);
2798
2799             fjx2             = _mm_add_ps(fjx2,tx);
2800             fjy2             = _mm_add_ps(fjy2,ty);
2801             fjz2             = _mm_add_ps(fjz2,tz);
2802
2803             }
2804
2805             /**************************
2806              * CALCULATE INTERACTIONS *
2807              **************************/
2808
2809             if (gmx_mm_any_lt(rsq23,rcutoff2))
2810             {
2811
2812             r23              = _mm_mul_ps(rsq23,rinv23);
2813             r23              = _mm_andnot_ps(dummy_mask,r23);
2814
2815             /* EWALD ELECTROSTATICS */
2816
2817             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2818             ewrt             = _mm_mul_ps(r23,ewtabscale);
2819             ewitab           = _mm_cvttps_epi32(ewrt);
2820             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2821             ewitab           = _mm_slli_epi32(ewitab,2);
2822             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2823             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2824             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2825             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2826             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2827             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2828             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2829             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2830             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2831
2832             d                = _mm_sub_ps(r23,rswitch);
2833             d                = _mm_max_ps(d,_mm_setzero_ps());
2834             d2               = _mm_mul_ps(d,d);
2835             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2836
2837             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2838
2839             /* Evaluate switch function */
2840             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2841             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2842             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2843
2844             fscal            = felec;
2845
2846             fscal            = _mm_and_ps(fscal,cutoff_mask);
2847
2848             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2849
2850             /* Calculate temporary vectorial force */
2851             tx               = _mm_mul_ps(fscal,dx23);
2852             ty               = _mm_mul_ps(fscal,dy23);
2853             tz               = _mm_mul_ps(fscal,dz23);
2854
2855             /* Update vectorial force */
2856             fix2             = _mm_add_ps(fix2,tx);
2857             fiy2             = _mm_add_ps(fiy2,ty);
2858             fiz2             = _mm_add_ps(fiz2,tz);
2859
2860             fjx3             = _mm_add_ps(fjx3,tx);
2861             fjy3             = _mm_add_ps(fjy3,ty);
2862             fjz3             = _mm_add_ps(fjz3,tz);
2863
2864             }
2865
2866             /**************************
2867              * CALCULATE INTERACTIONS *
2868              **************************/
2869
2870             if (gmx_mm_any_lt(rsq31,rcutoff2))
2871             {
2872
2873             r31              = _mm_mul_ps(rsq31,rinv31);
2874             r31              = _mm_andnot_ps(dummy_mask,r31);
2875
2876             /* EWALD ELECTROSTATICS */
2877
2878             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2879             ewrt             = _mm_mul_ps(r31,ewtabscale);
2880             ewitab           = _mm_cvttps_epi32(ewrt);
2881             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2882             ewitab           = _mm_slli_epi32(ewitab,2);
2883             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2884             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2885             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2886             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2887             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2888             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2889             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2890             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2891             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2892
2893             d                = _mm_sub_ps(r31,rswitch);
2894             d                = _mm_max_ps(d,_mm_setzero_ps());
2895             d2               = _mm_mul_ps(d,d);
2896             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2897
2898             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2899
2900             /* Evaluate switch function */
2901             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2902             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2903             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2904
2905             fscal            = felec;
2906
2907             fscal            = _mm_and_ps(fscal,cutoff_mask);
2908
2909             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2910
2911             /* Calculate temporary vectorial force */
2912             tx               = _mm_mul_ps(fscal,dx31);
2913             ty               = _mm_mul_ps(fscal,dy31);
2914             tz               = _mm_mul_ps(fscal,dz31);
2915
2916             /* Update vectorial force */
2917             fix3             = _mm_add_ps(fix3,tx);
2918             fiy3             = _mm_add_ps(fiy3,ty);
2919             fiz3             = _mm_add_ps(fiz3,tz);
2920
2921             fjx1             = _mm_add_ps(fjx1,tx);
2922             fjy1             = _mm_add_ps(fjy1,ty);
2923             fjz1             = _mm_add_ps(fjz1,tz);
2924
2925             }
2926
2927             /**************************
2928              * CALCULATE INTERACTIONS *
2929              **************************/
2930
2931             if (gmx_mm_any_lt(rsq32,rcutoff2))
2932             {
2933
2934             r32              = _mm_mul_ps(rsq32,rinv32);
2935             r32              = _mm_andnot_ps(dummy_mask,r32);
2936
2937             /* EWALD ELECTROSTATICS */
2938
2939             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2940             ewrt             = _mm_mul_ps(r32,ewtabscale);
2941             ewitab           = _mm_cvttps_epi32(ewrt);
2942             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2943             ewitab           = _mm_slli_epi32(ewitab,2);
2944             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2945             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2946             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2947             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2948             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2949             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2950             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2951             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2952             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2953
2954             d                = _mm_sub_ps(r32,rswitch);
2955             d                = _mm_max_ps(d,_mm_setzero_ps());
2956             d2               = _mm_mul_ps(d,d);
2957             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2958
2959             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2960
2961             /* Evaluate switch function */
2962             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2963             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2964             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2965
2966             fscal            = felec;
2967
2968             fscal            = _mm_and_ps(fscal,cutoff_mask);
2969
2970             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2971
2972             /* Calculate temporary vectorial force */
2973             tx               = _mm_mul_ps(fscal,dx32);
2974             ty               = _mm_mul_ps(fscal,dy32);
2975             tz               = _mm_mul_ps(fscal,dz32);
2976
2977             /* Update vectorial force */
2978             fix3             = _mm_add_ps(fix3,tx);
2979             fiy3             = _mm_add_ps(fiy3,ty);
2980             fiz3             = _mm_add_ps(fiz3,tz);
2981
2982             fjx2             = _mm_add_ps(fjx2,tx);
2983             fjy2             = _mm_add_ps(fjy2,ty);
2984             fjz2             = _mm_add_ps(fjz2,tz);
2985
2986             }
2987
2988             /**************************
2989              * CALCULATE INTERACTIONS *
2990              **************************/
2991
2992             if (gmx_mm_any_lt(rsq33,rcutoff2))
2993             {
2994
2995             r33              = _mm_mul_ps(rsq33,rinv33);
2996             r33              = _mm_andnot_ps(dummy_mask,r33);
2997
2998             /* EWALD ELECTROSTATICS */
2999
3000             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3001             ewrt             = _mm_mul_ps(r33,ewtabscale);
3002             ewitab           = _mm_cvttps_epi32(ewrt);
3003             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3004             ewitab           = _mm_slli_epi32(ewitab,2);
3005             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3006             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3007             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3008             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3009             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3010             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3011             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3012             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3013             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3014
3015             d                = _mm_sub_ps(r33,rswitch);
3016             d                = _mm_max_ps(d,_mm_setzero_ps());
3017             d2               = _mm_mul_ps(d,d);
3018             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3019
3020             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3021
3022             /* Evaluate switch function */
3023             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3024             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3025             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3026
3027             fscal            = felec;
3028
3029             fscal            = _mm_and_ps(fscal,cutoff_mask);
3030
3031             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3032
3033             /* Calculate temporary vectorial force */
3034             tx               = _mm_mul_ps(fscal,dx33);
3035             ty               = _mm_mul_ps(fscal,dy33);
3036             tz               = _mm_mul_ps(fscal,dz33);
3037
3038             /* Update vectorial force */
3039             fix3             = _mm_add_ps(fix3,tx);
3040             fiy3             = _mm_add_ps(fiy3,ty);
3041             fiz3             = _mm_add_ps(fiz3,tz);
3042
3043             fjx3             = _mm_add_ps(fjx3,tx);
3044             fjy3             = _mm_add_ps(fjy3,ty);
3045             fjz3             = _mm_add_ps(fjz3,tz);
3046
3047             }
3048
3049             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3050             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3051             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3052             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3053
3054             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3055                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3056
3057             /* Inner loop uses 567 flops */
3058         }
3059
3060         /* End of innermost loop */
3061
3062         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3063                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
3064
3065         /* Increment number of inner iterations */
3066         inneriter                  += j_index_end - j_index_start;
3067
3068         /* Outer loop uses 18 flops */
3069     }
3070
3071     /* Increment number of outer iterations */
3072     outeriter        += nri;
3073
3074     /* Update outer/inner flops */
3075
3076     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);
3077 }