6add1448d31f758468d129959da4383d6ff0d778
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            None
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset1;
84     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     int              vdwioffset2;
86     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     int              vdwioffset3;
88     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
90     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
92     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
94     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
98     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
101     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
102     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
103     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     __m128i          ewitab;
107     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108     real             *ewtab;
109     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
110     real             rswitch_scalar,d_scalar;
111     __m128           dummy_mask,cutoff_mask;
112     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
113     __m128           one     = _mm_set1_ps(1.0);
114     __m128           two     = _mm_set1_ps(2.0);
115     x                = xx[0];
116     f                = ff[0];
117
118     nri              = nlist->nri;
119     iinr             = nlist->iinr;
120     jindex           = nlist->jindex;
121     jjnr             = nlist->jjnr;
122     shiftidx         = nlist->shift;
123     gid              = nlist->gid;
124     shiftvec         = fr->shift_vec[0];
125     fshift           = fr->fshift[0];
126     facel            = _mm_set1_ps(fr->epsfac);
127     charge           = mdatoms->chargeA;
128
129     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
130     ewtab            = fr->ic->tabq_coul_FDV0;
131     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
132     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
133
134     /* Setup water-specific parameters */
135     inr              = nlist->iinr[0];
136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
139
140     jq1              = _mm_set1_ps(charge[inr+1]);
141     jq2              = _mm_set1_ps(charge[inr+2]);
142     jq3              = _mm_set1_ps(charge[inr+3]);
143     qq11             = _mm_mul_ps(iq1,jq1);
144     qq12             = _mm_mul_ps(iq1,jq2);
145     qq13             = _mm_mul_ps(iq1,jq3);
146     qq21             = _mm_mul_ps(iq2,jq1);
147     qq22             = _mm_mul_ps(iq2,jq2);
148     qq23             = _mm_mul_ps(iq2,jq3);
149     qq31             = _mm_mul_ps(iq3,jq1);
150     qq32             = _mm_mul_ps(iq3,jq2);
151     qq33             = _mm_mul_ps(iq3,jq3);
152
153     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
154     rcutoff_scalar   = fr->rcoulomb;
155     rcutoff          = _mm_set1_ps(rcutoff_scalar);
156     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
157
158     rswitch_scalar   = fr->rcoulomb_switch;
159     rswitch          = _mm_set1_ps(rswitch_scalar);
160     /* Setup switch parameters */
161     d_scalar         = rcutoff_scalar-rswitch_scalar;
162     d                = _mm_set1_ps(d_scalar);
163     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
164     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
165     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
166     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
167     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
168     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
169
170     /* Avoid stupid compiler warnings */
171     jnrA = jnrB = jnrC = jnrD = 0;
172     j_coord_offsetA = 0;
173     j_coord_offsetB = 0;
174     j_coord_offsetC = 0;
175     j_coord_offsetD = 0;
176
177     outeriter        = 0;
178     inneriter        = 0;
179
180     for(iidx=0;iidx<4*DIM;iidx++)
181     {
182         scratch[iidx] = 0.0;
183     }
184
185     /* Start outer loop over neighborlists */
186     for(iidx=0; iidx<nri; iidx++)
187     {
188         /* Load shift vector for this list */
189         i_shift_offset   = DIM*shiftidx[iidx];
190
191         /* Load limits for loop over neighbors */
192         j_index_start    = jindex[iidx];
193         j_index_end      = jindex[iidx+1];
194
195         /* Get outer coordinate index */
196         inr              = iinr[iidx];
197         i_coord_offset   = DIM*inr;
198
199         /* Load i particle coords and add shift vector */
200         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
201                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
202
203         fix1             = _mm_setzero_ps();
204         fiy1             = _mm_setzero_ps();
205         fiz1             = _mm_setzero_ps();
206         fix2             = _mm_setzero_ps();
207         fiy2             = _mm_setzero_ps();
208         fiz2             = _mm_setzero_ps();
209         fix3             = _mm_setzero_ps();
210         fiy3             = _mm_setzero_ps();
211         fiz3             = _mm_setzero_ps();
212
213         /* Reset potential sums */
214         velecsum         = _mm_setzero_ps();
215
216         /* Start inner kernel loop */
217         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218         {
219
220             /* Get j neighbor index, and coordinate index */
221             jnrA             = jjnr[jidx];
222             jnrB             = jjnr[jidx+1];
223             jnrC             = jjnr[jidx+2];
224             jnrD             = jjnr[jidx+3];
225             j_coord_offsetA  = DIM*jnrA;
226             j_coord_offsetB  = DIM*jnrB;
227             j_coord_offsetC  = DIM*jnrC;
228             j_coord_offsetD  = DIM*jnrD;
229
230             /* load j atom coordinates */
231             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
232                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
233                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
234
235             /* Calculate displacement vector */
236             dx11             = _mm_sub_ps(ix1,jx1);
237             dy11             = _mm_sub_ps(iy1,jy1);
238             dz11             = _mm_sub_ps(iz1,jz1);
239             dx12             = _mm_sub_ps(ix1,jx2);
240             dy12             = _mm_sub_ps(iy1,jy2);
241             dz12             = _mm_sub_ps(iz1,jz2);
242             dx13             = _mm_sub_ps(ix1,jx3);
243             dy13             = _mm_sub_ps(iy1,jy3);
244             dz13             = _mm_sub_ps(iz1,jz3);
245             dx21             = _mm_sub_ps(ix2,jx1);
246             dy21             = _mm_sub_ps(iy2,jy1);
247             dz21             = _mm_sub_ps(iz2,jz1);
248             dx22             = _mm_sub_ps(ix2,jx2);
249             dy22             = _mm_sub_ps(iy2,jy2);
250             dz22             = _mm_sub_ps(iz2,jz2);
251             dx23             = _mm_sub_ps(ix2,jx3);
252             dy23             = _mm_sub_ps(iy2,jy3);
253             dz23             = _mm_sub_ps(iz2,jz3);
254             dx31             = _mm_sub_ps(ix3,jx1);
255             dy31             = _mm_sub_ps(iy3,jy1);
256             dz31             = _mm_sub_ps(iz3,jz1);
257             dx32             = _mm_sub_ps(ix3,jx2);
258             dy32             = _mm_sub_ps(iy3,jy2);
259             dz32             = _mm_sub_ps(iz3,jz2);
260             dx33             = _mm_sub_ps(ix3,jx3);
261             dy33             = _mm_sub_ps(iy3,jy3);
262             dz33             = _mm_sub_ps(iz3,jz3);
263
264             /* Calculate squared distance and things based on it */
265             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
266             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
267             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
268             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
269             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
270             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
271             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
272             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
273             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
274
275             rinv11           = gmx_mm_invsqrt_ps(rsq11);
276             rinv12           = gmx_mm_invsqrt_ps(rsq12);
277             rinv13           = gmx_mm_invsqrt_ps(rsq13);
278             rinv21           = gmx_mm_invsqrt_ps(rsq21);
279             rinv22           = gmx_mm_invsqrt_ps(rsq22);
280             rinv23           = gmx_mm_invsqrt_ps(rsq23);
281             rinv31           = gmx_mm_invsqrt_ps(rsq31);
282             rinv32           = gmx_mm_invsqrt_ps(rsq32);
283             rinv33           = gmx_mm_invsqrt_ps(rsq33);
284
285             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
286             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
287             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
288             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
289             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
290             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
291             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
292             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
293             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
294
295             fjx1             = _mm_setzero_ps();
296             fjy1             = _mm_setzero_ps();
297             fjz1             = _mm_setzero_ps();
298             fjx2             = _mm_setzero_ps();
299             fjy2             = _mm_setzero_ps();
300             fjz2             = _mm_setzero_ps();
301             fjx3             = _mm_setzero_ps();
302             fjy3             = _mm_setzero_ps();
303             fjz3             = _mm_setzero_ps();
304
305             /**************************
306              * CALCULATE INTERACTIONS *
307              **************************/
308
309             if (gmx_mm_any_lt(rsq11,rcutoff2))
310             {
311
312             r11              = _mm_mul_ps(rsq11,rinv11);
313
314             /* EWALD ELECTROSTATICS */
315
316             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
317             ewrt             = _mm_mul_ps(r11,ewtabscale);
318             ewitab           = _mm_cvttps_epi32(ewrt);
319             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
320             ewitab           = _mm_slli_epi32(ewitab,2);
321             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
322             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
323             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
324             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
325             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
326             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
327             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
328             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
329             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
330
331             d                = _mm_sub_ps(r11,rswitch);
332             d                = _mm_max_ps(d,_mm_setzero_ps());
333             d2               = _mm_mul_ps(d,d);
334             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
335
336             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
337
338             /* Evaluate switch function */
339             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
340             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
341             velec            = _mm_mul_ps(velec,sw);
342             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
343
344             /* Update potential sum for this i atom from the interaction with this j atom. */
345             velec            = _mm_and_ps(velec,cutoff_mask);
346             velecsum         = _mm_add_ps(velecsum,velec);
347
348             fscal            = felec;
349
350             fscal            = _mm_and_ps(fscal,cutoff_mask);
351
352             /* Calculate temporary vectorial force */
353             tx               = _mm_mul_ps(fscal,dx11);
354             ty               = _mm_mul_ps(fscal,dy11);
355             tz               = _mm_mul_ps(fscal,dz11);
356
357             /* Update vectorial force */
358             fix1             = _mm_add_ps(fix1,tx);
359             fiy1             = _mm_add_ps(fiy1,ty);
360             fiz1             = _mm_add_ps(fiz1,tz);
361
362             fjx1             = _mm_add_ps(fjx1,tx);
363             fjy1             = _mm_add_ps(fjy1,ty);
364             fjz1             = _mm_add_ps(fjz1,tz);
365
366             }
367
368             /**************************
369              * CALCULATE INTERACTIONS *
370              **************************/
371
372             if (gmx_mm_any_lt(rsq12,rcutoff2))
373             {
374
375             r12              = _mm_mul_ps(rsq12,rinv12);
376
377             /* EWALD ELECTROSTATICS */
378
379             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
380             ewrt             = _mm_mul_ps(r12,ewtabscale);
381             ewitab           = _mm_cvttps_epi32(ewrt);
382             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
383             ewitab           = _mm_slli_epi32(ewitab,2);
384             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
385             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
386             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
387             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
388             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
389             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
390             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
391             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
392             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
393
394             d                = _mm_sub_ps(r12,rswitch);
395             d                = _mm_max_ps(d,_mm_setzero_ps());
396             d2               = _mm_mul_ps(d,d);
397             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
398
399             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
400
401             /* Evaluate switch function */
402             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
403             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
404             velec            = _mm_mul_ps(velec,sw);
405             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
406
407             /* Update potential sum for this i atom from the interaction with this j atom. */
408             velec            = _mm_and_ps(velec,cutoff_mask);
409             velecsum         = _mm_add_ps(velecsum,velec);
410
411             fscal            = felec;
412
413             fscal            = _mm_and_ps(fscal,cutoff_mask);
414
415             /* Calculate temporary vectorial force */
416             tx               = _mm_mul_ps(fscal,dx12);
417             ty               = _mm_mul_ps(fscal,dy12);
418             tz               = _mm_mul_ps(fscal,dz12);
419
420             /* Update vectorial force */
421             fix1             = _mm_add_ps(fix1,tx);
422             fiy1             = _mm_add_ps(fiy1,ty);
423             fiz1             = _mm_add_ps(fiz1,tz);
424
425             fjx2             = _mm_add_ps(fjx2,tx);
426             fjy2             = _mm_add_ps(fjy2,ty);
427             fjz2             = _mm_add_ps(fjz2,tz);
428
429             }
430
431             /**************************
432              * CALCULATE INTERACTIONS *
433              **************************/
434
435             if (gmx_mm_any_lt(rsq13,rcutoff2))
436             {
437
438             r13              = _mm_mul_ps(rsq13,rinv13);
439
440             /* EWALD ELECTROSTATICS */
441
442             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
443             ewrt             = _mm_mul_ps(r13,ewtabscale);
444             ewitab           = _mm_cvttps_epi32(ewrt);
445             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
446             ewitab           = _mm_slli_epi32(ewitab,2);
447             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
448             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
449             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
450             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
451             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
452             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
453             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
454             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
455             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
456
457             d                = _mm_sub_ps(r13,rswitch);
458             d                = _mm_max_ps(d,_mm_setzero_ps());
459             d2               = _mm_mul_ps(d,d);
460             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
461
462             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
463
464             /* Evaluate switch function */
465             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
466             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
467             velec            = _mm_mul_ps(velec,sw);
468             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
469
470             /* Update potential sum for this i atom from the interaction with this j atom. */
471             velec            = _mm_and_ps(velec,cutoff_mask);
472             velecsum         = _mm_add_ps(velecsum,velec);
473
474             fscal            = felec;
475
476             fscal            = _mm_and_ps(fscal,cutoff_mask);
477
478             /* Calculate temporary vectorial force */
479             tx               = _mm_mul_ps(fscal,dx13);
480             ty               = _mm_mul_ps(fscal,dy13);
481             tz               = _mm_mul_ps(fscal,dz13);
482
483             /* Update vectorial force */
484             fix1             = _mm_add_ps(fix1,tx);
485             fiy1             = _mm_add_ps(fiy1,ty);
486             fiz1             = _mm_add_ps(fiz1,tz);
487
488             fjx3             = _mm_add_ps(fjx3,tx);
489             fjy3             = _mm_add_ps(fjy3,ty);
490             fjz3             = _mm_add_ps(fjz3,tz);
491
492             }
493
494             /**************************
495              * CALCULATE INTERACTIONS *
496              **************************/
497
498             if (gmx_mm_any_lt(rsq21,rcutoff2))
499             {
500
501             r21              = _mm_mul_ps(rsq21,rinv21);
502
503             /* EWALD ELECTROSTATICS */
504
505             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
506             ewrt             = _mm_mul_ps(r21,ewtabscale);
507             ewitab           = _mm_cvttps_epi32(ewrt);
508             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
509             ewitab           = _mm_slli_epi32(ewitab,2);
510             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
511             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
512             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
513             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
514             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
515             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
516             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
517             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
518             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
519
520             d                = _mm_sub_ps(r21,rswitch);
521             d                = _mm_max_ps(d,_mm_setzero_ps());
522             d2               = _mm_mul_ps(d,d);
523             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
524
525             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
526
527             /* Evaluate switch function */
528             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
529             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
530             velec            = _mm_mul_ps(velec,sw);
531             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
532
533             /* Update potential sum for this i atom from the interaction with this j atom. */
534             velec            = _mm_and_ps(velec,cutoff_mask);
535             velecsum         = _mm_add_ps(velecsum,velec);
536
537             fscal            = felec;
538
539             fscal            = _mm_and_ps(fscal,cutoff_mask);
540
541             /* Calculate temporary vectorial force */
542             tx               = _mm_mul_ps(fscal,dx21);
543             ty               = _mm_mul_ps(fscal,dy21);
544             tz               = _mm_mul_ps(fscal,dz21);
545
546             /* Update vectorial force */
547             fix2             = _mm_add_ps(fix2,tx);
548             fiy2             = _mm_add_ps(fiy2,ty);
549             fiz2             = _mm_add_ps(fiz2,tz);
550
551             fjx1             = _mm_add_ps(fjx1,tx);
552             fjy1             = _mm_add_ps(fjy1,ty);
553             fjz1             = _mm_add_ps(fjz1,tz);
554
555             }
556
557             /**************************
558              * CALCULATE INTERACTIONS *
559              **************************/
560
561             if (gmx_mm_any_lt(rsq22,rcutoff2))
562             {
563
564             r22              = _mm_mul_ps(rsq22,rinv22);
565
566             /* EWALD ELECTROSTATICS */
567
568             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
569             ewrt             = _mm_mul_ps(r22,ewtabscale);
570             ewitab           = _mm_cvttps_epi32(ewrt);
571             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
572             ewitab           = _mm_slli_epi32(ewitab,2);
573             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
574             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
575             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
576             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
577             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
578             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
579             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
580             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
581             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
582
583             d                = _mm_sub_ps(r22,rswitch);
584             d                = _mm_max_ps(d,_mm_setzero_ps());
585             d2               = _mm_mul_ps(d,d);
586             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
587
588             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
589
590             /* Evaluate switch function */
591             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
592             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
593             velec            = _mm_mul_ps(velec,sw);
594             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
595
596             /* Update potential sum for this i atom from the interaction with this j atom. */
597             velec            = _mm_and_ps(velec,cutoff_mask);
598             velecsum         = _mm_add_ps(velecsum,velec);
599
600             fscal            = felec;
601
602             fscal            = _mm_and_ps(fscal,cutoff_mask);
603
604             /* Calculate temporary vectorial force */
605             tx               = _mm_mul_ps(fscal,dx22);
606             ty               = _mm_mul_ps(fscal,dy22);
607             tz               = _mm_mul_ps(fscal,dz22);
608
609             /* Update vectorial force */
610             fix2             = _mm_add_ps(fix2,tx);
611             fiy2             = _mm_add_ps(fiy2,ty);
612             fiz2             = _mm_add_ps(fiz2,tz);
613
614             fjx2             = _mm_add_ps(fjx2,tx);
615             fjy2             = _mm_add_ps(fjy2,ty);
616             fjz2             = _mm_add_ps(fjz2,tz);
617
618             }
619
620             /**************************
621              * CALCULATE INTERACTIONS *
622              **************************/
623
624             if (gmx_mm_any_lt(rsq23,rcutoff2))
625             {
626
627             r23              = _mm_mul_ps(rsq23,rinv23);
628
629             /* EWALD ELECTROSTATICS */
630
631             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632             ewrt             = _mm_mul_ps(r23,ewtabscale);
633             ewitab           = _mm_cvttps_epi32(ewrt);
634             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
635             ewitab           = _mm_slli_epi32(ewitab,2);
636             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
637             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
638             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
639             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
640             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
641             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
642             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
643             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
644             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
645
646             d                = _mm_sub_ps(r23,rswitch);
647             d                = _mm_max_ps(d,_mm_setzero_ps());
648             d2               = _mm_mul_ps(d,d);
649             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
650
651             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
652
653             /* Evaluate switch function */
654             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
655             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
656             velec            = _mm_mul_ps(velec,sw);
657             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
658
659             /* Update potential sum for this i atom from the interaction with this j atom. */
660             velec            = _mm_and_ps(velec,cutoff_mask);
661             velecsum         = _mm_add_ps(velecsum,velec);
662
663             fscal            = felec;
664
665             fscal            = _mm_and_ps(fscal,cutoff_mask);
666
667             /* Calculate temporary vectorial force */
668             tx               = _mm_mul_ps(fscal,dx23);
669             ty               = _mm_mul_ps(fscal,dy23);
670             tz               = _mm_mul_ps(fscal,dz23);
671
672             /* Update vectorial force */
673             fix2             = _mm_add_ps(fix2,tx);
674             fiy2             = _mm_add_ps(fiy2,ty);
675             fiz2             = _mm_add_ps(fiz2,tz);
676
677             fjx3             = _mm_add_ps(fjx3,tx);
678             fjy3             = _mm_add_ps(fjy3,ty);
679             fjz3             = _mm_add_ps(fjz3,tz);
680
681             }
682
683             /**************************
684              * CALCULATE INTERACTIONS *
685              **************************/
686
687             if (gmx_mm_any_lt(rsq31,rcutoff2))
688             {
689
690             r31              = _mm_mul_ps(rsq31,rinv31);
691
692             /* EWALD ELECTROSTATICS */
693
694             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
695             ewrt             = _mm_mul_ps(r31,ewtabscale);
696             ewitab           = _mm_cvttps_epi32(ewrt);
697             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
698             ewitab           = _mm_slli_epi32(ewitab,2);
699             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
700             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
701             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
702             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
703             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
704             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
705             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
706             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
707             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
708
709             d                = _mm_sub_ps(r31,rswitch);
710             d                = _mm_max_ps(d,_mm_setzero_ps());
711             d2               = _mm_mul_ps(d,d);
712             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
713
714             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
715
716             /* Evaluate switch function */
717             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
718             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
719             velec            = _mm_mul_ps(velec,sw);
720             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
721
722             /* Update potential sum for this i atom from the interaction with this j atom. */
723             velec            = _mm_and_ps(velec,cutoff_mask);
724             velecsum         = _mm_add_ps(velecsum,velec);
725
726             fscal            = felec;
727
728             fscal            = _mm_and_ps(fscal,cutoff_mask);
729
730             /* Calculate temporary vectorial force */
731             tx               = _mm_mul_ps(fscal,dx31);
732             ty               = _mm_mul_ps(fscal,dy31);
733             tz               = _mm_mul_ps(fscal,dz31);
734
735             /* Update vectorial force */
736             fix3             = _mm_add_ps(fix3,tx);
737             fiy3             = _mm_add_ps(fiy3,ty);
738             fiz3             = _mm_add_ps(fiz3,tz);
739
740             fjx1             = _mm_add_ps(fjx1,tx);
741             fjy1             = _mm_add_ps(fjy1,ty);
742             fjz1             = _mm_add_ps(fjz1,tz);
743
744             }
745
746             /**************************
747              * CALCULATE INTERACTIONS *
748              **************************/
749
750             if (gmx_mm_any_lt(rsq32,rcutoff2))
751             {
752
753             r32              = _mm_mul_ps(rsq32,rinv32);
754
755             /* EWALD ELECTROSTATICS */
756
757             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
758             ewrt             = _mm_mul_ps(r32,ewtabscale);
759             ewitab           = _mm_cvttps_epi32(ewrt);
760             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
761             ewitab           = _mm_slli_epi32(ewitab,2);
762             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
763             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
764             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
765             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
766             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
767             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
768             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
769             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
770             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
771
772             d                = _mm_sub_ps(r32,rswitch);
773             d                = _mm_max_ps(d,_mm_setzero_ps());
774             d2               = _mm_mul_ps(d,d);
775             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
776
777             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
778
779             /* Evaluate switch function */
780             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
781             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
782             velec            = _mm_mul_ps(velec,sw);
783             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
784
785             /* Update potential sum for this i atom from the interaction with this j atom. */
786             velec            = _mm_and_ps(velec,cutoff_mask);
787             velecsum         = _mm_add_ps(velecsum,velec);
788
789             fscal            = felec;
790
791             fscal            = _mm_and_ps(fscal,cutoff_mask);
792
793             /* Calculate temporary vectorial force */
794             tx               = _mm_mul_ps(fscal,dx32);
795             ty               = _mm_mul_ps(fscal,dy32);
796             tz               = _mm_mul_ps(fscal,dz32);
797
798             /* Update vectorial force */
799             fix3             = _mm_add_ps(fix3,tx);
800             fiy3             = _mm_add_ps(fiy3,ty);
801             fiz3             = _mm_add_ps(fiz3,tz);
802
803             fjx2             = _mm_add_ps(fjx2,tx);
804             fjy2             = _mm_add_ps(fjy2,ty);
805             fjz2             = _mm_add_ps(fjz2,tz);
806
807             }
808
809             /**************************
810              * CALCULATE INTERACTIONS *
811              **************************/
812
813             if (gmx_mm_any_lt(rsq33,rcutoff2))
814             {
815
816             r33              = _mm_mul_ps(rsq33,rinv33);
817
818             /* EWALD ELECTROSTATICS */
819
820             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
821             ewrt             = _mm_mul_ps(r33,ewtabscale);
822             ewitab           = _mm_cvttps_epi32(ewrt);
823             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
824             ewitab           = _mm_slli_epi32(ewitab,2);
825             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
826             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
827             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
828             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
829             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
830             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
831             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
832             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
833             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
834
835             d                = _mm_sub_ps(r33,rswitch);
836             d                = _mm_max_ps(d,_mm_setzero_ps());
837             d2               = _mm_mul_ps(d,d);
838             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
839
840             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
841
842             /* Evaluate switch function */
843             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
844             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
845             velec            = _mm_mul_ps(velec,sw);
846             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
847
848             /* Update potential sum for this i atom from the interaction with this j atom. */
849             velec            = _mm_and_ps(velec,cutoff_mask);
850             velecsum         = _mm_add_ps(velecsum,velec);
851
852             fscal            = felec;
853
854             fscal            = _mm_and_ps(fscal,cutoff_mask);
855
856             /* Calculate temporary vectorial force */
857             tx               = _mm_mul_ps(fscal,dx33);
858             ty               = _mm_mul_ps(fscal,dy33);
859             tz               = _mm_mul_ps(fscal,dz33);
860
861             /* Update vectorial force */
862             fix3             = _mm_add_ps(fix3,tx);
863             fiy3             = _mm_add_ps(fiy3,ty);
864             fiz3             = _mm_add_ps(fiz3,tz);
865
866             fjx3             = _mm_add_ps(fjx3,tx);
867             fjy3             = _mm_add_ps(fjy3,ty);
868             fjz3             = _mm_add_ps(fjz3,tz);
869
870             }
871
872             fjptrA             = f+j_coord_offsetA;
873             fjptrB             = f+j_coord_offsetB;
874             fjptrC             = f+j_coord_offsetC;
875             fjptrD             = f+j_coord_offsetD;
876
877             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
878                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
879
880             /* Inner loop uses 585 flops */
881         }
882
883         if(jidx<j_index_end)
884         {
885
886             /* Get j neighbor index, and coordinate index */
887             jnrlistA         = jjnr[jidx];
888             jnrlistB         = jjnr[jidx+1];
889             jnrlistC         = jjnr[jidx+2];
890             jnrlistD         = jjnr[jidx+3];
891             /* Sign of each element will be negative for non-real atoms.
892              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
893              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
894              */
895             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
896             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
897             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
898             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
899             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
900             j_coord_offsetA  = DIM*jnrA;
901             j_coord_offsetB  = DIM*jnrB;
902             j_coord_offsetC  = DIM*jnrC;
903             j_coord_offsetD  = DIM*jnrD;
904
905             /* load j atom coordinates */
906             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
907                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
908                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
909
910             /* Calculate displacement vector */
911             dx11             = _mm_sub_ps(ix1,jx1);
912             dy11             = _mm_sub_ps(iy1,jy1);
913             dz11             = _mm_sub_ps(iz1,jz1);
914             dx12             = _mm_sub_ps(ix1,jx2);
915             dy12             = _mm_sub_ps(iy1,jy2);
916             dz12             = _mm_sub_ps(iz1,jz2);
917             dx13             = _mm_sub_ps(ix1,jx3);
918             dy13             = _mm_sub_ps(iy1,jy3);
919             dz13             = _mm_sub_ps(iz1,jz3);
920             dx21             = _mm_sub_ps(ix2,jx1);
921             dy21             = _mm_sub_ps(iy2,jy1);
922             dz21             = _mm_sub_ps(iz2,jz1);
923             dx22             = _mm_sub_ps(ix2,jx2);
924             dy22             = _mm_sub_ps(iy2,jy2);
925             dz22             = _mm_sub_ps(iz2,jz2);
926             dx23             = _mm_sub_ps(ix2,jx3);
927             dy23             = _mm_sub_ps(iy2,jy3);
928             dz23             = _mm_sub_ps(iz2,jz3);
929             dx31             = _mm_sub_ps(ix3,jx1);
930             dy31             = _mm_sub_ps(iy3,jy1);
931             dz31             = _mm_sub_ps(iz3,jz1);
932             dx32             = _mm_sub_ps(ix3,jx2);
933             dy32             = _mm_sub_ps(iy3,jy2);
934             dz32             = _mm_sub_ps(iz3,jz2);
935             dx33             = _mm_sub_ps(ix3,jx3);
936             dy33             = _mm_sub_ps(iy3,jy3);
937             dz33             = _mm_sub_ps(iz3,jz3);
938
939             /* Calculate squared distance and things based on it */
940             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
941             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
942             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
943             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
944             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
945             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
946             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
947             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
948             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
949
950             rinv11           = gmx_mm_invsqrt_ps(rsq11);
951             rinv12           = gmx_mm_invsqrt_ps(rsq12);
952             rinv13           = gmx_mm_invsqrt_ps(rsq13);
953             rinv21           = gmx_mm_invsqrt_ps(rsq21);
954             rinv22           = gmx_mm_invsqrt_ps(rsq22);
955             rinv23           = gmx_mm_invsqrt_ps(rsq23);
956             rinv31           = gmx_mm_invsqrt_ps(rsq31);
957             rinv32           = gmx_mm_invsqrt_ps(rsq32);
958             rinv33           = gmx_mm_invsqrt_ps(rsq33);
959
960             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
961             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
962             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
963             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
964             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
965             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
966             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
967             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
968             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
969
970             fjx1             = _mm_setzero_ps();
971             fjy1             = _mm_setzero_ps();
972             fjz1             = _mm_setzero_ps();
973             fjx2             = _mm_setzero_ps();
974             fjy2             = _mm_setzero_ps();
975             fjz2             = _mm_setzero_ps();
976             fjx3             = _mm_setzero_ps();
977             fjy3             = _mm_setzero_ps();
978             fjz3             = _mm_setzero_ps();
979
980             /**************************
981              * CALCULATE INTERACTIONS *
982              **************************/
983
984             if (gmx_mm_any_lt(rsq11,rcutoff2))
985             {
986
987             r11              = _mm_mul_ps(rsq11,rinv11);
988             r11              = _mm_andnot_ps(dummy_mask,r11);
989
990             /* EWALD ELECTROSTATICS */
991
992             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
993             ewrt             = _mm_mul_ps(r11,ewtabscale);
994             ewitab           = _mm_cvttps_epi32(ewrt);
995             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
996             ewitab           = _mm_slli_epi32(ewitab,2);
997             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
998             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
999             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1000             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1001             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1002             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1003             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1004             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1005             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1006
1007             d                = _mm_sub_ps(r11,rswitch);
1008             d                = _mm_max_ps(d,_mm_setzero_ps());
1009             d2               = _mm_mul_ps(d,d);
1010             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1011
1012             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1013
1014             /* Evaluate switch function */
1015             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1016             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1017             velec            = _mm_mul_ps(velec,sw);
1018             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1019
1020             /* Update potential sum for this i atom from the interaction with this j atom. */
1021             velec            = _mm_and_ps(velec,cutoff_mask);
1022             velec            = _mm_andnot_ps(dummy_mask,velec);
1023             velecsum         = _mm_add_ps(velecsum,velec);
1024
1025             fscal            = felec;
1026
1027             fscal            = _mm_and_ps(fscal,cutoff_mask);
1028
1029             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1030
1031             /* Calculate temporary vectorial force */
1032             tx               = _mm_mul_ps(fscal,dx11);
1033             ty               = _mm_mul_ps(fscal,dy11);
1034             tz               = _mm_mul_ps(fscal,dz11);
1035
1036             /* Update vectorial force */
1037             fix1             = _mm_add_ps(fix1,tx);
1038             fiy1             = _mm_add_ps(fiy1,ty);
1039             fiz1             = _mm_add_ps(fiz1,tz);
1040
1041             fjx1             = _mm_add_ps(fjx1,tx);
1042             fjy1             = _mm_add_ps(fjy1,ty);
1043             fjz1             = _mm_add_ps(fjz1,tz);
1044
1045             }
1046
1047             /**************************
1048              * CALCULATE INTERACTIONS *
1049              **************************/
1050
1051             if (gmx_mm_any_lt(rsq12,rcutoff2))
1052             {
1053
1054             r12              = _mm_mul_ps(rsq12,rinv12);
1055             r12              = _mm_andnot_ps(dummy_mask,r12);
1056
1057             /* EWALD ELECTROSTATICS */
1058
1059             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060             ewrt             = _mm_mul_ps(r12,ewtabscale);
1061             ewitab           = _mm_cvttps_epi32(ewrt);
1062             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1063             ewitab           = _mm_slli_epi32(ewitab,2);
1064             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1065             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1066             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1067             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1068             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1069             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1070             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1071             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1072             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1073
1074             d                = _mm_sub_ps(r12,rswitch);
1075             d                = _mm_max_ps(d,_mm_setzero_ps());
1076             d2               = _mm_mul_ps(d,d);
1077             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1078
1079             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1080
1081             /* Evaluate switch function */
1082             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1083             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1084             velec            = _mm_mul_ps(velec,sw);
1085             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1086
1087             /* Update potential sum for this i atom from the interaction with this j atom. */
1088             velec            = _mm_and_ps(velec,cutoff_mask);
1089             velec            = _mm_andnot_ps(dummy_mask,velec);
1090             velecsum         = _mm_add_ps(velecsum,velec);
1091
1092             fscal            = felec;
1093
1094             fscal            = _mm_and_ps(fscal,cutoff_mask);
1095
1096             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1097
1098             /* Calculate temporary vectorial force */
1099             tx               = _mm_mul_ps(fscal,dx12);
1100             ty               = _mm_mul_ps(fscal,dy12);
1101             tz               = _mm_mul_ps(fscal,dz12);
1102
1103             /* Update vectorial force */
1104             fix1             = _mm_add_ps(fix1,tx);
1105             fiy1             = _mm_add_ps(fiy1,ty);
1106             fiz1             = _mm_add_ps(fiz1,tz);
1107
1108             fjx2             = _mm_add_ps(fjx2,tx);
1109             fjy2             = _mm_add_ps(fjy2,ty);
1110             fjz2             = _mm_add_ps(fjz2,tz);
1111
1112             }
1113
1114             /**************************
1115              * CALCULATE INTERACTIONS *
1116              **************************/
1117
1118             if (gmx_mm_any_lt(rsq13,rcutoff2))
1119             {
1120
1121             r13              = _mm_mul_ps(rsq13,rinv13);
1122             r13              = _mm_andnot_ps(dummy_mask,r13);
1123
1124             /* EWALD ELECTROSTATICS */
1125
1126             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1127             ewrt             = _mm_mul_ps(r13,ewtabscale);
1128             ewitab           = _mm_cvttps_epi32(ewrt);
1129             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1130             ewitab           = _mm_slli_epi32(ewitab,2);
1131             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1132             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1133             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1134             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1135             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1136             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1137             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1138             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1139             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1140
1141             d                = _mm_sub_ps(r13,rswitch);
1142             d                = _mm_max_ps(d,_mm_setzero_ps());
1143             d2               = _mm_mul_ps(d,d);
1144             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1145
1146             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1147
1148             /* Evaluate switch function */
1149             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1150             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1151             velec            = _mm_mul_ps(velec,sw);
1152             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1153
1154             /* Update potential sum for this i atom from the interaction with this j atom. */
1155             velec            = _mm_and_ps(velec,cutoff_mask);
1156             velec            = _mm_andnot_ps(dummy_mask,velec);
1157             velecsum         = _mm_add_ps(velecsum,velec);
1158
1159             fscal            = felec;
1160
1161             fscal            = _mm_and_ps(fscal,cutoff_mask);
1162
1163             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1164
1165             /* Calculate temporary vectorial force */
1166             tx               = _mm_mul_ps(fscal,dx13);
1167             ty               = _mm_mul_ps(fscal,dy13);
1168             tz               = _mm_mul_ps(fscal,dz13);
1169
1170             /* Update vectorial force */
1171             fix1             = _mm_add_ps(fix1,tx);
1172             fiy1             = _mm_add_ps(fiy1,ty);
1173             fiz1             = _mm_add_ps(fiz1,tz);
1174
1175             fjx3             = _mm_add_ps(fjx3,tx);
1176             fjy3             = _mm_add_ps(fjy3,ty);
1177             fjz3             = _mm_add_ps(fjz3,tz);
1178
1179             }
1180
1181             /**************************
1182              * CALCULATE INTERACTIONS *
1183              **************************/
1184
1185             if (gmx_mm_any_lt(rsq21,rcutoff2))
1186             {
1187
1188             r21              = _mm_mul_ps(rsq21,rinv21);
1189             r21              = _mm_andnot_ps(dummy_mask,r21);
1190
1191             /* EWALD ELECTROSTATICS */
1192
1193             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1194             ewrt             = _mm_mul_ps(r21,ewtabscale);
1195             ewitab           = _mm_cvttps_epi32(ewrt);
1196             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1197             ewitab           = _mm_slli_epi32(ewitab,2);
1198             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1199             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1200             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1201             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1202             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1203             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1204             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1205             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1206             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1207
1208             d                = _mm_sub_ps(r21,rswitch);
1209             d                = _mm_max_ps(d,_mm_setzero_ps());
1210             d2               = _mm_mul_ps(d,d);
1211             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1212
1213             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1214
1215             /* Evaluate switch function */
1216             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1217             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1218             velec            = _mm_mul_ps(velec,sw);
1219             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1220
1221             /* Update potential sum for this i atom from the interaction with this j atom. */
1222             velec            = _mm_and_ps(velec,cutoff_mask);
1223             velec            = _mm_andnot_ps(dummy_mask,velec);
1224             velecsum         = _mm_add_ps(velecsum,velec);
1225
1226             fscal            = felec;
1227
1228             fscal            = _mm_and_ps(fscal,cutoff_mask);
1229
1230             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1231
1232             /* Calculate temporary vectorial force */
1233             tx               = _mm_mul_ps(fscal,dx21);
1234             ty               = _mm_mul_ps(fscal,dy21);
1235             tz               = _mm_mul_ps(fscal,dz21);
1236
1237             /* Update vectorial force */
1238             fix2             = _mm_add_ps(fix2,tx);
1239             fiy2             = _mm_add_ps(fiy2,ty);
1240             fiz2             = _mm_add_ps(fiz2,tz);
1241
1242             fjx1             = _mm_add_ps(fjx1,tx);
1243             fjy1             = _mm_add_ps(fjy1,ty);
1244             fjz1             = _mm_add_ps(fjz1,tz);
1245
1246             }
1247
1248             /**************************
1249              * CALCULATE INTERACTIONS *
1250              **************************/
1251
1252             if (gmx_mm_any_lt(rsq22,rcutoff2))
1253             {
1254
1255             r22              = _mm_mul_ps(rsq22,rinv22);
1256             r22              = _mm_andnot_ps(dummy_mask,r22);
1257
1258             /* EWALD ELECTROSTATICS */
1259
1260             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1261             ewrt             = _mm_mul_ps(r22,ewtabscale);
1262             ewitab           = _mm_cvttps_epi32(ewrt);
1263             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1264             ewitab           = _mm_slli_epi32(ewitab,2);
1265             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1266             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1267             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1268             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1269             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1270             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1271             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1272             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1273             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1274
1275             d                = _mm_sub_ps(r22,rswitch);
1276             d                = _mm_max_ps(d,_mm_setzero_ps());
1277             d2               = _mm_mul_ps(d,d);
1278             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1279
1280             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1281
1282             /* Evaluate switch function */
1283             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1284             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1285             velec            = _mm_mul_ps(velec,sw);
1286             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1287
1288             /* Update potential sum for this i atom from the interaction with this j atom. */
1289             velec            = _mm_and_ps(velec,cutoff_mask);
1290             velec            = _mm_andnot_ps(dummy_mask,velec);
1291             velecsum         = _mm_add_ps(velecsum,velec);
1292
1293             fscal            = felec;
1294
1295             fscal            = _mm_and_ps(fscal,cutoff_mask);
1296
1297             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1298
1299             /* Calculate temporary vectorial force */
1300             tx               = _mm_mul_ps(fscal,dx22);
1301             ty               = _mm_mul_ps(fscal,dy22);
1302             tz               = _mm_mul_ps(fscal,dz22);
1303
1304             /* Update vectorial force */
1305             fix2             = _mm_add_ps(fix2,tx);
1306             fiy2             = _mm_add_ps(fiy2,ty);
1307             fiz2             = _mm_add_ps(fiz2,tz);
1308
1309             fjx2             = _mm_add_ps(fjx2,tx);
1310             fjy2             = _mm_add_ps(fjy2,ty);
1311             fjz2             = _mm_add_ps(fjz2,tz);
1312
1313             }
1314
1315             /**************************
1316              * CALCULATE INTERACTIONS *
1317              **************************/
1318
1319             if (gmx_mm_any_lt(rsq23,rcutoff2))
1320             {
1321
1322             r23              = _mm_mul_ps(rsq23,rinv23);
1323             r23              = _mm_andnot_ps(dummy_mask,r23);
1324
1325             /* EWALD ELECTROSTATICS */
1326
1327             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1328             ewrt             = _mm_mul_ps(r23,ewtabscale);
1329             ewitab           = _mm_cvttps_epi32(ewrt);
1330             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1331             ewitab           = _mm_slli_epi32(ewitab,2);
1332             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1333             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1334             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1335             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1336             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1337             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1338             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1339             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1340             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1341
1342             d                = _mm_sub_ps(r23,rswitch);
1343             d                = _mm_max_ps(d,_mm_setzero_ps());
1344             d2               = _mm_mul_ps(d,d);
1345             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1346
1347             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1348
1349             /* Evaluate switch function */
1350             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1351             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1352             velec            = _mm_mul_ps(velec,sw);
1353             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1354
1355             /* Update potential sum for this i atom from the interaction with this j atom. */
1356             velec            = _mm_and_ps(velec,cutoff_mask);
1357             velec            = _mm_andnot_ps(dummy_mask,velec);
1358             velecsum         = _mm_add_ps(velecsum,velec);
1359
1360             fscal            = felec;
1361
1362             fscal            = _mm_and_ps(fscal,cutoff_mask);
1363
1364             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1365
1366             /* Calculate temporary vectorial force */
1367             tx               = _mm_mul_ps(fscal,dx23);
1368             ty               = _mm_mul_ps(fscal,dy23);
1369             tz               = _mm_mul_ps(fscal,dz23);
1370
1371             /* Update vectorial force */
1372             fix2             = _mm_add_ps(fix2,tx);
1373             fiy2             = _mm_add_ps(fiy2,ty);
1374             fiz2             = _mm_add_ps(fiz2,tz);
1375
1376             fjx3             = _mm_add_ps(fjx3,tx);
1377             fjy3             = _mm_add_ps(fjy3,ty);
1378             fjz3             = _mm_add_ps(fjz3,tz);
1379
1380             }
1381
1382             /**************************
1383              * CALCULATE INTERACTIONS *
1384              **************************/
1385
1386             if (gmx_mm_any_lt(rsq31,rcutoff2))
1387             {
1388
1389             r31              = _mm_mul_ps(rsq31,rinv31);
1390             r31              = _mm_andnot_ps(dummy_mask,r31);
1391
1392             /* EWALD ELECTROSTATICS */
1393
1394             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1395             ewrt             = _mm_mul_ps(r31,ewtabscale);
1396             ewitab           = _mm_cvttps_epi32(ewrt);
1397             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1398             ewitab           = _mm_slli_epi32(ewitab,2);
1399             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1400             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1401             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1402             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1403             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1404             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1405             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1406             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1407             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1408
1409             d                = _mm_sub_ps(r31,rswitch);
1410             d                = _mm_max_ps(d,_mm_setzero_ps());
1411             d2               = _mm_mul_ps(d,d);
1412             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1413
1414             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1415
1416             /* Evaluate switch function */
1417             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1418             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1419             velec            = _mm_mul_ps(velec,sw);
1420             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1421
1422             /* Update potential sum for this i atom from the interaction with this j atom. */
1423             velec            = _mm_and_ps(velec,cutoff_mask);
1424             velec            = _mm_andnot_ps(dummy_mask,velec);
1425             velecsum         = _mm_add_ps(velecsum,velec);
1426
1427             fscal            = felec;
1428
1429             fscal            = _mm_and_ps(fscal,cutoff_mask);
1430
1431             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1432
1433             /* Calculate temporary vectorial force */
1434             tx               = _mm_mul_ps(fscal,dx31);
1435             ty               = _mm_mul_ps(fscal,dy31);
1436             tz               = _mm_mul_ps(fscal,dz31);
1437
1438             /* Update vectorial force */
1439             fix3             = _mm_add_ps(fix3,tx);
1440             fiy3             = _mm_add_ps(fiy3,ty);
1441             fiz3             = _mm_add_ps(fiz3,tz);
1442
1443             fjx1             = _mm_add_ps(fjx1,tx);
1444             fjy1             = _mm_add_ps(fjy1,ty);
1445             fjz1             = _mm_add_ps(fjz1,tz);
1446
1447             }
1448
1449             /**************************
1450              * CALCULATE INTERACTIONS *
1451              **************************/
1452
1453             if (gmx_mm_any_lt(rsq32,rcutoff2))
1454             {
1455
1456             r32              = _mm_mul_ps(rsq32,rinv32);
1457             r32              = _mm_andnot_ps(dummy_mask,r32);
1458
1459             /* EWALD ELECTROSTATICS */
1460
1461             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1462             ewrt             = _mm_mul_ps(r32,ewtabscale);
1463             ewitab           = _mm_cvttps_epi32(ewrt);
1464             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1465             ewitab           = _mm_slli_epi32(ewitab,2);
1466             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1467             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1468             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1469             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1470             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1471             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1472             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1473             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1474             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1475
1476             d                = _mm_sub_ps(r32,rswitch);
1477             d                = _mm_max_ps(d,_mm_setzero_ps());
1478             d2               = _mm_mul_ps(d,d);
1479             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1480
1481             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1482
1483             /* Evaluate switch function */
1484             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1485             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1486             velec            = _mm_mul_ps(velec,sw);
1487             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1488
1489             /* Update potential sum for this i atom from the interaction with this j atom. */
1490             velec            = _mm_and_ps(velec,cutoff_mask);
1491             velec            = _mm_andnot_ps(dummy_mask,velec);
1492             velecsum         = _mm_add_ps(velecsum,velec);
1493
1494             fscal            = felec;
1495
1496             fscal            = _mm_and_ps(fscal,cutoff_mask);
1497
1498             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1499
1500             /* Calculate temporary vectorial force */
1501             tx               = _mm_mul_ps(fscal,dx32);
1502             ty               = _mm_mul_ps(fscal,dy32);
1503             tz               = _mm_mul_ps(fscal,dz32);
1504
1505             /* Update vectorial force */
1506             fix3             = _mm_add_ps(fix3,tx);
1507             fiy3             = _mm_add_ps(fiy3,ty);
1508             fiz3             = _mm_add_ps(fiz3,tz);
1509
1510             fjx2             = _mm_add_ps(fjx2,tx);
1511             fjy2             = _mm_add_ps(fjy2,ty);
1512             fjz2             = _mm_add_ps(fjz2,tz);
1513
1514             }
1515
1516             /**************************
1517              * CALCULATE INTERACTIONS *
1518              **************************/
1519
1520             if (gmx_mm_any_lt(rsq33,rcutoff2))
1521             {
1522
1523             r33              = _mm_mul_ps(rsq33,rinv33);
1524             r33              = _mm_andnot_ps(dummy_mask,r33);
1525
1526             /* EWALD ELECTROSTATICS */
1527
1528             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1529             ewrt             = _mm_mul_ps(r33,ewtabscale);
1530             ewitab           = _mm_cvttps_epi32(ewrt);
1531             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1532             ewitab           = _mm_slli_epi32(ewitab,2);
1533             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1534             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1535             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1536             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1537             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1538             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1539             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1540             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1541             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1542
1543             d                = _mm_sub_ps(r33,rswitch);
1544             d                = _mm_max_ps(d,_mm_setzero_ps());
1545             d2               = _mm_mul_ps(d,d);
1546             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1547
1548             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1549
1550             /* Evaluate switch function */
1551             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1552             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1553             velec            = _mm_mul_ps(velec,sw);
1554             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1555
1556             /* Update potential sum for this i atom from the interaction with this j atom. */
1557             velec            = _mm_and_ps(velec,cutoff_mask);
1558             velec            = _mm_andnot_ps(dummy_mask,velec);
1559             velecsum         = _mm_add_ps(velecsum,velec);
1560
1561             fscal            = felec;
1562
1563             fscal            = _mm_and_ps(fscal,cutoff_mask);
1564
1565             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1566
1567             /* Calculate temporary vectorial force */
1568             tx               = _mm_mul_ps(fscal,dx33);
1569             ty               = _mm_mul_ps(fscal,dy33);
1570             tz               = _mm_mul_ps(fscal,dz33);
1571
1572             /* Update vectorial force */
1573             fix3             = _mm_add_ps(fix3,tx);
1574             fiy3             = _mm_add_ps(fiy3,ty);
1575             fiz3             = _mm_add_ps(fiz3,tz);
1576
1577             fjx3             = _mm_add_ps(fjx3,tx);
1578             fjy3             = _mm_add_ps(fjy3,ty);
1579             fjz3             = _mm_add_ps(fjz3,tz);
1580
1581             }
1582
1583             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1584             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1585             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1586             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1587
1588             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1589                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1590
1591             /* Inner loop uses 594 flops */
1592         }
1593
1594         /* End of innermost loop */
1595
1596         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1597                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1598
1599         ggid                        = gid[iidx];
1600         /* Update potential energies */
1601         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1602
1603         /* Increment number of inner iterations */
1604         inneriter                  += j_index_end - j_index_start;
1605
1606         /* Outer loop uses 19 flops */
1607     }
1608
1609     /* Increment number of outer iterations */
1610     outeriter        += nri;
1611
1612     /* Update outer/inner flops */
1613
1614     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
1615 }
1616 /*
1617  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1618  * Electrostatics interaction: Ewald
1619  * VdW interaction:            None
1620  * Geometry:                   Water4-Water4
1621  * Calculate force/pot:        Force
1622  */
1623 void
1624 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1625                     (t_nblist                    * gmx_restrict       nlist,
1626                      rvec                        * gmx_restrict          xx,
1627                      rvec                        * gmx_restrict          ff,
1628                      t_forcerec                  * gmx_restrict          fr,
1629                      t_mdatoms                   * gmx_restrict     mdatoms,
1630                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1631                      t_nrnb                      * gmx_restrict        nrnb)
1632 {
1633     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1634      * just 0 for non-waters.
1635      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1636      * jnr indices corresponding to data put in the four positions in the SIMD register.
1637      */
1638     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1639     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1640     int              jnrA,jnrB,jnrC,jnrD;
1641     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1642     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1643     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1644     real             rcutoff_scalar;
1645     real             *shiftvec,*fshift,*x,*f;
1646     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1647     real             scratch[4*DIM];
1648     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1649     int              vdwioffset1;
1650     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1651     int              vdwioffset2;
1652     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1653     int              vdwioffset3;
1654     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1655     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1656     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1657     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1658     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1659     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1660     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1661     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1662     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1663     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1664     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1665     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1666     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1667     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1668     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1669     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1670     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1671     real             *charge;
1672     __m128i          ewitab;
1673     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1674     real             *ewtab;
1675     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1676     real             rswitch_scalar,d_scalar;
1677     __m128           dummy_mask,cutoff_mask;
1678     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1679     __m128           one     = _mm_set1_ps(1.0);
1680     __m128           two     = _mm_set1_ps(2.0);
1681     x                = xx[0];
1682     f                = ff[0];
1683
1684     nri              = nlist->nri;
1685     iinr             = nlist->iinr;
1686     jindex           = nlist->jindex;
1687     jjnr             = nlist->jjnr;
1688     shiftidx         = nlist->shift;
1689     gid              = nlist->gid;
1690     shiftvec         = fr->shift_vec[0];
1691     fshift           = fr->fshift[0];
1692     facel            = _mm_set1_ps(fr->epsfac);
1693     charge           = mdatoms->chargeA;
1694
1695     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1696     ewtab            = fr->ic->tabq_coul_FDV0;
1697     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1698     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1699
1700     /* Setup water-specific parameters */
1701     inr              = nlist->iinr[0];
1702     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1703     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1704     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1705
1706     jq1              = _mm_set1_ps(charge[inr+1]);
1707     jq2              = _mm_set1_ps(charge[inr+2]);
1708     jq3              = _mm_set1_ps(charge[inr+3]);
1709     qq11             = _mm_mul_ps(iq1,jq1);
1710     qq12             = _mm_mul_ps(iq1,jq2);
1711     qq13             = _mm_mul_ps(iq1,jq3);
1712     qq21             = _mm_mul_ps(iq2,jq1);
1713     qq22             = _mm_mul_ps(iq2,jq2);
1714     qq23             = _mm_mul_ps(iq2,jq3);
1715     qq31             = _mm_mul_ps(iq3,jq1);
1716     qq32             = _mm_mul_ps(iq3,jq2);
1717     qq33             = _mm_mul_ps(iq3,jq3);
1718
1719     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1720     rcutoff_scalar   = fr->rcoulomb;
1721     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1722     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1723
1724     rswitch_scalar   = fr->rcoulomb_switch;
1725     rswitch          = _mm_set1_ps(rswitch_scalar);
1726     /* Setup switch parameters */
1727     d_scalar         = rcutoff_scalar-rswitch_scalar;
1728     d                = _mm_set1_ps(d_scalar);
1729     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1730     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1731     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1732     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1733     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1734     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1735
1736     /* Avoid stupid compiler warnings */
1737     jnrA = jnrB = jnrC = jnrD = 0;
1738     j_coord_offsetA = 0;
1739     j_coord_offsetB = 0;
1740     j_coord_offsetC = 0;
1741     j_coord_offsetD = 0;
1742
1743     outeriter        = 0;
1744     inneriter        = 0;
1745
1746     for(iidx=0;iidx<4*DIM;iidx++)
1747     {
1748         scratch[iidx] = 0.0;
1749     }
1750
1751     /* Start outer loop over neighborlists */
1752     for(iidx=0; iidx<nri; iidx++)
1753     {
1754         /* Load shift vector for this list */
1755         i_shift_offset   = DIM*shiftidx[iidx];
1756
1757         /* Load limits for loop over neighbors */
1758         j_index_start    = jindex[iidx];
1759         j_index_end      = jindex[iidx+1];
1760
1761         /* Get outer coordinate index */
1762         inr              = iinr[iidx];
1763         i_coord_offset   = DIM*inr;
1764
1765         /* Load i particle coords and add shift vector */
1766         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1767                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1768
1769         fix1             = _mm_setzero_ps();
1770         fiy1             = _mm_setzero_ps();
1771         fiz1             = _mm_setzero_ps();
1772         fix2             = _mm_setzero_ps();
1773         fiy2             = _mm_setzero_ps();
1774         fiz2             = _mm_setzero_ps();
1775         fix3             = _mm_setzero_ps();
1776         fiy3             = _mm_setzero_ps();
1777         fiz3             = _mm_setzero_ps();
1778
1779         /* Start inner kernel loop */
1780         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1781         {
1782
1783             /* Get j neighbor index, and coordinate index */
1784             jnrA             = jjnr[jidx];
1785             jnrB             = jjnr[jidx+1];
1786             jnrC             = jjnr[jidx+2];
1787             jnrD             = jjnr[jidx+3];
1788             j_coord_offsetA  = DIM*jnrA;
1789             j_coord_offsetB  = DIM*jnrB;
1790             j_coord_offsetC  = DIM*jnrC;
1791             j_coord_offsetD  = DIM*jnrD;
1792
1793             /* load j atom coordinates */
1794             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1795                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1796                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1797
1798             /* Calculate displacement vector */
1799             dx11             = _mm_sub_ps(ix1,jx1);
1800             dy11             = _mm_sub_ps(iy1,jy1);
1801             dz11             = _mm_sub_ps(iz1,jz1);
1802             dx12             = _mm_sub_ps(ix1,jx2);
1803             dy12             = _mm_sub_ps(iy1,jy2);
1804             dz12             = _mm_sub_ps(iz1,jz2);
1805             dx13             = _mm_sub_ps(ix1,jx3);
1806             dy13             = _mm_sub_ps(iy1,jy3);
1807             dz13             = _mm_sub_ps(iz1,jz3);
1808             dx21             = _mm_sub_ps(ix2,jx1);
1809             dy21             = _mm_sub_ps(iy2,jy1);
1810             dz21             = _mm_sub_ps(iz2,jz1);
1811             dx22             = _mm_sub_ps(ix2,jx2);
1812             dy22             = _mm_sub_ps(iy2,jy2);
1813             dz22             = _mm_sub_ps(iz2,jz2);
1814             dx23             = _mm_sub_ps(ix2,jx3);
1815             dy23             = _mm_sub_ps(iy2,jy3);
1816             dz23             = _mm_sub_ps(iz2,jz3);
1817             dx31             = _mm_sub_ps(ix3,jx1);
1818             dy31             = _mm_sub_ps(iy3,jy1);
1819             dz31             = _mm_sub_ps(iz3,jz1);
1820             dx32             = _mm_sub_ps(ix3,jx2);
1821             dy32             = _mm_sub_ps(iy3,jy2);
1822             dz32             = _mm_sub_ps(iz3,jz2);
1823             dx33             = _mm_sub_ps(ix3,jx3);
1824             dy33             = _mm_sub_ps(iy3,jy3);
1825             dz33             = _mm_sub_ps(iz3,jz3);
1826
1827             /* Calculate squared distance and things based on it */
1828             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1829             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1830             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1831             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1832             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1833             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1834             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1835             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1836             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1837
1838             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1839             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1840             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1841             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1842             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1843             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1844             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1845             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1846             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1847
1848             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1849             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1850             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1851             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1852             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1853             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1854             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1855             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1856             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1857
1858             fjx1             = _mm_setzero_ps();
1859             fjy1             = _mm_setzero_ps();
1860             fjz1             = _mm_setzero_ps();
1861             fjx2             = _mm_setzero_ps();
1862             fjy2             = _mm_setzero_ps();
1863             fjz2             = _mm_setzero_ps();
1864             fjx3             = _mm_setzero_ps();
1865             fjy3             = _mm_setzero_ps();
1866             fjz3             = _mm_setzero_ps();
1867
1868             /**************************
1869              * CALCULATE INTERACTIONS *
1870              **************************/
1871
1872             if (gmx_mm_any_lt(rsq11,rcutoff2))
1873             {
1874
1875             r11              = _mm_mul_ps(rsq11,rinv11);
1876
1877             /* EWALD ELECTROSTATICS */
1878
1879             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1880             ewrt             = _mm_mul_ps(r11,ewtabscale);
1881             ewitab           = _mm_cvttps_epi32(ewrt);
1882             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1883             ewitab           = _mm_slli_epi32(ewitab,2);
1884             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1885             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1886             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1887             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1888             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1889             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1890             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1891             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1892             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1893
1894             d                = _mm_sub_ps(r11,rswitch);
1895             d                = _mm_max_ps(d,_mm_setzero_ps());
1896             d2               = _mm_mul_ps(d,d);
1897             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1898
1899             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1900
1901             /* Evaluate switch function */
1902             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1903             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1904             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1905
1906             fscal            = felec;
1907
1908             fscal            = _mm_and_ps(fscal,cutoff_mask);
1909
1910             /* Calculate temporary vectorial force */
1911             tx               = _mm_mul_ps(fscal,dx11);
1912             ty               = _mm_mul_ps(fscal,dy11);
1913             tz               = _mm_mul_ps(fscal,dz11);
1914
1915             /* Update vectorial force */
1916             fix1             = _mm_add_ps(fix1,tx);
1917             fiy1             = _mm_add_ps(fiy1,ty);
1918             fiz1             = _mm_add_ps(fiz1,tz);
1919
1920             fjx1             = _mm_add_ps(fjx1,tx);
1921             fjy1             = _mm_add_ps(fjy1,ty);
1922             fjz1             = _mm_add_ps(fjz1,tz);
1923
1924             }
1925
1926             /**************************
1927              * CALCULATE INTERACTIONS *
1928              **************************/
1929
1930             if (gmx_mm_any_lt(rsq12,rcutoff2))
1931             {
1932
1933             r12              = _mm_mul_ps(rsq12,rinv12);
1934
1935             /* EWALD ELECTROSTATICS */
1936
1937             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1938             ewrt             = _mm_mul_ps(r12,ewtabscale);
1939             ewitab           = _mm_cvttps_epi32(ewrt);
1940             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1941             ewitab           = _mm_slli_epi32(ewitab,2);
1942             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1943             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1944             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1945             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1946             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1947             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1948             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1949             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1950             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1951
1952             d                = _mm_sub_ps(r12,rswitch);
1953             d                = _mm_max_ps(d,_mm_setzero_ps());
1954             d2               = _mm_mul_ps(d,d);
1955             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1956
1957             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1958
1959             /* Evaluate switch function */
1960             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1961             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1962             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1963
1964             fscal            = felec;
1965
1966             fscal            = _mm_and_ps(fscal,cutoff_mask);
1967
1968             /* Calculate temporary vectorial force */
1969             tx               = _mm_mul_ps(fscal,dx12);
1970             ty               = _mm_mul_ps(fscal,dy12);
1971             tz               = _mm_mul_ps(fscal,dz12);
1972
1973             /* Update vectorial force */
1974             fix1             = _mm_add_ps(fix1,tx);
1975             fiy1             = _mm_add_ps(fiy1,ty);
1976             fiz1             = _mm_add_ps(fiz1,tz);
1977
1978             fjx2             = _mm_add_ps(fjx2,tx);
1979             fjy2             = _mm_add_ps(fjy2,ty);
1980             fjz2             = _mm_add_ps(fjz2,tz);
1981
1982             }
1983
1984             /**************************
1985              * CALCULATE INTERACTIONS *
1986              **************************/
1987
1988             if (gmx_mm_any_lt(rsq13,rcutoff2))
1989             {
1990
1991             r13              = _mm_mul_ps(rsq13,rinv13);
1992
1993             /* EWALD ELECTROSTATICS */
1994
1995             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1996             ewrt             = _mm_mul_ps(r13,ewtabscale);
1997             ewitab           = _mm_cvttps_epi32(ewrt);
1998             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1999             ewitab           = _mm_slli_epi32(ewitab,2);
2000             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2001             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2002             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2003             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2004             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2005             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2006             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2007             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2008             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2009
2010             d                = _mm_sub_ps(r13,rswitch);
2011             d                = _mm_max_ps(d,_mm_setzero_ps());
2012             d2               = _mm_mul_ps(d,d);
2013             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2014
2015             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2016
2017             /* Evaluate switch function */
2018             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2019             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2020             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2021
2022             fscal            = felec;
2023
2024             fscal            = _mm_and_ps(fscal,cutoff_mask);
2025
2026             /* Calculate temporary vectorial force */
2027             tx               = _mm_mul_ps(fscal,dx13);
2028             ty               = _mm_mul_ps(fscal,dy13);
2029             tz               = _mm_mul_ps(fscal,dz13);
2030
2031             /* Update vectorial force */
2032             fix1             = _mm_add_ps(fix1,tx);
2033             fiy1             = _mm_add_ps(fiy1,ty);
2034             fiz1             = _mm_add_ps(fiz1,tz);
2035
2036             fjx3             = _mm_add_ps(fjx3,tx);
2037             fjy3             = _mm_add_ps(fjy3,ty);
2038             fjz3             = _mm_add_ps(fjz3,tz);
2039
2040             }
2041
2042             /**************************
2043              * CALCULATE INTERACTIONS *
2044              **************************/
2045
2046             if (gmx_mm_any_lt(rsq21,rcutoff2))
2047             {
2048
2049             r21              = _mm_mul_ps(rsq21,rinv21);
2050
2051             /* EWALD ELECTROSTATICS */
2052
2053             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2054             ewrt             = _mm_mul_ps(r21,ewtabscale);
2055             ewitab           = _mm_cvttps_epi32(ewrt);
2056             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2057             ewitab           = _mm_slli_epi32(ewitab,2);
2058             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2059             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2060             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2061             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2062             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2063             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2064             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2065             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2066             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2067
2068             d                = _mm_sub_ps(r21,rswitch);
2069             d                = _mm_max_ps(d,_mm_setzero_ps());
2070             d2               = _mm_mul_ps(d,d);
2071             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2072
2073             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2074
2075             /* Evaluate switch function */
2076             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2077             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2078             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2079
2080             fscal            = felec;
2081
2082             fscal            = _mm_and_ps(fscal,cutoff_mask);
2083
2084             /* Calculate temporary vectorial force */
2085             tx               = _mm_mul_ps(fscal,dx21);
2086             ty               = _mm_mul_ps(fscal,dy21);
2087             tz               = _mm_mul_ps(fscal,dz21);
2088
2089             /* Update vectorial force */
2090             fix2             = _mm_add_ps(fix2,tx);
2091             fiy2             = _mm_add_ps(fiy2,ty);
2092             fiz2             = _mm_add_ps(fiz2,tz);
2093
2094             fjx1             = _mm_add_ps(fjx1,tx);
2095             fjy1             = _mm_add_ps(fjy1,ty);
2096             fjz1             = _mm_add_ps(fjz1,tz);
2097
2098             }
2099
2100             /**************************
2101              * CALCULATE INTERACTIONS *
2102              **************************/
2103
2104             if (gmx_mm_any_lt(rsq22,rcutoff2))
2105             {
2106
2107             r22              = _mm_mul_ps(rsq22,rinv22);
2108
2109             /* EWALD ELECTROSTATICS */
2110
2111             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112             ewrt             = _mm_mul_ps(r22,ewtabscale);
2113             ewitab           = _mm_cvttps_epi32(ewrt);
2114             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2115             ewitab           = _mm_slli_epi32(ewitab,2);
2116             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2117             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2118             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2119             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2120             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2121             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2122             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2123             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2124             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2125
2126             d                = _mm_sub_ps(r22,rswitch);
2127             d                = _mm_max_ps(d,_mm_setzero_ps());
2128             d2               = _mm_mul_ps(d,d);
2129             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2130
2131             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2132
2133             /* Evaluate switch function */
2134             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2135             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2136             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2137
2138             fscal            = felec;
2139
2140             fscal            = _mm_and_ps(fscal,cutoff_mask);
2141
2142             /* Calculate temporary vectorial force */
2143             tx               = _mm_mul_ps(fscal,dx22);
2144             ty               = _mm_mul_ps(fscal,dy22);
2145             tz               = _mm_mul_ps(fscal,dz22);
2146
2147             /* Update vectorial force */
2148             fix2             = _mm_add_ps(fix2,tx);
2149             fiy2             = _mm_add_ps(fiy2,ty);
2150             fiz2             = _mm_add_ps(fiz2,tz);
2151
2152             fjx2             = _mm_add_ps(fjx2,tx);
2153             fjy2             = _mm_add_ps(fjy2,ty);
2154             fjz2             = _mm_add_ps(fjz2,tz);
2155
2156             }
2157
2158             /**************************
2159              * CALCULATE INTERACTIONS *
2160              **************************/
2161
2162             if (gmx_mm_any_lt(rsq23,rcutoff2))
2163             {
2164
2165             r23              = _mm_mul_ps(rsq23,rinv23);
2166
2167             /* EWALD ELECTROSTATICS */
2168
2169             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2170             ewrt             = _mm_mul_ps(r23,ewtabscale);
2171             ewitab           = _mm_cvttps_epi32(ewrt);
2172             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2173             ewitab           = _mm_slli_epi32(ewitab,2);
2174             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2175             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2176             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2177             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2178             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2179             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2180             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2181             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2182             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2183
2184             d                = _mm_sub_ps(r23,rswitch);
2185             d                = _mm_max_ps(d,_mm_setzero_ps());
2186             d2               = _mm_mul_ps(d,d);
2187             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2188
2189             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2190
2191             /* Evaluate switch function */
2192             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2193             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2194             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2195
2196             fscal            = felec;
2197
2198             fscal            = _mm_and_ps(fscal,cutoff_mask);
2199
2200             /* Calculate temporary vectorial force */
2201             tx               = _mm_mul_ps(fscal,dx23);
2202             ty               = _mm_mul_ps(fscal,dy23);
2203             tz               = _mm_mul_ps(fscal,dz23);
2204
2205             /* Update vectorial force */
2206             fix2             = _mm_add_ps(fix2,tx);
2207             fiy2             = _mm_add_ps(fiy2,ty);
2208             fiz2             = _mm_add_ps(fiz2,tz);
2209
2210             fjx3             = _mm_add_ps(fjx3,tx);
2211             fjy3             = _mm_add_ps(fjy3,ty);
2212             fjz3             = _mm_add_ps(fjz3,tz);
2213
2214             }
2215
2216             /**************************
2217              * CALCULATE INTERACTIONS *
2218              **************************/
2219
2220             if (gmx_mm_any_lt(rsq31,rcutoff2))
2221             {
2222
2223             r31              = _mm_mul_ps(rsq31,rinv31);
2224
2225             /* EWALD ELECTROSTATICS */
2226
2227             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228             ewrt             = _mm_mul_ps(r31,ewtabscale);
2229             ewitab           = _mm_cvttps_epi32(ewrt);
2230             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2231             ewitab           = _mm_slli_epi32(ewitab,2);
2232             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2233             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2234             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2235             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2236             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2237             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2238             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2239             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2240             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2241
2242             d                = _mm_sub_ps(r31,rswitch);
2243             d                = _mm_max_ps(d,_mm_setzero_ps());
2244             d2               = _mm_mul_ps(d,d);
2245             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2246
2247             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2248
2249             /* Evaluate switch function */
2250             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2251             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2252             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2253
2254             fscal            = felec;
2255
2256             fscal            = _mm_and_ps(fscal,cutoff_mask);
2257
2258             /* Calculate temporary vectorial force */
2259             tx               = _mm_mul_ps(fscal,dx31);
2260             ty               = _mm_mul_ps(fscal,dy31);
2261             tz               = _mm_mul_ps(fscal,dz31);
2262
2263             /* Update vectorial force */
2264             fix3             = _mm_add_ps(fix3,tx);
2265             fiy3             = _mm_add_ps(fiy3,ty);
2266             fiz3             = _mm_add_ps(fiz3,tz);
2267
2268             fjx1             = _mm_add_ps(fjx1,tx);
2269             fjy1             = _mm_add_ps(fjy1,ty);
2270             fjz1             = _mm_add_ps(fjz1,tz);
2271
2272             }
2273
2274             /**************************
2275              * CALCULATE INTERACTIONS *
2276              **************************/
2277
2278             if (gmx_mm_any_lt(rsq32,rcutoff2))
2279             {
2280
2281             r32              = _mm_mul_ps(rsq32,rinv32);
2282
2283             /* EWALD ELECTROSTATICS */
2284
2285             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2286             ewrt             = _mm_mul_ps(r32,ewtabscale);
2287             ewitab           = _mm_cvttps_epi32(ewrt);
2288             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2289             ewitab           = _mm_slli_epi32(ewitab,2);
2290             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2291             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2292             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2293             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2294             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2295             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2296             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2297             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2298             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2299
2300             d                = _mm_sub_ps(r32,rswitch);
2301             d                = _mm_max_ps(d,_mm_setzero_ps());
2302             d2               = _mm_mul_ps(d,d);
2303             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2304
2305             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2306
2307             /* Evaluate switch function */
2308             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2309             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2310             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2311
2312             fscal            = felec;
2313
2314             fscal            = _mm_and_ps(fscal,cutoff_mask);
2315
2316             /* Calculate temporary vectorial force */
2317             tx               = _mm_mul_ps(fscal,dx32);
2318             ty               = _mm_mul_ps(fscal,dy32);
2319             tz               = _mm_mul_ps(fscal,dz32);
2320
2321             /* Update vectorial force */
2322             fix3             = _mm_add_ps(fix3,tx);
2323             fiy3             = _mm_add_ps(fiy3,ty);
2324             fiz3             = _mm_add_ps(fiz3,tz);
2325
2326             fjx2             = _mm_add_ps(fjx2,tx);
2327             fjy2             = _mm_add_ps(fjy2,ty);
2328             fjz2             = _mm_add_ps(fjz2,tz);
2329
2330             }
2331
2332             /**************************
2333              * CALCULATE INTERACTIONS *
2334              **************************/
2335
2336             if (gmx_mm_any_lt(rsq33,rcutoff2))
2337             {
2338
2339             r33              = _mm_mul_ps(rsq33,rinv33);
2340
2341             /* EWALD ELECTROSTATICS */
2342
2343             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2344             ewrt             = _mm_mul_ps(r33,ewtabscale);
2345             ewitab           = _mm_cvttps_epi32(ewrt);
2346             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2347             ewitab           = _mm_slli_epi32(ewitab,2);
2348             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2349             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2350             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2351             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2352             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2353             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2354             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2355             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2356             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2357
2358             d                = _mm_sub_ps(r33,rswitch);
2359             d                = _mm_max_ps(d,_mm_setzero_ps());
2360             d2               = _mm_mul_ps(d,d);
2361             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2362
2363             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2364
2365             /* Evaluate switch function */
2366             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2367             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2368             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2369
2370             fscal            = felec;
2371
2372             fscal            = _mm_and_ps(fscal,cutoff_mask);
2373
2374             /* Calculate temporary vectorial force */
2375             tx               = _mm_mul_ps(fscal,dx33);
2376             ty               = _mm_mul_ps(fscal,dy33);
2377             tz               = _mm_mul_ps(fscal,dz33);
2378
2379             /* Update vectorial force */
2380             fix3             = _mm_add_ps(fix3,tx);
2381             fiy3             = _mm_add_ps(fiy3,ty);
2382             fiz3             = _mm_add_ps(fiz3,tz);
2383
2384             fjx3             = _mm_add_ps(fjx3,tx);
2385             fjy3             = _mm_add_ps(fjy3,ty);
2386             fjz3             = _mm_add_ps(fjz3,tz);
2387
2388             }
2389
2390             fjptrA             = f+j_coord_offsetA;
2391             fjptrB             = f+j_coord_offsetB;
2392             fjptrC             = f+j_coord_offsetC;
2393             fjptrD             = f+j_coord_offsetD;
2394
2395             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2396                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2397
2398             /* Inner loop uses 558 flops */
2399         }
2400
2401         if(jidx<j_index_end)
2402         {
2403
2404             /* Get j neighbor index, and coordinate index */
2405             jnrlistA         = jjnr[jidx];
2406             jnrlistB         = jjnr[jidx+1];
2407             jnrlistC         = jjnr[jidx+2];
2408             jnrlistD         = jjnr[jidx+3];
2409             /* Sign of each element will be negative for non-real atoms.
2410              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2411              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2412              */
2413             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2414             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2415             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2416             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2417             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2418             j_coord_offsetA  = DIM*jnrA;
2419             j_coord_offsetB  = DIM*jnrB;
2420             j_coord_offsetC  = DIM*jnrC;
2421             j_coord_offsetD  = DIM*jnrD;
2422
2423             /* load j atom coordinates */
2424             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2425                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2426                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2427
2428             /* Calculate displacement vector */
2429             dx11             = _mm_sub_ps(ix1,jx1);
2430             dy11             = _mm_sub_ps(iy1,jy1);
2431             dz11             = _mm_sub_ps(iz1,jz1);
2432             dx12             = _mm_sub_ps(ix1,jx2);
2433             dy12             = _mm_sub_ps(iy1,jy2);
2434             dz12             = _mm_sub_ps(iz1,jz2);
2435             dx13             = _mm_sub_ps(ix1,jx3);
2436             dy13             = _mm_sub_ps(iy1,jy3);
2437             dz13             = _mm_sub_ps(iz1,jz3);
2438             dx21             = _mm_sub_ps(ix2,jx1);
2439             dy21             = _mm_sub_ps(iy2,jy1);
2440             dz21             = _mm_sub_ps(iz2,jz1);
2441             dx22             = _mm_sub_ps(ix2,jx2);
2442             dy22             = _mm_sub_ps(iy2,jy2);
2443             dz22             = _mm_sub_ps(iz2,jz2);
2444             dx23             = _mm_sub_ps(ix2,jx3);
2445             dy23             = _mm_sub_ps(iy2,jy3);
2446             dz23             = _mm_sub_ps(iz2,jz3);
2447             dx31             = _mm_sub_ps(ix3,jx1);
2448             dy31             = _mm_sub_ps(iy3,jy1);
2449             dz31             = _mm_sub_ps(iz3,jz1);
2450             dx32             = _mm_sub_ps(ix3,jx2);
2451             dy32             = _mm_sub_ps(iy3,jy2);
2452             dz32             = _mm_sub_ps(iz3,jz2);
2453             dx33             = _mm_sub_ps(ix3,jx3);
2454             dy33             = _mm_sub_ps(iy3,jy3);
2455             dz33             = _mm_sub_ps(iz3,jz3);
2456
2457             /* Calculate squared distance and things based on it */
2458             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2459             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2460             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2461             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2462             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2463             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2464             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2465             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2466             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2467
2468             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2469             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2470             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2471             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2472             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2473             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2474             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2475             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2476             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2477
2478             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2479             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2480             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2481             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2482             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2483             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2484             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2485             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2486             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2487
2488             fjx1             = _mm_setzero_ps();
2489             fjy1             = _mm_setzero_ps();
2490             fjz1             = _mm_setzero_ps();
2491             fjx2             = _mm_setzero_ps();
2492             fjy2             = _mm_setzero_ps();
2493             fjz2             = _mm_setzero_ps();
2494             fjx3             = _mm_setzero_ps();
2495             fjy3             = _mm_setzero_ps();
2496             fjz3             = _mm_setzero_ps();
2497
2498             /**************************
2499              * CALCULATE INTERACTIONS *
2500              **************************/
2501
2502             if (gmx_mm_any_lt(rsq11,rcutoff2))
2503             {
2504
2505             r11              = _mm_mul_ps(rsq11,rinv11);
2506             r11              = _mm_andnot_ps(dummy_mask,r11);
2507
2508             /* EWALD ELECTROSTATICS */
2509
2510             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2511             ewrt             = _mm_mul_ps(r11,ewtabscale);
2512             ewitab           = _mm_cvttps_epi32(ewrt);
2513             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2514             ewitab           = _mm_slli_epi32(ewitab,2);
2515             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2516             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2517             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2518             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2519             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2520             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2521             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2522             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2523             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2524
2525             d                = _mm_sub_ps(r11,rswitch);
2526             d                = _mm_max_ps(d,_mm_setzero_ps());
2527             d2               = _mm_mul_ps(d,d);
2528             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2529
2530             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2531
2532             /* Evaluate switch function */
2533             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2534             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2535             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2536
2537             fscal            = felec;
2538
2539             fscal            = _mm_and_ps(fscal,cutoff_mask);
2540
2541             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2542
2543             /* Calculate temporary vectorial force */
2544             tx               = _mm_mul_ps(fscal,dx11);
2545             ty               = _mm_mul_ps(fscal,dy11);
2546             tz               = _mm_mul_ps(fscal,dz11);
2547
2548             /* Update vectorial force */
2549             fix1             = _mm_add_ps(fix1,tx);
2550             fiy1             = _mm_add_ps(fiy1,ty);
2551             fiz1             = _mm_add_ps(fiz1,tz);
2552
2553             fjx1             = _mm_add_ps(fjx1,tx);
2554             fjy1             = _mm_add_ps(fjy1,ty);
2555             fjz1             = _mm_add_ps(fjz1,tz);
2556
2557             }
2558
2559             /**************************
2560              * CALCULATE INTERACTIONS *
2561              **************************/
2562
2563             if (gmx_mm_any_lt(rsq12,rcutoff2))
2564             {
2565
2566             r12              = _mm_mul_ps(rsq12,rinv12);
2567             r12              = _mm_andnot_ps(dummy_mask,r12);
2568
2569             /* EWALD ELECTROSTATICS */
2570
2571             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2572             ewrt             = _mm_mul_ps(r12,ewtabscale);
2573             ewitab           = _mm_cvttps_epi32(ewrt);
2574             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2575             ewitab           = _mm_slli_epi32(ewitab,2);
2576             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2577             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2578             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2579             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2580             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2581             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2582             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2583             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2584             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2585
2586             d                = _mm_sub_ps(r12,rswitch);
2587             d                = _mm_max_ps(d,_mm_setzero_ps());
2588             d2               = _mm_mul_ps(d,d);
2589             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2590
2591             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2592
2593             /* Evaluate switch function */
2594             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2595             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2596             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2597
2598             fscal            = felec;
2599
2600             fscal            = _mm_and_ps(fscal,cutoff_mask);
2601
2602             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2603
2604             /* Calculate temporary vectorial force */
2605             tx               = _mm_mul_ps(fscal,dx12);
2606             ty               = _mm_mul_ps(fscal,dy12);
2607             tz               = _mm_mul_ps(fscal,dz12);
2608
2609             /* Update vectorial force */
2610             fix1             = _mm_add_ps(fix1,tx);
2611             fiy1             = _mm_add_ps(fiy1,ty);
2612             fiz1             = _mm_add_ps(fiz1,tz);
2613
2614             fjx2             = _mm_add_ps(fjx2,tx);
2615             fjy2             = _mm_add_ps(fjy2,ty);
2616             fjz2             = _mm_add_ps(fjz2,tz);
2617
2618             }
2619
2620             /**************************
2621              * CALCULATE INTERACTIONS *
2622              **************************/
2623
2624             if (gmx_mm_any_lt(rsq13,rcutoff2))
2625             {
2626
2627             r13              = _mm_mul_ps(rsq13,rinv13);
2628             r13              = _mm_andnot_ps(dummy_mask,r13);
2629
2630             /* EWALD ELECTROSTATICS */
2631
2632             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2633             ewrt             = _mm_mul_ps(r13,ewtabscale);
2634             ewitab           = _mm_cvttps_epi32(ewrt);
2635             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2636             ewitab           = _mm_slli_epi32(ewitab,2);
2637             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2638             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2639             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2640             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2641             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2642             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2643             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2644             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2645             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2646
2647             d                = _mm_sub_ps(r13,rswitch);
2648             d                = _mm_max_ps(d,_mm_setzero_ps());
2649             d2               = _mm_mul_ps(d,d);
2650             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2651
2652             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2653
2654             /* Evaluate switch function */
2655             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2656             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2657             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2658
2659             fscal            = felec;
2660
2661             fscal            = _mm_and_ps(fscal,cutoff_mask);
2662
2663             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2664
2665             /* Calculate temporary vectorial force */
2666             tx               = _mm_mul_ps(fscal,dx13);
2667             ty               = _mm_mul_ps(fscal,dy13);
2668             tz               = _mm_mul_ps(fscal,dz13);
2669
2670             /* Update vectorial force */
2671             fix1             = _mm_add_ps(fix1,tx);
2672             fiy1             = _mm_add_ps(fiy1,ty);
2673             fiz1             = _mm_add_ps(fiz1,tz);
2674
2675             fjx3             = _mm_add_ps(fjx3,tx);
2676             fjy3             = _mm_add_ps(fjy3,ty);
2677             fjz3             = _mm_add_ps(fjz3,tz);
2678
2679             }
2680
2681             /**************************
2682              * CALCULATE INTERACTIONS *
2683              **************************/
2684
2685             if (gmx_mm_any_lt(rsq21,rcutoff2))
2686             {
2687
2688             r21              = _mm_mul_ps(rsq21,rinv21);
2689             r21              = _mm_andnot_ps(dummy_mask,r21);
2690
2691             /* EWALD ELECTROSTATICS */
2692
2693             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2694             ewrt             = _mm_mul_ps(r21,ewtabscale);
2695             ewitab           = _mm_cvttps_epi32(ewrt);
2696             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2697             ewitab           = _mm_slli_epi32(ewitab,2);
2698             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2699             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2700             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2701             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2702             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2703             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2704             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2705             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2706             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2707
2708             d                = _mm_sub_ps(r21,rswitch);
2709             d                = _mm_max_ps(d,_mm_setzero_ps());
2710             d2               = _mm_mul_ps(d,d);
2711             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2712
2713             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2714
2715             /* Evaluate switch function */
2716             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2717             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2718             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2719
2720             fscal            = felec;
2721
2722             fscal            = _mm_and_ps(fscal,cutoff_mask);
2723
2724             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2725
2726             /* Calculate temporary vectorial force */
2727             tx               = _mm_mul_ps(fscal,dx21);
2728             ty               = _mm_mul_ps(fscal,dy21);
2729             tz               = _mm_mul_ps(fscal,dz21);
2730
2731             /* Update vectorial force */
2732             fix2             = _mm_add_ps(fix2,tx);
2733             fiy2             = _mm_add_ps(fiy2,ty);
2734             fiz2             = _mm_add_ps(fiz2,tz);
2735
2736             fjx1             = _mm_add_ps(fjx1,tx);
2737             fjy1             = _mm_add_ps(fjy1,ty);
2738             fjz1             = _mm_add_ps(fjz1,tz);
2739
2740             }
2741
2742             /**************************
2743              * CALCULATE INTERACTIONS *
2744              **************************/
2745
2746             if (gmx_mm_any_lt(rsq22,rcutoff2))
2747             {
2748
2749             r22              = _mm_mul_ps(rsq22,rinv22);
2750             r22              = _mm_andnot_ps(dummy_mask,r22);
2751
2752             /* EWALD ELECTROSTATICS */
2753
2754             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2755             ewrt             = _mm_mul_ps(r22,ewtabscale);
2756             ewitab           = _mm_cvttps_epi32(ewrt);
2757             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2758             ewitab           = _mm_slli_epi32(ewitab,2);
2759             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2760             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2761             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2762             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2763             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2764             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2765             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2766             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2767             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2768
2769             d                = _mm_sub_ps(r22,rswitch);
2770             d                = _mm_max_ps(d,_mm_setzero_ps());
2771             d2               = _mm_mul_ps(d,d);
2772             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2773
2774             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2775
2776             /* Evaluate switch function */
2777             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2778             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2779             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2780
2781             fscal            = felec;
2782
2783             fscal            = _mm_and_ps(fscal,cutoff_mask);
2784
2785             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2786
2787             /* Calculate temporary vectorial force */
2788             tx               = _mm_mul_ps(fscal,dx22);
2789             ty               = _mm_mul_ps(fscal,dy22);
2790             tz               = _mm_mul_ps(fscal,dz22);
2791
2792             /* Update vectorial force */
2793             fix2             = _mm_add_ps(fix2,tx);
2794             fiy2             = _mm_add_ps(fiy2,ty);
2795             fiz2             = _mm_add_ps(fiz2,tz);
2796
2797             fjx2             = _mm_add_ps(fjx2,tx);
2798             fjy2             = _mm_add_ps(fjy2,ty);
2799             fjz2             = _mm_add_ps(fjz2,tz);
2800
2801             }
2802
2803             /**************************
2804              * CALCULATE INTERACTIONS *
2805              **************************/
2806
2807             if (gmx_mm_any_lt(rsq23,rcutoff2))
2808             {
2809
2810             r23              = _mm_mul_ps(rsq23,rinv23);
2811             r23              = _mm_andnot_ps(dummy_mask,r23);
2812
2813             /* EWALD ELECTROSTATICS */
2814
2815             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2816             ewrt             = _mm_mul_ps(r23,ewtabscale);
2817             ewitab           = _mm_cvttps_epi32(ewrt);
2818             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2819             ewitab           = _mm_slli_epi32(ewitab,2);
2820             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2821             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2822             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2823             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2824             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2825             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2826             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2827             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2828             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2829
2830             d                = _mm_sub_ps(r23,rswitch);
2831             d                = _mm_max_ps(d,_mm_setzero_ps());
2832             d2               = _mm_mul_ps(d,d);
2833             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2834
2835             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2836
2837             /* Evaluate switch function */
2838             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2839             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2840             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2841
2842             fscal            = felec;
2843
2844             fscal            = _mm_and_ps(fscal,cutoff_mask);
2845
2846             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2847
2848             /* Calculate temporary vectorial force */
2849             tx               = _mm_mul_ps(fscal,dx23);
2850             ty               = _mm_mul_ps(fscal,dy23);
2851             tz               = _mm_mul_ps(fscal,dz23);
2852
2853             /* Update vectorial force */
2854             fix2             = _mm_add_ps(fix2,tx);
2855             fiy2             = _mm_add_ps(fiy2,ty);
2856             fiz2             = _mm_add_ps(fiz2,tz);
2857
2858             fjx3             = _mm_add_ps(fjx3,tx);
2859             fjy3             = _mm_add_ps(fjy3,ty);
2860             fjz3             = _mm_add_ps(fjz3,tz);
2861
2862             }
2863
2864             /**************************
2865              * CALCULATE INTERACTIONS *
2866              **************************/
2867
2868             if (gmx_mm_any_lt(rsq31,rcutoff2))
2869             {
2870
2871             r31              = _mm_mul_ps(rsq31,rinv31);
2872             r31              = _mm_andnot_ps(dummy_mask,r31);
2873
2874             /* EWALD ELECTROSTATICS */
2875
2876             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2877             ewrt             = _mm_mul_ps(r31,ewtabscale);
2878             ewitab           = _mm_cvttps_epi32(ewrt);
2879             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2880             ewitab           = _mm_slli_epi32(ewitab,2);
2881             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2882             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2883             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2884             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2885             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2886             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2887             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2888             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2889             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2890
2891             d                = _mm_sub_ps(r31,rswitch);
2892             d                = _mm_max_ps(d,_mm_setzero_ps());
2893             d2               = _mm_mul_ps(d,d);
2894             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2895
2896             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2897
2898             /* Evaluate switch function */
2899             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2900             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2901             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2902
2903             fscal            = felec;
2904
2905             fscal            = _mm_and_ps(fscal,cutoff_mask);
2906
2907             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2908
2909             /* Calculate temporary vectorial force */
2910             tx               = _mm_mul_ps(fscal,dx31);
2911             ty               = _mm_mul_ps(fscal,dy31);
2912             tz               = _mm_mul_ps(fscal,dz31);
2913
2914             /* Update vectorial force */
2915             fix3             = _mm_add_ps(fix3,tx);
2916             fiy3             = _mm_add_ps(fiy3,ty);
2917             fiz3             = _mm_add_ps(fiz3,tz);
2918
2919             fjx1             = _mm_add_ps(fjx1,tx);
2920             fjy1             = _mm_add_ps(fjy1,ty);
2921             fjz1             = _mm_add_ps(fjz1,tz);
2922
2923             }
2924
2925             /**************************
2926              * CALCULATE INTERACTIONS *
2927              **************************/
2928
2929             if (gmx_mm_any_lt(rsq32,rcutoff2))
2930             {
2931
2932             r32              = _mm_mul_ps(rsq32,rinv32);
2933             r32              = _mm_andnot_ps(dummy_mask,r32);
2934
2935             /* EWALD ELECTROSTATICS */
2936
2937             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2938             ewrt             = _mm_mul_ps(r32,ewtabscale);
2939             ewitab           = _mm_cvttps_epi32(ewrt);
2940             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2941             ewitab           = _mm_slli_epi32(ewitab,2);
2942             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2943             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2944             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2945             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2946             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2947             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2948             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2949             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2950             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2951
2952             d                = _mm_sub_ps(r32,rswitch);
2953             d                = _mm_max_ps(d,_mm_setzero_ps());
2954             d2               = _mm_mul_ps(d,d);
2955             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2956
2957             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2958
2959             /* Evaluate switch function */
2960             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2961             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2962             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2963
2964             fscal            = felec;
2965
2966             fscal            = _mm_and_ps(fscal,cutoff_mask);
2967
2968             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2969
2970             /* Calculate temporary vectorial force */
2971             tx               = _mm_mul_ps(fscal,dx32);
2972             ty               = _mm_mul_ps(fscal,dy32);
2973             tz               = _mm_mul_ps(fscal,dz32);
2974
2975             /* Update vectorial force */
2976             fix3             = _mm_add_ps(fix3,tx);
2977             fiy3             = _mm_add_ps(fiy3,ty);
2978             fiz3             = _mm_add_ps(fiz3,tz);
2979
2980             fjx2             = _mm_add_ps(fjx2,tx);
2981             fjy2             = _mm_add_ps(fjy2,ty);
2982             fjz2             = _mm_add_ps(fjz2,tz);
2983
2984             }
2985
2986             /**************************
2987              * CALCULATE INTERACTIONS *
2988              **************************/
2989
2990             if (gmx_mm_any_lt(rsq33,rcutoff2))
2991             {
2992
2993             r33              = _mm_mul_ps(rsq33,rinv33);
2994             r33              = _mm_andnot_ps(dummy_mask,r33);
2995
2996             /* EWALD ELECTROSTATICS */
2997
2998             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2999             ewrt             = _mm_mul_ps(r33,ewtabscale);
3000             ewitab           = _mm_cvttps_epi32(ewrt);
3001             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3002             ewitab           = _mm_slli_epi32(ewitab,2);
3003             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3004             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3005             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3006             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3007             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3008             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3009             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3010             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3011             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3012
3013             d                = _mm_sub_ps(r33,rswitch);
3014             d                = _mm_max_ps(d,_mm_setzero_ps());
3015             d2               = _mm_mul_ps(d,d);
3016             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3017
3018             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3019
3020             /* Evaluate switch function */
3021             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3022             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3023             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3024
3025             fscal            = felec;
3026
3027             fscal            = _mm_and_ps(fscal,cutoff_mask);
3028
3029             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3030
3031             /* Calculate temporary vectorial force */
3032             tx               = _mm_mul_ps(fscal,dx33);
3033             ty               = _mm_mul_ps(fscal,dy33);
3034             tz               = _mm_mul_ps(fscal,dz33);
3035
3036             /* Update vectorial force */
3037             fix3             = _mm_add_ps(fix3,tx);
3038             fiy3             = _mm_add_ps(fiy3,ty);
3039             fiz3             = _mm_add_ps(fiz3,tz);
3040
3041             fjx3             = _mm_add_ps(fjx3,tx);
3042             fjy3             = _mm_add_ps(fjy3,ty);
3043             fjz3             = _mm_add_ps(fjz3,tz);
3044
3045             }
3046
3047             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3048             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3049             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3050             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3051
3052             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3053                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3054
3055             /* Inner loop uses 567 flops */
3056         }
3057
3058         /* End of innermost loop */
3059
3060         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3061                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
3062
3063         /* Increment number of inner iterations */
3064         inneriter                  += j_index_end - j_index_start;
3065
3066         /* Outer loop uses 18 flops */
3067     }
3068
3069     /* Increment number of outer iterations */
3070     outeriter        += nri;
3071
3072     /* Update outer/inner flops */
3073
3074     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);
3075 }