Rename remaining GMX_ACCELERATION to GMX_CPU_ACCELERATION
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            None
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset1;
67     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
68     int              vdwioffset2;
69     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
70     int              vdwioffset3;
71     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
73     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
74     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
75     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
76     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
77     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
78     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
79     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
80     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
81     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
82     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
83     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
84     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
85     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
86     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
87     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     __m128i          ewitab;
90     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91     real             *ewtab;
92     __m128           dummy_mask,cutoff_mask;
93     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
94     __m128           one     = _mm_set1_ps(1.0);
95     __m128           two     = _mm_set1_ps(2.0);
96     x                = xx[0];
97     f                = ff[0];
98
99     nri              = nlist->nri;
100     iinr             = nlist->iinr;
101     jindex           = nlist->jindex;
102     jjnr             = nlist->jjnr;
103     shiftidx         = nlist->shift;
104     gid              = nlist->gid;
105     shiftvec         = fr->shift_vec[0];
106     fshift           = fr->fshift[0];
107     facel            = _mm_set1_ps(fr->epsfac);
108     charge           = mdatoms->chargeA;
109
110     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
111     ewtab            = fr->ic->tabq_coul_FDV0;
112     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
113     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
114
115     /* Setup water-specific parameters */
116     inr              = nlist->iinr[0];
117     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
118     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
119     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
120
121     jq1              = _mm_set1_ps(charge[inr+1]);
122     jq2              = _mm_set1_ps(charge[inr+2]);
123     jq3              = _mm_set1_ps(charge[inr+3]);
124     qq11             = _mm_mul_ps(iq1,jq1);
125     qq12             = _mm_mul_ps(iq1,jq2);
126     qq13             = _mm_mul_ps(iq1,jq3);
127     qq21             = _mm_mul_ps(iq2,jq1);
128     qq22             = _mm_mul_ps(iq2,jq2);
129     qq23             = _mm_mul_ps(iq2,jq3);
130     qq31             = _mm_mul_ps(iq3,jq1);
131     qq32             = _mm_mul_ps(iq3,jq2);
132     qq33             = _mm_mul_ps(iq3,jq3);
133
134     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
135     rcutoff_scalar   = fr->rcoulomb;
136     rcutoff          = _mm_set1_ps(rcutoff_scalar);
137     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
138
139     /* Avoid stupid compiler warnings */
140     jnrA = jnrB = jnrC = jnrD = 0;
141     j_coord_offsetA = 0;
142     j_coord_offsetB = 0;
143     j_coord_offsetC = 0;
144     j_coord_offsetD = 0;
145
146     outeriter        = 0;
147     inneriter        = 0;
148
149     /* Start outer loop over neighborlists */
150     for(iidx=0; iidx<nri; iidx++)
151     {
152         /* Load shift vector for this list */
153         i_shift_offset   = DIM*shiftidx[iidx];
154         shX              = shiftvec[i_shift_offset+XX];
155         shY              = shiftvec[i_shift_offset+YY];
156         shZ              = shiftvec[i_shift_offset+ZZ];
157
158         /* Load limits for loop over neighbors */
159         j_index_start    = jindex[iidx];
160         j_index_end      = jindex[iidx+1];
161
162         /* Get outer coordinate index */
163         inr              = iinr[iidx];
164         i_coord_offset   = DIM*inr;
165
166         /* Load i particle coords and add shift vector */
167         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
168         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
169         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
170         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
171         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
172         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
173         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
174         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
175         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
176
177         fix1             = _mm_setzero_ps();
178         fiy1             = _mm_setzero_ps();
179         fiz1             = _mm_setzero_ps();
180         fix2             = _mm_setzero_ps();
181         fiy2             = _mm_setzero_ps();
182         fiz2             = _mm_setzero_ps();
183         fix3             = _mm_setzero_ps();
184         fiy3             = _mm_setzero_ps();
185         fiz3             = _mm_setzero_ps();
186
187         /* Reset potential sums */
188         velecsum         = _mm_setzero_ps();
189
190         /* Start inner kernel loop */
191         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
192         {
193
194             /* Get j neighbor index, and coordinate index */
195             jnrA             = jjnr[jidx];
196             jnrB             = jjnr[jidx+1];
197             jnrC             = jjnr[jidx+2];
198             jnrD             = jjnr[jidx+3];
199
200             j_coord_offsetA  = DIM*jnrA;
201             j_coord_offsetB  = DIM*jnrB;
202             j_coord_offsetC  = DIM*jnrC;
203             j_coord_offsetD  = DIM*jnrD;
204
205             /* load j atom coordinates */
206             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
207                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
208                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
209
210             /* Calculate displacement vector */
211             dx11             = _mm_sub_ps(ix1,jx1);
212             dy11             = _mm_sub_ps(iy1,jy1);
213             dz11             = _mm_sub_ps(iz1,jz1);
214             dx12             = _mm_sub_ps(ix1,jx2);
215             dy12             = _mm_sub_ps(iy1,jy2);
216             dz12             = _mm_sub_ps(iz1,jz2);
217             dx13             = _mm_sub_ps(ix1,jx3);
218             dy13             = _mm_sub_ps(iy1,jy3);
219             dz13             = _mm_sub_ps(iz1,jz3);
220             dx21             = _mm_sub_ps(ix2,jx1);
221             dy21             = _mm_sub_ps(iy2,jy1);
222             dz21             = _mm_sub_ps(iz2,jz1);
223             dx22             = _mm_sub_ps(ix2,jx2);
224             dy22             = _mm_sub_ps(iy2,jy2);
225             dz22             = _mm_sub_ps(iz2,jz2);
226             dx23             = _mm_sub_ps(ix2,jx3);
227             dy23             = _mm_sub_ps(iy2,jy3);
228             dz23             = _mm_sub_ps(iz2,jz3);
229             dx31             = _mm_sub_ps(ix3,jx1);
230             dy31             = _mm_sub_ps(iy3,jy1);
231             dz31             = _mm_sub_ps(iz3,jz1);
232             dx32             = _mm_sub_ps(ix3,jx2);
233             dy32             = _mm_sub_ps(iy3,jy2);
234             dz32             = _mm_sub_ps(iz3,jz2);
235             dx33             = _mm_sub_ps(ix3,jx3);
236             dy33             = _mm_sub_ps(iy3,jy3);
237             dz33             = _mm_sub_ps(iz3,jz3);
238
239             /* Calculate squared distance and things based on it */
240             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
241             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
242             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
243             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
244             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
245             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
246             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
247             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
248             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
249
250             rinv11           = gmx_mm_invsqrt_ps(rsq11);
251             rinv12           = gmx_mm_invsqrt_ps(rsq12);
252             rinv13           = gmx_mm_invsqrt_ps(rsq13);
253             rinv21           = gmx_mm_invsqrt_ps(rsq21);
254             rinv22           = gmx_mm_invsqrt_ps(rsq22);
255             rinv23           = gmx_mm_invsqrt_ps(rsq23);
256             rinv31           = gmx_mm_invsqrt_ps(rsq31);
257             rinv32           = gmx_mm_invsqrt_ps(rsq32);
258             rinv33           = gmx_mm_invsqrt_ps(rsq33);
259
260             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
261             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
262             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
263             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
264             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
265             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
266             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
267             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
268             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
269
270             fjx1             = _mm_setzero_ps();
271             fjy1             = _mm_setzero_ps();
272             fjz1             = _mm_setzero_ps();
273             fjx2             = _mm_setzero_ps();
274             fjy2             = _mm_setzero_ps();
275             fjz2             = _mm_setzero_ps();
276             fjx3             = _mm_setzero_ps();
277             fjy3             = _mm_setzero_ps();
278             fjz3             = _mm_setzero_ps();
279
280             /**************************
281              * CALCULATE INTERACTIONS *
282              **************************/
283
284             if (gmx_mm_any_lt(rsq11,rcutoff2))
285             {
286
287             r11              = _mm_mul_ps(rsq11,rinv11);
288
289             /* EWALD ELECTROSTATICS */
290
291             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
292             ewrt             = _mm_mul_ps(r11,ewtabscale);
293             ewitab           = _mm_cvttps_epi32(ewrt);
294             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
295             ewitab           = _mm_slli_epi32(ewitab,2);
296             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
297             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
298             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
299             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
300             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
301             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
302             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
303             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
304             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
305
306             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
307
308             /* Update potential sum for this i atom from the interaction with this j atom. */
309             velec            = _mm_and_ps(velec,cutoff_mask);
310             velecsum         = _mm_add_ps(velecsum,velec);
311
312             fscal            = felec;
313
314             fscal            = _mm_and_ps(fscal,cutoff_mask);
315
316             /* Calculate temporary vectorial force */
317             tx               = _mm_mul_ps(fscal,dx11);
318             ty               = _mm_mul_ps(fscal,dy11);
319             tz               = _mm_mul_ps(fscal,dz11);
320
321             /* Update vectorial force */
322             fix1             = _mm_add_ps(fix1,tx);
323             fiy1             = _mm_add_ps(fiy1,ty);
324             fiz1             = _mm_add_ps(fiz1,tz);
325
326             fjx1             = _mm_add_ps(fjx1,tx);
327             fjy1             = _mm_add_ps(fjy1,ty);
328             fjz1             = _mm_add_ps(fjz1,tz);
329
330             }
331
332             /**************************
333              * CALCULATE INTERACTIONS *
334              **************************/
335
336             if (gmx_mm_any_lt(rsq12,rcutoff2))
337             {
338
339             r12              = _mm_mul_ps(rsq12,rinv12);
340
341             /* EWALD ELECTROSTATICS */
342
343             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
344             ewrt             = _mm_mul_ps(r12,ewtabscale);
345             ewitab           = _mm_cvttps_epi32(ewrt);
346             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
347             ewitab           = _mm_slli_epi32(ewitab,2);
348             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
349             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
350             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
351             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
352             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
353             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
354             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
355             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
356             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
357
358             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
359
360             /* Update potential sum for this i atom from the interaction with this j atom. */
361             velec            = _mm_and_ps(velec,cutoff_mask);
362             velecsum         = _mm_add_ps(velecsum,velec);
363
364             fscal            = felec;
365
366             fscal            = _mm_and_ps(fscal,cutoff_mask);
367
368             /* Calculate temporary vectorial force */
369             tx               = _mm_mul_ps(fscal,dx12);
370             ty               = _mm_mul_ps(fscal,dy12);
371             tz               = _mm_mul_ps(fscal,dz12);
372
373             /* Update vectorial force */
374             fix1             = _mm_add_ps(fix1,tx);
375             fiy1             = _mm_add_ps(fiy1,ty);
376             fiz1             = _mm_add_ps(fiz1,tz);
377
378             fjx2             = _mm_add_ps(fjx2,tx);
379             fjy2             = _mm_add_ps(fjy2,ty);
380             fjz2             = _mm_add_ps(fjz2,tz);
381
382             }
383
384             /**************************
385              * CALCULATE INTERACTIONS *
386              **************************/
387
388             if (gmx_mm_any_lt(rsq13,rcutoff2))
389             {
390
391             r13              = _mm_mul_ps(rsq13,rinv13);
392
393             /* EWALD ELECTROSTATICS */
394
395             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
396             ewrt             = _mm_mul_ps(r13,ewtabscale);
397             ewitab           = _mm_cvttps_epi32(ewrt);
398             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
399             ewitab           = _mm_slli_epi32(ewitab,2);
400             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
401             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
402             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
403             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
404             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
405             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
406             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
407             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
408             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
409
410             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
411
412             /* Update potential sum for this i atom from the interaction with this j atom. */
413             velec            = _mm_and_ps(velec,cutoff_mask);
414             velecsum         = _mm_add_ps(velecsum,velec);
415
416             fscal            = felec;
417
418             fscal            = _mm_and_ps(fscal,cutoff_mask);
419
420             /* Calculate temporary vectorial force */
421             tx               = _mm_mul_ps(fscal,dx13);
422             ty               = _mm_mul_ps(fscal,dy13);
423             tz               = _mm_mul_ps(fscal,dz13);
424
425             /* Update vectorial force */
426             fix1             = _mm_add_ps(fix1,tx);
427             fiy1             = _mm_add_ps(fiy1,ty);
428             fiz1             = _mm_add_ps(fiz1,tz);
429
430             fjx3             = _mm_add_ps(fjx3,tx);
431             fjy3             = _mm_add_ps(fjy3,ty);
432             fjz3             = _mm_add_ps(fjz3,tz);
433
434             }
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             if (gmx_mm_any_lt(rsq21,rcutoff2))
441             {
442
443             r21              = _mm_mul_ps(rsq21,rinv21);
444
445             /* EWALD ELECTROSTATICS */
446
447             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
448             ewrt             = _mm_mul_ps(r21,ewtabscale);
449             ewitab           = _mm_cvttps_epi32(ewrt);
450             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
451             ewitab           = _mm_slli_epi32(ewitab,2);
452             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
453             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
454             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
455             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
456             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
457             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
458             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
459             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
460             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
461
462             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
463
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velec            = _mm_and_ps(velec,cutoff_mask);
466             velecsum         = _mm_add_ps(velecsum,velec);
467
468             fscal            = felec;
469
470             fscal            = _mm_and_ps(fscal,cutoff_mask);
471
472             /* Calculate temporary vectorial force */
473             tx               = _mm_mul_ps(fscal,dx21);
474             ty               = _mm_mul_ps(fscal,dy21);
475             tz               = _mm_mul_ps(fscal,dz21);
476
477             /* Update vectorial force */
478             fix2             = _mm_add_ps(fix2,tx);
479             fiy2             = _mm_add_ps(fiy2,ty);
480             fiz2             = _mm_add_ps(fiz2,tz);
481
482             fjx1             = _mm_add_ps(fjx1,tx);
483             fjy1             = _mm_add_ps(fjy1,ty);
484             fjz1             = _mm_add_ps(fjz1,tz);
485
486             }
487
488             /**************************
489              * CALCULATE INTERACTIONS *
490              **************************/
491
492             if (gmx_mm_any_lt(rsq22,rcutoff2))
493             {
494
495             r22              = _mm_mul_ps(rsq22,rinv22);
496
497             /* EWALD ELECTROSTATICS */
498
499             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
500             ewrt             = _mm_mul_ps(r22,ewtabscale);
501             ewitab           = _mm_cvttps_epi32(ewrt);
502             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
503             ewitab           = _mm_slli_epi32(ewitab,2);
504             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
505             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
506             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
507             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
508             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
509             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
510             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
511             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
512             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
513
514             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
515
516             /* Update potential sum for this i atom from the interaction with this j atom. */
517             velec            = _mm_and_ps(velec,cutoff_mask);
518             velecsum         = _mm_add_ps(velecsum,velec);
519
520             fscal            = felec;
521
522             fscal            = _mm_and_ps(fscal,cutoff_mask);
523
524             /* Calculate temporary vectorial force */
525             tx               = _mm_mul_ps(fscal,dx22);
526             ty               = _mm_mul_ps(fscal,dy22);
527             tz               = _mm_mul_ps(fscal,dz22);
528
529             /* Update vectorial force */
530             fix2             = _mm_add_ps(fix2,tx);
531             fiy2             = _mm_add_ps(fiy2,ty);
532             fiz2             = _mm_add_ps(fiz2,tz);
533
534             fjx2             = _mm_add_ps(fjx2,tx);
535             fjy2             = _mm_add_ps(fjy2,ty);
536             fjz2             = _mm_add_ps(fjz2,tz);
537
538             }
539
540             /**************************
541              * CALCULATE INTERACTIONS *
542              **************************/
543
544             if (gmx_mm_any_lt(rsq23,rcutoff2))
545             {
546
547             r23              = _mm_mul_ps(rsq23,rinv23);
548
549             /* EWALD ELECTROSTATICS */
550
551             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
552             ewrt             = _mm_mul_ps(r23,ewtabscale);
553             ewitab           = _mm_cvttps_epi32(ewrt);
554             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
555             ewitab           = _mm_slli_epi32(ewitab,2);
556             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
557             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
558             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
559             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
560             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
561             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
562             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
563             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
564             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
565
566             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
567
568             /* Update potential sum for this i atom from the interaction with this j atom. */
569             velec            = _mm_and_ps(velec,cutoff_mask);
570             velecsum         = _mm_add_ps(velecsum,velec);
571
572             fscal            = felec;
573
574             fscal            = _mm_and_ps(fscal,cutoff_mask);
575
576             /* Calculate temporary vectorial force */
577             tx               = _mm_mul_ps(fscal,dx23);
578             ty               = _mm_mul_ps(fscal,dy23);
579             tz               = _mm_mul_ps(fscal,dz23);
580
581             /* Update vectorial force */
582             fix2             = _mm_add_ps(fix2,tx);
583             fiy2             = _mm_add_ps(fiy2,ty);
584             fiz2             = _mm_add_ps(fiz2,tz);
585
586             fjx3             = _mm_add_ps(fjx3,tx);
587             fjy3             = _mm_add_ps(fjy3,ty);
588             fjz3             = _mm_add_ps(fjz3,tz);
589
590             }
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             if (gmx_mm_any_lt(rsq31,rcutoff2))
597             {
598
599             r31              = _mm_mul_ps(rsq31,rinv31);
600
601             /* EWALD ELECTROSTATICS */
602
603             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
604             ewrt             = _mm_mul_ps(r31,ewtabscale);
605             ewitab           = _mm_cvttps_epi32(ewrt);
606             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
607             ewitab           = _mm_slli_epi32(ewitab,2);
608             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
609             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
610             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
611             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
612             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
613             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
614             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
615             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
616             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
617
618             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
619
620             /* Update potential sum for this i atom from the interaction with this j atom. */
621             velec            = _mm_and_ps(velec,cutoff_mask);
622             velecsum         = _mm_add_ps(velecsum,velec);
623
624             fscal            = felec;
625
626             fscal            = _mm_and_ps(fscal,cutoff_mask);
627
628             /* Calculate temporary vectorial force */
629             tx               = _mm_mul_ps(fscal,dx31);
630             ty               = _mm_mul_ps(fscal,dy31);
631             tz               = _mm_mul_ps(fscal,dz31);
632
633             /* Update vectorial force */
634             fix3             = _mm_add_ps(fix3,tx);
635             fiy3             = _mm_add_ps(fiy3,ty);
636             fiz3             = _mm_add_ps(fiz3,tz);
637
638             fjx1             = _mm_add_ps(fjx1,tx);
639             fjy1             = _mm_add_ps(fjy1,ty);
640             fjz1             = _mm_add_ps(fjz1,tz);
641
642             }
643
644             /**************************
645              * CALCULATE INTERACTIONS *
646              **************************/
647
648             if (gmx_mm_any_lt(rsq32,rcutoff2))
649             {
650
651             r32              = _mm_mul_ps(rsq32,rinv32);
652
653             /* EWALD ELECTROSTATICS */
654
655             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
656             ewrt             = _mm_mul_ps(r32,ewtabscale);
657             ewitab           = _mm_cvttps_epi32(ewrt);
658             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
659             ewitab           = _mm_slli_epi32(ewitab,2);
660             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
661             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
662             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
663             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
664             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
665             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
666             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
667             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
668             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
669
670             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
671
672             /* Update potential sum for this i atom from the interaction with this j atom. */
673             velec            = _mm_and_ps(velec,cutoff_mask);
674             velecsum         = _mm_add_ps(velecsum,velec);
675
676             fscal            = felec;
677
678             fscal            = _mm_and_ps(fscal,cutoff_mask);
679
680             /* Calculate temporary vectorial force */
681             tx               = _mm_mul_ps(fscal,dx32);
682             ty               = _mm_mul_ps(fscal,dy32);
683             tz               = _mm_mul_ps(fscal,dz32);
684
685             /* Update vectorial force */
686             fix3             = _mm_add_ps(fix3,tx);
687             fiy3             = _mm_add_ps(fiy3,ty);
688             fiz3             = _mm_add_ps(fiz3,tz);
689
690             fjx2             = _mm_add_ps(fjx2,tx);
691             fjy2             = _mm_add_ps(fjy2,ty);
692             fjz2             = _mm_add_ps(fjz2,tz);
693
694             }
695
696             /**************************
697              * CALCULATE INTERACTIONS *
698              **************************/
699
700             if (gmx_mm_any_lt(rsq33,rcutoff2))
701             {
702
703             r33              = _mm_mul_ps(rsq33,rinv33);
704
705             /* EWALD ELECTROSTATICS */
706
707             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
708             ewrt             = _mm_mul_ps(r33,ewtabscale);
709             ewitab           = _mm_cvttps_epi32(ewrt);
710             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
711             ewitab           = _mm_slli_epi32(ewitab,2);
712             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
713             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
714             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
715             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
716             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
717             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
718             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
719             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
720             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
721
722             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
723
724             /* Update potential sum for this i atom from the interaction with this j atom. */
725             velec            = _mm_and_ps(velec,cutoff_mask);
726             velecsum         = _mm_add_ps(velecsum,velec);
727
728             fscal            = felec;
729
730             fscal            = _mm_and_ps(fscal,cutoff_mask);
731
732             /* Calculate temporary vectorial force */
733             tx               = _mm_mul_ps(fscal,dx33);
734             ty               = _mm_mul_ps(fscal,dy33);
735             tz               = _mm_mul_ps(fscal,dz33);
736
737             /* Update vectorial force */
738             fix3             = _mm_add_ps(fix3,tx);
739             fiy3             = _mm_add_ps(fiy3,ty);
740             fiz3             = _mm_add_ps(fiz3,tz);
741
742             fjx3             = _mm_add_ps(fjx3,tx);
743             fjy3             = _mm_add_ps(fjy3,ty);
744             fjz3             = _mm_add_ps(fjz3,tz);
745
746             }
747
748             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
749                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
750                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
751
752             /* Inner loop uses 414 flops */
753         }
754
755         if(jidx<j_index_end)
756         {
757
758             /* Get j neighbor index, and coordinate index */
759             jnrA             = jjnr[jidx];
760             jnrB             = jjnr[jidx+1];
761             jnrC             = jjnr[jidx+2];
762             jnrD             = jjnr[jidx+3];
763
764             /* Sign of each element will be negative for non-real atoms.
765              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
766              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
767              */
768             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
769             jnrA       = (jnrA>=0) ? jnrA : 0;
770             jnrB       = (jnrB>=0) ? jnrB : 0;
771             jnrC       = (jnrC>=0) ? jnrC : 0;
772             jnrD       = (jnrD>=0) ? jnrD : 0;
773
774             j_coord_offsetA  = DIM*jnrA;
775             j_coord_offsetB  = DIM*jnrB;
776             j_coord_offsetC  = DIM*jnrC;
777             j_coord_offsetD  = DIM*jnrD;
778
779             /* load j atom coordinates */
780             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
781                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
782                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
783
784             /* Calculate displacement vector */
785             dx11             = _mm_sub_ps(ix1,jx1);
786             dy11             = _mm_sub_ps(iy1,jy1);
787             dz11             = _mm_sub_ps(iz1,jz1);
788             dx12             = _mm_sub_ps(ix1,jx2);
789             dy12             = _mm_sub_ps(iy1,jy2);
790             dz12             = _mm_sub_ps(iz1,jz2);
791             dx13             = _mm_sub_ps(ix1,jx3);
792             dy13             = _mm_sub_ps(iy1,jy3);
793             dz13             = _mm_sub_ps(iz1,jz3);
794             dx21             = _mm_sub_ps(ix2,jx1);
795             dy21             = _mm_sub_ps(iy2,jy1);
796             dz21             = _mm_sub_ps(iz2,jz1);
797             dx22             = _mm_sub_ps(ix2,jx2);
798             dy22             = _mm_sub_ps(iy2,jy2);
799             dz22             = _mm_sub_ps(iz2,jz2);
800             dx23             = _mm_sub_ps(ix2,jx3);
801             dy23             = _mm_sub_ps(iy2,jy3);
802             dz23             = _mm_sub_ps(iz2,jz3);
803             dx31             = _mm_sub_ps(ix3,jx1);
804             dy31             = _mm_sub_ps(iy3,jy1);
805             dz31             = _mm_sub_ps(iz3,jz1);
806             dx32             = _mm_sub_ps(ix3,jx2);
807             dy32             = _mm_sub_ps(iy3,jy2);
808             dz32             = _mm_sub_ps(iz3,jz2);
809             dx33             = _mm_sub_ps(ix3,jx3);
810             dy33             = _mm_sub_ps(iy3,jy3);
811             dz33             = _mm_sub_ps(iz3,jz3);
812
813             /* Calculate squared distance and things based on it */
814             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
815             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
816             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
817             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
818             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
819             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
820             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
821             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
822             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
823
824             rinv11           = gmx_mm_invsqrt_ps(rsq11);
825             rinv12           = gmx_mm_invsqrt_ps(rsq12);
826             rinv13           = gmx_mm_invsqrt_ps(rsq13);
827             rinv21           = gmx_mm_invsqrt_ps(rsq21);
828             rinv22           = gmx_mm_invsqrt_ps(rsq22);
829             rinv23           = gmx_mm_invsqrt_ps(rsq23);
830             rinv31           = gmx_mm_invsqrt_ps(rsq31);
831             rinv32           = gmx_mm_invsqrt_ps(rsq32);
832             rinv33           = gmx_mm_invsqrt_ps(rsq33);
833
834             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
835             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
836             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
837             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
838             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
839             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
840             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
841             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
842             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
843
844             fjx1             = _mm_setzero_ps();
845             fjy1             = _mm_setzero_ps();
846             fjz1             = _mm_setzero_ps();
847             fjx2             = _mm_setzero_ps();
848             fjy2             = _mm_setzero_ps();
849             fjz2             = _mm_setzero_ps();
850             fjx3             = _mm_setzero_ps();
851             fjy3             = _mm_setzero_ps();
852             fjz3             = _mm_setzero_ps();
853
854             /**************************
855              * CALCULATE INTERACTIONS *
856              **************************/
857
858             if (gmx_mm_any_lt(rsq11,rcutoff2))
859             {
860
861             r11              = _mm_mul_ps(rsq11,rinv11);
862             r11              = _mm_andnot_ps(dummy_mask,r11);
863
864             /* EWALD ELECTROSTATICS */
865
866             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
867             ewrt             = _mm_mul_ps(r11,ewtabscale);
868             ewitab           = _mm_cvttps_epi32(ewrt);
869             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
870             ewitab           = _mm_slli_epi32(ewitab,2);
871             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
872             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
873             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
874             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
875             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
876             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
877             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
878             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
879             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
880
881             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
882
883             /* Update potential sum for this i atom from the interaction with this j atom. */
884             velec            = _mm_and_ps(velec,cutoff_mask);
885             velec            = _mm_andnot_ps(dummy_mask,velec);
886             velecsum         = _mm_add_ps(velecsum,velec);
887
888             fscal            = felec;
889
890             fscal            = _mm_and_ps(fscal,cutoff_mask);
891
892             fscal            = _mm_andnot_ps(dummy_mask,fscal);
893
894             /* Calculate temporary vectorial force */
895             tx               = _mm_mul_ps(fscal,dx11);
896             ty               = _mm_mul_ps(fscal,dy11);
897             tz               = _mm_mul_ps(fscal,dz11);
898
899             /* Update vectorial force */
900             fix1             = _mm_add_ps(fix1,tx);
901             fiy1             = _mm_add_ps(fiy1,ty);
902             fiz1             = _mm_add_ps(fiz1,tz);
903
904             fjx1             = _mm_add_ps(fjx1,tx);
905             fjy1             = _mm_add_ps(fjy1,ty);
906             fjz1             = _mm_add_ps(fjz1,tz);
907
908             }
909
910             /**************************
911              * CALCULATE INTERACTIONS *
912              **************************/
913
914             if (gmx_mm_any_lt(rsq12,rcutoff2))
915             {
916
917             r12              = _mm_mul_ps(rsq12,rinv12);
918             r12              = _mm_andnot_ps(dummy_mask,r12);
919
920             /* EWALD ELECTROSTATICS */
921
922             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
923             ewrt             = _mm_mul_ps(r12,ewtabscale);
924             ewitab           = _mm_cvttps_epi32(ewrt);
925             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
926             ewitab           = _mm_slli_epi32(ewitab,2);
927             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
928             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
929             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
930             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
931             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
932             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
933             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
934             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
935             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
936
937             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
938
939             /* Update potential sum for this i atom from the interaction with this j atom. */
940             velec            = _mm_and_ps(velec,cutoff_mask);
941             velec            = _mm_andnot_ps(dummy_mask,velec);
942             velecsum         = _mm_add_ps(velecsum,velec);
943
944             fscal            = felec;
945
946             fscal            = _mm_and_ps(fscal,cutoff_mask);
947
948             fscal            = _mm_andnot_ps(dummy_mask,fscal);
949
950             /* Calculate temporary vectorial force */
951             tx               = _mm_mul_ps(fscal,dx12);
952             ty               = _mm_mul_ps(fscal,dy12);
953             tz               = _mm_mul_ps(fscal,dz12);
954
955             /* Update vectorial force */
956             fix1             = _mm_add_ps(fix1,tx);
957             fiy1             = _mm_add_ps(fiy1,ty);
958             fiz1             = _mm_add_ps(fiz1,tz);
959
960             fjx2             = _mm_add_ps(fjx2,tx);
961             fjy2             = _mm_add_ps(fjy2,ty);
962             fjz2             = _mm_add_ps(fjz2,tz);
963
964             }
965
966             /**************************
967              * CALCULATE INTERACTIONS *
968              **************************/
969
970             if (gmx_mm_any_lt(rsq13,rcutoff2))
971             {
972
973             r13              = _mm_mul_ps(rsq13,rinv13);
974             r13              = _mm_andnot_ps(dummy_mask,r13);
975
976             /* EWALD ELECTROSTATICS */
977
978             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979             ewrt             = _mm_mul_ps(r13,ewtabscale);
980             ewitab           = _mm_cvttps_epi32(ewrt);
981             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
982             ewitab           = _mm_slli_epi32(ewitab,2);
983             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
984             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
985             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
986             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
987             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
988             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
989             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
990             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
991             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
992
993             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
994
995             /* Update potential sum for this i atom from the interaction with this j atom. */
996             velec            = _mm_and_ps(velec,cutoff_mask);
997             velec            = _mm_andnot_ps(dummy_mask,velec);
998             velecsum         = _mm_add_ps(velecsum,velec);
999
1000             fscal            = felec;
1001
1002             fscal            = _mm_and_ps(fscal,cutoff_mask);
1003
1004             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1005
1006             /* Calculate temporary vectorial force */
1007             tx               = _mm_mul_ps(fscal,dx13);
1008             ty               = _mm_mul_ps(fscal,dy13);
1009             tz               = _mm_mul_ps(fscal,dz13);
1010
1011             /* Update vectorial force */
1012             fix1             = _mm_add_ps(fix1,tx);
1013             fiy1             = _mm_add_ps(fiy1,ty);
1014             fiz1             = _mm_add_ps(fiz1,tz);
1015
1016             fjx3             = _mm_add_ps(fjx3,tx);
1017             fjy3             = _mm_add_ps(fjy3,ty);
1018             fjz3             = _mm_add_ps(fjz3,tz);
1019
1020             }
1021
1022             /**************************
1023              * CALCULATE INTERACTIONS *
1024              **************************/
1025
1026             if (gmx_mm_any_lt(rsq21,rcutoff2))
1027             {
1028
1029             r21              = _mm_mul_ps(rsq21,rinv21);
1030             r21              = _mm_andnot_ps(dummy_mask,r21);
1031
1032             /* EWALD ELECTROSTATICS */
1033
1034             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1035             ewrt             = _mm_mul_ps(r21,ewtabscale);
1036             ewitab           = _mm_cvttps_epi32(ewrt);
1037             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1038             ewitab           = _mm_slli_epi32(ewitab,2);
1039             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1040             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1041             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1042             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1043             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1044             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1045             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1046             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1047             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1048
1049             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1050
1051             /* Update potential sum for this i atom from the interaction with this j atom. */
1052             velec            = _mm_and_ps(velec,cutoff_mask);
1053             velec            = _mm_andnot_ps(dummy_mask,velec);
1054             velecsum         = _mm_add_ps(velecsum,velec);
1055
1056             fscal            = felec;
1057
1058             fscal            = _mm_and_ps(fscal,cutoff_mask);
1059
1060             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1061
1062             /* Calculate temporary vectorial force */
1063             tx               = _mm_mul_ps(fscal,dx21);
1064             ty               = _mm_mul_ps(fscal,dy21);
1065             tz               = _mm_mul_ps(fscal,dz21);
1066
1067             /* Update vectorial force */
1068             fix2             = _mm_add_ps(fix2,tx);
1069             fiy2             = _mm_add_ps(fiy2,ty);
1070             fiz2             = _mm_add_ps(fiz2,tz);
1071
1072             fjx1             = _mm_add_ps(fjx1,tx);
1073             fjy1             = _mm_add_ps(fjy1,ty);
1074             fjz1             = _mm_add_ps(fjz1,tz);
1075
1076             }
1077
1078             /**************************
1079              * CALCULATE INTERACTIONS *
1080              **************************/
1081
1082             if (gmx_mm_any_lt(rsq22,rcutoff2))
1083             {
1084
1085             r22              = _mm_mul_ps(rsq22,rinv22);
1086             r22              = _mm_andnot_ps(dummy_mask,r22);
1087
1088             /* EWALD ELECTROSTATICS */
1089
1090             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1091             ewrt             = _mm_mul_ps(r22,ewtabscale);
1092             ewitab           = _mm_cvttps_epi32(ewrt);
1093             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1094             ewitab           = _mm_slli_epi32(ewitab,2);
1095             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1096             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1097             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1098             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1099             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1100             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1101             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1102             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1103             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1104
1105             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1106
1107             /* Update potential sum for this i atom from the interaction with this j atom. */
1108             velec            = _mm_and_ps(velec,cutoff_mask);
1109             velec            = _mm_andnot_ps(dummy_mask,velec);
1110             velecsum         = _mm_add_ps(velecsum,velec);
1111
1112             fscal            = felec;
1113
1114             fscal            = _mm_and_ps(fscal,cutoff_mask);
1115
1116             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1117
1118             /* Calculate temporary vectorial force */
1119             tx               = _mm_mul_ps(fscal,dx22);
1120             ty               = _mm_mul_ps(fscal,dy22);
1121             tz               = _mm_mul_ps(fscal,dz22);
1122
1123             /* Update vectorial force */
1124             fix2             = _mm_add_ps(fix2,tx);
1125             fiy2             = _mm_add_ps(fiy2,ty);
1126             fiz2             = _mm_add_ps(fiz2,tz);
1127
1128             fjx2             = _mm_add_ps(fjx2,tx);
1129             fjy2             = _mm_add_ps(fjy2,ty);
1130             fjz2             = _mm_add_ps(fjz2,tz);
1131
1132             }
1133
1134             /**************************
1135              * CALCULATE INTERACTIONS *
1136              **************************/
1137
1138             if (gmx_mm_any_lt(rsq23,rcutoff2))
1139             {
1140
1141             r23              = _mm_mul_ps(rsq23,rinv23);
1142             r23              = _mm_andnot_ps(dummy_mask,r23);
1143
1144             /* EWALD ELECTROSTATICS */
1145
1146             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1147             ewrt             = _mm_mul_ps(r23,ewtabscale);
1148             ewitab           = _mm_cvttps_epi32(ewrt);
1149             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1150             ewitab           = _mm_slli_epi32(ewitab,2);
1151             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1152             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1153             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1154             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1155             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1156             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1157             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1158             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1159             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1160
1161             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1162
1163             /* Update potential sum for this i atom from the interaction with this j atom. */
1164             velec            = _mm_and_ps(velec,cutoff_mask);
1165             velec            = _mm_andnot_ps(dummy_mask,velec);
1166             velecsum         = _mm_add_ps(velecsum,velec);
1167
1168             fscal            = felec;
1169
1170             fscal            = _mm_and_ps(fscal,cutoff_mask);
1171
1172             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1173
1174             /* Calculate temporary vectorial force */
1175             tx               = _mm_mul_ps(fscal,dx23);
1176             ty               = _mm_mul_ps(fscal,dy23);
1177             tz               = _mm_mul_ps(fscal,dz23);
1178
1179             /* Update vectorial force */
1180             fix2             = _mm_add_ps(fix2,tx);
1181             fiy2             = _mm_add_ps(fiy2,ty);
1182             fiz2             = _mm_add_ps(fiz2,tz);
1183
1184             fjx3             = _mm_add_ps(fjx3,tx);
1185             fjy3             = _mm_add_ps(fjy3,ty);
1186             fjz3             = _mm_add_ps(fjz3,tz);
1187
1188             }
1189
1190             /**************************
1191              * CALCULATE INTERACTIONS *
1192              **************************/
1193
1194             if (gmx_mm_any_lt(rsq31,rcutoff2))
1195             {
1196
1197             r31              = _mm_mul_ps(rsq31,rinv31);
1198             r31              = _mm_andnot_ps(dummy_mask,r31);
1199
1200             /* EWALD ELECTROSTATICS */
1201
1202             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1203             ewrt             = _mm_mul_ps(r31,ewtabscale);
1204             ewitab           = _mm_cvttps_epi32(ewrt);
1205             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1206             ewitab           = _mm_slli_epi32(ewitab,2);
1207             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1208             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1209             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1210             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1211             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1212             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1213             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1214             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1215             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1216
1217             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1218
1219             /* Update potential sum for this i atom from the interaction with this j atom. */
1220             velec            = _mm_and_ps(velec,cutoff_mask);
1221             velec            = _mm_andnot_ps(dummy_mask,velec);
1222             velecsum         = _mm_add_ps(velecsum,velec);
1223
1224             fscal            = felec;
1225
1226             fscal            = _mm_and_ps(fscal,cutoff_mask);
1227
1228             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1229
1230             /* Calculate temporary vectorial force */
1231             tx               = _mm_mul_ps(fscal,dx31);
1232             ty               = _mm_mul_ps(fscal,dy31);
1233             tz               = _mm_mul_ps(fscal,dz31);
1234
1235             /* Update vectorial force */
1236             fix3             = _mm_add_ps(fix3,tx);
1237             fiy3             = _mm_add_ps(fiy3,ty);
1238             fiz3             = _mm_add_ps(fiz3,tz);
1239
1240             fjx1             = _mm_add_ps(fjx1,tx);
1241             fjy1             = _mm_add_ps(fjy1,ty);
1242             fjz1             = _mm_add_ps(fjz1,tz);
1243
1244             }
1245
1246             /**************************
1247              * CALCULATE INTERACTIONS *
1248              **************************/
1249
1250             if (gmx_mm_any_lt(rsq32,rcutoff2))
1251             {
1252
1253             r32              = _mm_mul_ps(rsq32,rinv32);
1254             r32              = _mm_andnot_ps(dummy_mask,r32);
1255
1256             /* EWALD ELECTROSTATICS */
1257
1258             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1259             ewrt             = _mm_mul_ps(r32,ewtabscale);
1260             ewitab           = _mm_cvttps_epi32(ewrt);
1261             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1262             ewitab           = _mm_slli_epi32(ewitab,2);
1263             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1264             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1265             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1266             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1267             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1268             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1269             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1270             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1271             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1272
1273             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1274
1275             /* Update potential sum for this i atom from the interaction with this j atom. */
1276             velec            = _mm_and_ps(velec,cutoff_mask);
1277             velec            = _mm_andnot_ps(dummy_mask,velec);
1278             velecsum         = _mm_add_ps(velecsum,velec);
1279
1280             fscal            = felec;
1281
1282             fscal            = _mm_and_ps(fscal,cutoff_mask);
1283
1284             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1285
1286             /* Calculate temporary vectorial force */
1287             tx               = _mm_mul_ps(fscal,dx32);
1288             ty               = _mm_mul_ps(fscal,dy32);
1289             tz               = _mm_mul_ps(fscal,dz32);
1290
1291             /* Update vectorial force */
1292             fix3             = _mm_add_ps(fix3,tx);
1293             fiy3             = _mm_add_ps(fiy3,ty);
1294             fiz3             = _mm_add_ps(fiz3,tz);
1295
1296             fjx2             = _mm_add_ps(fjx2,tx);
1297             fjy2             = _mm_add_ps(fjy2,ty);
1298             fjz2             = _mm_add_ps(fjz2,tz);
1299
1300             }
1301
1302             /**************************
1303              * CALCULATE INTERACTIONS *
1304              **************************/
1305
1306             if (gmx_mm_any_lt(rsq33,rcutoff2))
1307             {
1308
1309             r33              = _mm_mul_ps(rsq33,rinv33);
1310             r33              = _mm_andnot_ps(dummy_mask,r33);
1311
1312             /* EWALD ELECTROSTATICS */
1313
1314             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1315             ewrt             = _mm_mul_ps(r33,ewtabscale);
1316             ewitab           = _mm_cvttps_epi32(ewrt);
1317             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1318             ewitab           = _mm_slli_epi32(ewitab,2);
1319             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1320             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1321             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1322             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1323             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1324             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1325             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1326             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1327             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1328
1329             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1330
1331             /* Update potential sum for this i atom from the interaction with this j atom. */
1332             velec            = _mm_and_ps(velec,cutoff_mask);
1333             velec            = _mm_andnot_ps(dummy_mask,velec);
1334             velecsum         = _mm_add_ps(velecsum,velec);
1335
1336             fscal            = felec;
1337
1338             fscal            = _mm_and_ps(fscal,cutoff_mask);
1339
1340             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1341
1342             /* Calculate temporary vectorial force */
1343             tx               = _mm_mul_ps(fscal,dx33);
1344             ty               = _mm_mul_ps(fscal,dy33);
1345             tz               = _mm_mul_ps(fscal,dz33);
1346
1347             /* Update vectorial force */
1348             fix3             = _mm_add_ps(fix3,tx);
1349             fiy3             = _mm_add_ps(fiy3,ty);
1350             fiz3             = _mm_add_ps(fiz3,tz);
1351
1352             fjx3             = _mm_add_ps(fjx3,tx);
1353             fjy3             = _mm_add_ps(fjy3,ty);
1354             fjz3             = _mm_add_ps(fjz3,tz);
1355
1356             }
1357
1358             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
1359                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
1360                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1361
1362             /* Inner loop uses 423 flops */
1363         }
1364
1365         /* End of innermost loop */
1366
1367         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1368                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1369
1370         ggid                        = gid[iidx];
1371         /* Update potential energies */
1372         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1373
1374         /* Increment number of inner iterations */
1375         inneriter                  += j_index_end - j_index_start;
1376
1377         /* Outer loop uses 28 flops */
1378     }
1379
1380     /* Increment number of outer iterations */
1381     outeriter        += nri;
1382
1383     /* Update outer/inner flops */
1384
1385     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*423);
1386 }
1387 /*
1388  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
1389  * Electrostatics interaction: Ewald
1390  * VdW interaction:            None
1391  * Geometry:                   Water4-Water4
1392  * Calculate force/pot:        Force
1393  */
1394 void
1395 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
1396                     (t_nblist * gmx_restrict                nlist,
1397                      rvec * gmx_restrict                    xx,
1398                      rvec * gmx_restrict                    ff,
1399                      t_forcerec * gmx_restrict              fr,
1400                      t_mdatoms * gmx_restrict               mdatoms,
1401                      nb_kernel_data_t * gmx_restrict        kernel_data,
1402                      t_nrnb * gmx_restrict                  nrnb)
1403 {
1404     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1405      * just 0 for non-waters.
1406      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1407      * jnr indices corresponding to data put in the four positions in the SIMD register.
1408      */
1409     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1410     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1411     int              jnrA,jnrB,jnrC,jnrD;
1412     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1413     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1414     real             shX,shY,shZ,rcutoff_scalar;
1415     real             *shiftvec,*fshift,*x,*f;
1416     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1417     int              vdwioffset1;
1418     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1419     int              vdwioffset2;
1420     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1421     int              vdwioffset3;
1422     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1423     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1424     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1425     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1426     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1427     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1428     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1429     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1430     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1431     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1432     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1433     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1434     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1435     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1436     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1437     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1438     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1439     real             *charge;
1440     __m128i          ewitab;
1441     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1442     real             *ewtab;
1443     __m128           dummy_mask,cutoff_mask;
1444     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1445     __m128           one     = _mm_set1_ps(1.0);
1446     __m128           two     = _mm_set1_ps(2.0);
1447     x                = xx[0];
1448     f                = ff[0];
1449
1450     nri              = nlist->nri;
1451     iinr             = nlist->iinr;
1452     jindex           = nlist->jindex;
1453     jjnr             = nlist->jjnr;
1454     shiftidx         = nlist->shift;
1455     gid              = nlist->gid;
1456     shiftvec         = fr->shift_vec[0];
1457     fshift           = fr->fshift[0];
1458     facel            = _mm_set1_ps(fr->epsfac);
1459     charge           = mdatoms->chargeA;
1460
1461     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1462     ewtab            = fr->ic->tabq_coul_F;
1463     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1464     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1465
1466     /* Setup water-specific parameters */
1467     inr              = nlist->iinr[0];
1468     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1469     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1470     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1471
1472     jq1              = _mm_set1_ps(charge[inr+1]);
1473     jq2              = _mm_set1_ps(charge[inr+2]);
1474     jq3              = _mm_set1_ps(charge[inr+3]);
1475     qq11             = _mm_mul_ps(iq1,jq1);
1476     qq12             = _mm_mul_ps(iq1,jq2);
1477     qq13             = _mm_mul_ps(iq1,jq3);
1478     qq21             = _mm_mul_ps(iq2,jq1);
1479     qq22             = _mm_mul_ps(iq2,jq2);
1480     qq23             = _mm_mul_ps(iq2,jq3);
1481     qq31             = _mm_mul_ps(iq3,jq1);
1482     qq32             = _mm_mul_ps(iq3,jq2);
1483     qq33             = _mm_mul_ps(iq3,jq3);
1484
1485     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1486     rcutoff_scalar   = fr->rcoulomb;
1487     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1488     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1489
1490     /* Avoid stupid compiler warnings */
1491     jnrA = jnrB = jnrC = jnrD = 0;
1492     j_coord_offsetA = 0;
1493     j_coord_offsetB = 0;
1494     j_coord_offsetC = 0;
1495     j_coord_offsetD = 0;
1496
1497     outeriter        = 0;
1498     inneriter        = 0;
1499
1500     /* Start outer loop over neighborlists */
1501     for(iidx=0; iidx<nri; iidx++)
1502     {
1503         /* Load shift vector for this list */
1504         i_shift_offset   = DIM*shiftidx[iidx];
1505         shX              = shiftvec[i_shift_offset+XX];
1506         shY              = shiftvec[i_shift_offset+YY];
1507         shZ              = shiftvec[i_shift_offset+ZZ];
1508
1509         /* Load limits for loop over neighbors */
1510         j_index_start    = jindex[iidx];
1511         j_index_end      = jindex[iidx+1];
1512
1513         /* Get outer coordinate index */
1514         inr              = iinr[iidx];
1515         i_coord_offset   = DIM*inr;
1516
1517         /* Load i particle coords and add shift vector */
1518         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1519         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1520         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1521         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1522         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1523         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1524         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1525         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1526         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1527
1528         fix1             = _mm_setzero_ps();
1529         fiy1             = _mm_setzero_ps();
1530         fiz1             = _mm_setzero_ps();
1531         fix2             = _mm_setzero_ps();
1532         fiy2             = _mm_setzero_ps();
1533         fiz2             = _mm_setzero_ps();
1534         fix3             = _mm_setzero_ps();
1535         fiy3             = _mm_setzero_ps();
1536         fiz3             = _mm_setzero_ps();
1537
1538         /* Start inner kernel loop */
1539         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1540         {
1541
1542             /* Get j neighbor index, and coordinate index */
1543             jnrA             = jjnr[jidx];
1544             jnrB             = jjnr[jidx+1];
1545             jnrC             = jjnr[jidx+2];
1546             jnrD             = jjnr[jidx+3];
1547
1548             j_coord_offsetA  = DIM*jnrA;
1549             j_coord_offsetB  = DIM*jnrB;
1550             j_coord_offsetC  = DIM*jnrC;
1551             j_coord_offsetD  = DIM*jnrD;
1552
1553             /* load j atom coordinates */
1554             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1555                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1556                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1557
1558             /* Calculate displacement vector */
1559             dx11             = _mm_sub_ps(ix1,jx1);
1560             dy11             = _mm_sub_ps(iy1,jy1);
1561             dz11             = _mm_sub_ps(iz1,jz1);
1562             dx12             = _mm_sub_ps(ix1,jx2);
1563             dy12             = _mm_sub_ps(iy1,jy2);
1564             dz12             = _mm_sub_ps(iz1,jz2);
1565             dx13             = _mm_sub_ps(ix1,jx3);
1566             dy13             = _mm_sub_ps(iy1,jy3);
1567             dz13             = _mm_sub_ps(iz1,jz3);
1568             dx21             = _mm_sub_ps(ix2,jx1);
1569             dy21             = _mm_sub_ps(iy2,jy1);
1570             dz21             = _mm_sub_ps(iz2,jz1);
1571             dx22             = _mm_sub_ps(ix2,jx2);
1572             dy22             = _mm_sub_ps(iy2,jy2);
1573             dz22             = _mm_sub_ps(iz2,jz2);
1574             dx23             = _mm_sub_ps(ix2,jx3);
1575             dy23             = _mm_sub_ps(iy2,jy3);
1576             dz23             = _mm_sub_ps(iz2,jz3);
1577             dx31             = _mm_sub_ps(ix3,jx1);
1578             dy31             = _mm_sub_ps(iy3,jy1);
1579             dz31             = _mm_sub_ps(iz3,jz1);
1580             dx32             = _mm_sub_ps(ix3,jx2);
1581             dy32             = _mm_sub_ps(iy3,jy2);
1582             dz32             = _mm_sub_ps(iz3,jz2);
1583             dx33             = _mm_sub_ps(ix3,jx3);
1584             dy33             = _mm_sub_ps(iy3,jy3);
1585             dz33             = _mm_sub_ps(iz3,jz3);
1586
1587             /* Calculate squared distance and things based on it */
1588             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1589             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1590             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1591             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1592             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1593             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1594             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1595             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1596             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1597
1598             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1599             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1600             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1601             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1602             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1603             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1604             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1605             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1606             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1607
1608             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1609             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1610             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1611             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1612             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1613             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1614             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1615             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1616             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1617
1618             fjx1             = _mm_setzero_ps();
1619             fjy1             = _mm_setzero_ps();
1620             fjz1             = _mm_setzero_ps();
1621             fjx2             = _mm_setzero_ps();
1622             fjy2             = _mm_setzero_ps();
1623             fjz2             = _mm_setzero_ps();
1624             fjx3             = _mm_setzero_ps();
1625             fjy3             = _mm_setzero_ps();
1626             fjz3             = _mm_setzero_ps();
1627
1628             /**************************
1629              * CALCULATE INTERACTIONS *
1630              **************************/
1631
1632             if (gmx_mm_any_lt(rsq11,rcutoff2))
1633             {
1634
1635             r11              = _mm_mul_ps(rsq11,rinv11);
1636
1637             /* EWALD ELECTROSTATICS */
1638
1639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640             ewrt             = _mm_mul_ps(r11,ewtabscale);
1641             ewitab           = _mm_cvttps_epi32(ewrt);
1642             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1643             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1644                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1645                                          &ewtabF,&ewtabFn);
1646             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1647             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1648
1649             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1650
1651             fscal            = felec;
1652
1653             fscal            = _mm_and_ps(fscal,cutoff_mask);
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm_mul_ps(fscal,dx11);
1657             ty               = _mm_mul_ps(fscal,dy11);
1658             tz               = _mm_mul_ps(fscal,dz11);
1659
1660             /* Update vectorial force */
1661             fix1             = _mm_add_ps(fix1,tx);
1662             fiy1             = _mm_add_ps(fiy1,ty);
1663             fiz1             = _mm_add_ps(fiz1,tz);
1664
1665             fjx1             = _mm_add_ps(fjx1,tx);
1666             fjy1             = _mm_add_ps(fjy1,ty);
1667             fjz1             = _mm_add_ps(fjz1,tz);
1668
1669             }
1670
1671             /**************************
1672              * CALCULATE INTERACTIONS *
1673              **************************/
1674
1675             if (gmx_mm_any_lt(rsq12,rcutoff2))
1676             {
1677
1678             r12              = _mm_mul_ps(rsq12,rinv12);
1679
1680             /* EWALD ELECTROSTATICS */
1681
1682             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683             ewrt             = _mm_mul_ps(r12,ewtabscale);
1684             ewitab           = _mm_cvttps_epi32(ewrt);
1685             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1686             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1687                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1688                                          &ewtabF,&ewtabFn);
1689             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1690             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1691
1692             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1693
1694             fscal            = felec;
1695
1696             fscal            = _mm_and_ps(fscal,cutoff_mask);
1697
1698             /* Calculate temporary vectorial force */
1699             tx               = _mm_mul_ps(fscal,dx12);
1700             ty               = _mm_mul_ps(fscal,dy12);
1701             tz               = _mm_mul_ps(fscal,dz12);
1702
1703             /* Update vectorial force */
1704             fix1             = _mm_add_ps(fix1,tx);
1705             fiy1             = _mm_add_ps(fiy1,ty);
1706             fiz1             = _mm_add_ps(fiz1,tz);
1707
1708             fjx2             = _mm_add_ps(fjx2,tx);
1709             fjy2             = _mm_add_ps(fjy2,ty);
1710             fjz2             = _mm_add_ps(fjz2,tz);
1711
1712             }
1713
1714             /**************************
1715              * CALCULATE INTERACTIONS *
1716              **************************/
1717
1718             if (gmx_mm_any_lt(rsq13,rcutoff2))
1719             {
1720
1721             r13              = _mm_mul_ps(rsq13,rinv13);
1722
1723             /* EWALD ELECTROSTATICS */
1724
1725             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726             ewrt             = _mm_mul_ps(r13,ewtabscale);
1727             ewitab           = _mm_cvttps_epi32(ewrt);
1728             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1729             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1730                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1731                                          &ewtabF,&ewtabFn);
1732             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1733             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1734
1735             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1736
1737             fscal            = felec;
1738
1739             fscal            = _mm_and_ps(fscal,cutoff_mask);
1740
1741             /* Calculate temporary vectorial force */
1742             tx               = _mm_mul_ps(fscal,dx13);
1743             ty               = _mm_mul_ps(fscal,dy13);
1744             tz               = _mm_mul_ps(fscal,dz13);
1745
1746             /* Update vectorial force */
1747             fix1             = _mm_add_ps(fix1,tx);
1748             fiy1             = _mm_add_ps(fiy1,ty);
1749             fiz1             = _mm_add_ps(fiz1,tz);
1750
1751             fjx3             = _mm_add_ps(fjx3,tx);
1752             fjy3             = _mm_add_ps(fjy3,ty);
1753             fjz3             = _mm_add_ps(fjz3,tz);
1754
1755             }
1756
1757             /**************************
1758              * CALCULATE INTERACTIONS *
1759              **************************/
1760
1761             if (gmx_mm_any_lt(rsq21,rcutoff2))
1762             {
1763
1764             r21              = _mm_mul_ps(rsq21,rinv21);
1765
1766             /* EWALD ELECTROSTATICS */
1767
1768             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769             ewrt             = _mm_mul_ps(r21,ewtabscale);
1770             ewitab           = _mm_cvttps_epi32(ewrt);
1771             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1772             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1773                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1774                                          &ewtabF,&ewtabFn);
1775             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1776             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1777
1778             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1779
1780             fscal            = felec;
1781
1782             fscal            = _mm_and_ps(fscal,cutoff_mask);
1783
1784             /* Calculate temporary vectorial force */
1785             tx               = _mm_mul_ps(fscal,dx21);
1786             ty               = _mm_mul_ps(fscal,dy21);
1787             tz               = _mm_mul_ps(fscal,dz21);
1788
1789             /* Update vectorial force */
1790             fix2             = _mm_add_ps(fix2,tx);
1791             fiy2             = _mm_add_ps(fiy2,ty);
1792             fiz2             = _mm_add_ps(fiz2,tz);
1793
1794             fjx1             = _mm_add_ps(fjx1,tx);
1795             fjy1             = _mm_add_ps(fjy1,ty);
1796             fjz1             = _mm_add_ps(fjz1,tz);
1797
1798             }
1799
1800             /**************************
1801              * CALCULATE INTERACTIONS *
1802              **************************/
1803
1804             if (gmx_mm_any_lt(rsq22,rcutoff2))
1805             {
1806
1807             r22              = _mm_mul_ps(rsq22,rinv22);
1808
1809             /* EWALD ELECTROSTATICS */
1810
1811             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1812             ewrt             = _mm_mul_ps(r22,ewtabscale);
1813             ewitab           = _mm_cvttps_epi32(ewrt);
1814             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1815             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1816                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1817                                          &ewtabF,&ewtabFn);
1818             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1819             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1820
1821             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1822
1823             fscal            = felec;
1824
1825             fscal            = _mm_and_ps(fscal,cutoff_mask);
1826
1827             /* Calculate temporary vectorial force */
1828             tx               = _mm_mul_ps(fscal,dx22);
1829             ty               = _mm_mul_ps(fscal,dy22);
1830             tz               = _mm_mul_ps(fscal,dz22);
1831
1832             /* Update vectorial force */
1833             fix2             = _mm_add_ps(fix2,tx);
1834             fiy2             = _mm_add_ps(fiy2,ty);
1835             fiz2             = _mm_add_ps(fiz2,tz);
1836
1837             fjx2             = _mm_add_ps(fjx2,tx);
1838             fjy2             = _mm_add_ps(fjy2,ty);
1839             fjz2             = _mm_add_ps(fjz2,tz);
1840
1841             }
1842
1843             /**************************
1844              * CALCULATE INTERACTIONS *
1845              **************************/
1846
1847             if (gmx_mm_any_lt(rsq23,rcutoff2))
1848             {
1849
1850             r23              = _mm_mul_ps(rsq23,rinv23);
1851
1852             /* EWALD ELECTROSTATICS */
1853
1854             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1855             ewrt             = _mm_mul_ps(r23,ewtabscale);
1856             ewitab           = _mm_cvttps_epi32(ewrt);
1857             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1858             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1859                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1860                                          &ewtabF,&ewtabFn);
1861             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1862             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1863
1864             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1865
1866             fscal            = felec;
1867
1868             fscal            = _mm_and_ps(fscal,cutoff_mask);
1869
1870             /* Calculate temporary vectorial force */
1871             tx               = _mm_mul_ps(fscal,dx23);
1872             ty               = _mm_mul_ps(fscal,dy23);
1873             tz               = _mm_mul_ps(fscal,dz23);
1874
1875             /* Update vectorial force */
1876             fix2             = _mm_add_ps(fix2,tx);
1877             fiy2             = _mm_add_ps(fiy2,ty);
1878             fiz2             = _mm_add_ps(fiz2,tz);
1879
1880             fjx3             = _mm_add_ps(fjx3,tx);
1881             fjy3             = _mm_add_ps(fjy3,ty);
1882             fjz3             = _mm_add_ps(fjz3,tz);
1883
1884             }
1885
1886             /**************************
1887              * CALCULATE INTERACTIONS *
1888              **************************/
1889
1890             if (gmx_mm_any_lt(rsq31,rcutoff2))
1891             {
1892
1893             r31              = _mm_mul_ps(rsq31,rinv31);
1894
1895             /* EWALD ELECTROSTATICS */
1896
1897             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1898             ewrt             = _mm_mul_ps(r31,ewtabscale);
1899             ewitab           = _mm_cvttps_epi32(ewrt);
1900             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1901             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1902                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1903                                          &ewtabF,&ewtabFn);
1904             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1905             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1906
1907             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1908
1909             fscal            = felec;
1910
1911             fscal            = _mm_and_ps(fscal,cutoff_mask);
1912
1913             /* Calculate temporary vectorial force */
1914             tx               = _mm_mul_ps(fscal,dx31);
1915             ty               = _mm_mul_ps(fscal,dy31);
1916             tz               = _mm_mul_ps(fscal,dz31);
1917
1918             /* Update vectorial force */
1919             fix3             = _mm_add_ps(fix3,tx);
1920             fiy3             = _mm_add_ps(fiy3,ty);
1921             fiz3             = _mm_add_ps(fiz3,tz);
1922
1923             fjx1             = _mm_add_ps(fjx1,tx);
1924             fjy1             = _mm_add_ps(fjy1,ty);
1925             fjz1             = _mm_add_ps(fjz1,tz);
1926
1927             }
1928
1929             /**************************
1930              * CALCULATE INTERACTIONS *
1931              **************************/
1932
1933             if (gmx_mm_any_lt(rsq32,rcutoff2))
1934             {
1935
1936             r32              = _mm_mul_ps(rsq32,rinv32);
1937
1938             /* EWALD ELECTROSTATICS */
1939
1940             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1941             ewrt             = _mm_mul_ps(r32,ewtabscale);
1942             ewitab           = _mm_cvttps_epi32(ewrt);
1943             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1944             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1945                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1946                                          &ewtabF,&ewtabFn);
1947             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1948             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1949
1950             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1951
1952             fscal            = felec;
1953
1954             fscal            = _mm_and_ps(fscal,cutoff_mask);
1955
1956             /* Calculate temporary vectorial force */
1957             tx               = _mm_mul_ps(fscal,dx32);
1958             ty               = _mm_mul_ps(fscal,dy32);
1959             tz               = _mm_mul_ps(fscal,dz32);
1960
1961             /* Update vectorial force */
1962             fix3             = _mm_add_ps(fix3,tx);
1963             fiy3             = _mm_add_ps(fiy3,ty);
1964             fiz3             = _mm_add_ps(fiz3,tz);
1965
1966             fjx2             = _mm_add_ps(fjx2,tx);
1967             fjy2             = _mm_add_ps(fjy2,ty);
1968             fjz2             = _mm_add_ps(fjz2,tz);
1969
1970             }
1971
1972             /**************************
1973              * CALCULATE INTERACTIONS *
1974              **************************/
1975
1976             if (gmx_mm_any_lt(rsq33,rcutoff2))
1977             {
1978
1979             r33              = _mm_mul_ps(rsq33,rinv33);
1980
1981             /* EWALD ELECTROSTATICS */
1982
1983             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1984             ewrt             = _mm_mul_ps(r33,ewtabscale);
1985             ewitab           = _mm_cvttps_epi32(ewrt);
1986             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1987             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1988                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1989                                          &ewtabF,&ewtabFn);
1990             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1991             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1992
1993             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1994
1995             fscal            = felec;
1996
1997             fscal            = _mm_and_ps(fscal,cutoff_mask);
1998
1999             /* Calculate temporary vectorial force */
2000             tx               = _mm_mul_ps(fscal,dx33);
2001             ty               = _mm_mul_ps(fscal,dy33);
2002             tz               = _mm_mul_ps(fscal,dz33);
2003
2004             /* Update vectorial force */
2005             fix3             = _mm_add_ps(fix3,tx);
2006             fiy3             = _mm_add_ps(fiy3,ty);
2007             fiz3             = _mm_add_ps(fiz3,tz);
2008
2009             fjx3             = _mm_add_ps(fjx3,tx);
2010             fjy3             = _mm_add_ps(fjy3,ty);
2011             fjz3             = _mm_add_ps(fjz3,tz);
2012
2013             }
2014
2015             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
2016                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
2017                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2018
2019             /* Inner loop uses 351 flops */
2020         }
2021
2022         if(jidx<j_index_end)
2023         {
2024
2025             /* Get j neighbor index, and coordinate index */
2026             jnrA             = jjnr[jidx];
2027             jnrB             = jjnr[jidx+1];
2028             jnrC             = jjnr[jidx+2];
2029             jnrD             = jjnr[jidx+3];
2030
2031             /* Sign of each element will be negative for non-real atoms.
2032              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2033              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2034              */
2035             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2036             jnrA       = (jnrA>=0) ? jnrA : 0;
2037             jnrB       = (jnrB>=0) ? jnrB : 0;
2038             jnrC       = (jnrC>=0) ? jnrC : 0;
2039             jnrD       = (jnrD>=0) ? jnrD : 0;
2040
2041             j_coord_offsetA  = DIM*jnrA;
2042             j_coord_offsetB  = DIM*jnrB;
2043             j_coord_offsetC  = DIM*jnrC;
2044             j_coord_offsetD  = DIM*jnrD;
2045
2046             /* load j atom coordinates */
2047             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2048                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2049                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2050
2051             /* Calculate displacement vector */
2052             dx11             = _mm_sub_ps(ix1,jx1);
2053             dy11             = _mm_sub_ps(iy1,jy1);
2054             dz11             = _mm_sub_ps(iz1,jz1);
2055             dx12             = _mm_sub_ps(ix1,jx2);
2056             dy12             = _mm_sub_ps(iy1,jy2);
2057             dz12             = _mm_sub_ps(iz1,jz2);
2058             dx13             = _mm_sub_ps(ix1,jx3);
2059             dy13             = _mm_sub_ps(iy1,jy3);
2060             dz13             = _mm_sub_ps(iz1,jz3);
2061             dx21             = _mm_sub_ps(ix2,jx1);
2062             dy21             = _mm_sub_ps(iy2,jy1);
2063             dz21             = _mm_sub_ps(iz2,jz1);
2064             dx22             = _mm_sub_ps(ix2,jx2);
2065             dy22             = _mm_sub_ps(iy2,jy2);
2066             dz22             = _mm_sub_ps(iz2,jz2);
2067             dx23             = _mm_sub_ps(ix2,jx3);
2068             dy23             = _mm_sub_ps(iy2,jy3);
2069             dz23             = _mm_sub_ps(iz2,jz3);
2070             dx31             = _mm_sub_ps(ix3,jx1);
2071             dy31             = _mm_sub_ps(iy3,jy1);
2072             dz31             = _mm_sub_ps(iz3,jz1);
2073             dx32             = _mm_sub_ps(ix3,jx2);
2074             dy32             = _mm_sub_ps(iy3,jy2);
2075             dz32             = _mm_sub_ps(iz3,jz2);
2076             dx33             = _mm_sub_ps(ix3,jx3);
2077             dy33             = _mm_sub_ps(iy3,jy3);
2078             dz33             = _mm_sub_ps(iz3,jz3);
2079
2080             /* Calculate squared distance and things based on it */
2081             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2082             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2083             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2084             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2085             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2086             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2087             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2088             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2089             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2090
2091             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2092             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2093             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2094             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2095             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2096             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2097             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2098             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2099             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2100
2101             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2102             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2103             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2104             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2105             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2106             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2107             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2108             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2109             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2110
2111             fjx1             = _mm_setzero_ps();
2112             fjy1             = _mm_setzero_ps();
2113             fjz1             = _mm_setzero_ps();
2114             fjx2             = _mm_setzero_ps();
2115             fjy2             = _mm_setzero_ps();
2116             fjz2             = _mm_setzero_ps();
2117             fjx3             = _mm_setzero_ps();
2118             fjy3             = _mm_setzero_ps();
2119             fjz3             = _mm_setzero_ps();
2120
2121             /**************************
2122              * CALCULATE INTERACTIONS *
2123              **************************/
2124
2125             if (gmx_mm_any_lt(rsq11,rcutoff2))
2126             {
2127
2128             r11              = _mm_mul_ps(rsq11,rinv11);
2129             r11              = _mm_andnot_ps(dummy_mask,r11);
2130
2131             /* EWALD ELECTROSTATICS */
2132
2133             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2134             ewrt             = _mm_mul_ps(r11,ewtabscale);
2135             ewitab           = _mm_cvttps_epi32(ewrt);
2136             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2137             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2138                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2139                                          &ewtabF,&ewtabFn);
2140             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2141             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2142
2143             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2144
2145             fscal            = felec;
2146
2147             fscal            = _mm_and_ps(fscal,cutoff_mask);
2148
2149             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2150
2151             /* Calculate temporary vectorial force */
2152             tx               = _mm_mul_ps(fscal,dx11);
2153             ty               = _mm_mul_ps(fscal,dy11);
2154             tz               = _mm_mul_ps(fscal,dz11);
2155
2156             /* Update vectorial force */
2157             fix1             = _mm_add_ps(fix1,tx);
2158             fiy1             = _mm_add_ps(fiy1,ty);
2159             fiz1             = _mm_add_ps(fiz1,tz);
2160
2161             fjx1             = _mm_add_ps(fjx1,tx);
2162             fjy1             = _mm_add_ps(fjy1,ty);
2163             fjz1             = _mm_add_ps(fjz1,tz);
2164
2165             }
2166
2167             /**************************
2168              * CALCULATE INTERACTIONS *
2169              **************************/
2170
2171             if (gmx_mm_any_lt(rsq12,rcutoff2))
2172             {
2173
2174             r12              = _mm_mul_ps(rsq12,rinv12);
2175             r12              = _mm_andnot_ps(dummy_mask,r12);
2176
2177             /* EWALD ELECTROSTATICS */
2178
2179             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2180             ewrt             = _mm_mul_ps(r12,ewtabscale);
2181             ewitab           = _mm_cvttps_epi32(ewrt);
2182             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2183             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2184                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2185                                          &ewtabF,&ewtabFn);
2186             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2187             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2188
2189             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2190
2191             fscal            = felec;
2192
2193             fscal            = _mm_and_ps(fscal,cutoff_mask);
2194
2195             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2196
2197             /* Calculate temporary vectorial force */
2198             tx               = _mm_mul_ps(fscal,dx12);
2199             ty               = _mm_mul_ps(fscal,dy12);
2200             tz               = _mm_mul_ps(fscal,dz12);
2201
2202             /* Update vectorial force */
2203             fix1             = _mm_add_ps(fix1,tx);
2204             fiy1             = _mm_add_ps(fiy1,ty);
2205             fiz1             = _mm_add_ps(fiz1,tz);
2206
2207             fjx2             = _mm_add_ps(fjx2,tx);
2208             fjy2             = _mm_add_ps(fjy2,ty);
2209             fjz2             = _mm_add_ps(fjz2,tz);
2210
2211             }
2212
2213             /**************************
2214              * CALCULATE INTERACTIONS *
2215              **************************/
2216
2217             if (gmx_mm_any_lt(rsq13,rcutoff2))
2218             {
2219
2220             r13              = _mm_mul_ps(rsq13,rinv13);
2221             r13              = _mm_andnot_ps(dummy_mask,r13);
2222
2223             /* EWALD ELECTROSTATICS */
2224
2225             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2226             ewrt             = _mm_mul_ps(r13,ewtabscale);
2227             ewitab           = _mm_cvttps_epi32(ewrt);
2228             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2229             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2230                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2231                                          &ewtabF,&ewtabFn);
2232             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2233             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2234
2235             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2236
2237             fscal            = felec;
2238
2239             fscal            = _mm_and_ps(fscal,cutoff_mask);
2240
2241             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2242
2243             /* Calculate temporary vectorial force */
2244             tx               = _mm_mul_ps(fscal,dx13);
2245             ty               = _mm_mul_ps(fscal,dy13);
2246             tz               = _mm_mul_ps(fscal,dz13);
2247
2248             /* Update vectorial force */
2249             fix1             = _mm_add_ps(fix1,tx);
2250             fiy1             = _mm_add_ps(fiy1,ty);
2251             fiz1             = _mm_add_ps(fiz1,tz);
2252
2253             fjx3             = _mm_add_ps(fjx3,tx);
2254             fjy3             = _mm_add_ps(fjy3,ty);
2255             fjz3             = _mm_add_ps(fjz3,tz);
2256
2257             }
2258
2259             /**************************
2260              * CALCULATE INTERACTIONS *
2261              **************************/
2262
2263             if (gmx_mm_any_lt(rsq21,rcutoff2))
2264             {
2265
2266             r21              = _mm_mul_ps(rsq21,rinv21);
2267             r21              = _mm_andnot_ps(dummy_mask,r21);
2268
2269             /* EWALD ELECTROSTATICS */
2270
2271             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2272             ewrt             = _mm_mul_ps(r21,ewtabscale);
2273             ewitab           = _mm_cvttps_epi32(ewrt);
2274             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2275             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2276                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2277                                          &ewtabF,&ewtabFn);
2278             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2279             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2280
2281             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2282
2283             fscal            = felec;
2284
2285             fscal            = _mm_and_ps(fscal,cutoff_mask);
2286
2287             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2288
2289             /* Calculate temporary vectorial force */
2290             tx               = _mm_mul_ps(fscal,dx21);
2291             ty               = _mm_mul_ps(fscal,dy21);
2292             tz               = _mm_mul_ps(fscal,dz21);
2293
2294             /* Update vectorial force */
2295             fix2             = _mm_add_ps(fix2,tx);
2296             fiy2             = _mm_add_ps(fiy2,ty);
2297             fiz2             = _mm_add_ps(fiz2,tz);
2298
2299             fjx1             = _mm_add_ps(fjx1,tx);
2300             fjy1             = _mm_add_ps(fjy1,ty);
2301             fjz1             = _mm_add_ps(fjz1,tz);
2302
2303             }
2304
2305             /**************************
2306              * CALCULATE INTERACTIONS *
2307              **************************/
2308
2309             if (gmx_mm_any_lt(rsq22,rcutoff2))
2310             {
2311
2312             r22              = _mm_mul_ps(rsq22,rinv22);
2313             r22              = _mm_andnot_ps(dummy_mask,r22);
2314
2315             /* EWALD ELECTROSTATICS */
2316
2317             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2318             ewrt             = _mm_mul_ps(r22,ewtabscale);
2319             ewitab           = _mm_cvttps_epi32(ewrt);
2320             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2321             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2322                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2323                                          &ewtabF,&ewtabFn);
2324             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2325             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2326
2327             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2328
2329             fscal            = felec;
2330
2331             fscal            = _mm_and_ps(fscal,cutoff_mask);
2332
2333             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2334
2335             /* Calculate temporary vectorial force */
2336             tx               = _mm_mul_ps(fscal,dx22);
2337             ty               = _mm_mul_ps(fscal,dy22);
2338             tz               = _mm_mul_ps(fscal,dz22);
2339
2340             /* Update vectorial force */
2341             fix2             = _mm_add_ps(fix2,tx);
2342             fiy2             = _mm_add_ps(fiy2,ty);
2343             fiz2             = _mm_add_ps(fiz2,tz);
2344
2345             fjx2             = _mm_add_ps(fjx2,tx);
2346             fjy2             = _mm_add_ps(fjy2,ty);
2347             fjz2             = _mm_add_ps(fjz2,tz);
2348
2349             }
2350
2351             /**************************
2352              * CALCULATE INTERACTIONS *
2353              **************************/
2354
2355             if (gmx_mm_any_lt(rsq23,rcutoff2))
2356             {
2357
2358             r23              = _mm_mul_ps(rsq23,rinv23);
2359             r23              = _mm_andnot_ps(dummy_mask,r23);
2360
2361             /* EWALD ELECTROSTATICS */
2362
2363             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2364             ewrt             = _mm_mul_ps(r23,ewtabscale);
2365             ewitab           = _mm_cvttps_epi32(ewrt);
2366             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2367             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2368                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2369                                          &ewtabF,&ewtabFn);
2370             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2371             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2372
2373             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2374
2375             fscal            = felec;
2376
2377             fscal            = _mm_and_ps(fscal,cutoff_mask);
2378
2379             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2380
2381             /* Calculate temporary vectorial force */
2382             tx               = _mm_mul_ps(fscal,dx23);
2383             ty               = _mm_mul_ps(fscal,dy23);
2384             tz               = _mm_mul_ps(fscal,dz23);
2385
2386             /* Update vectorial force */
2387             fix2             = _mm_add_ps(fix2,tx);
2388             fiy2             = _mm_add_ps(fiy2,ty);
2389             fiz2             = _mm_add_ps(fiz2,tz);
2390
2391             fjx3             = _mm_add_ps(fjx3,tx);
2392             fjy3             = _mm_add_ps(fjy3,ty);
2393             fjz3             = _mm_add_ps(fjz3,tz);
2394
2395             }
2396
2397             /**************************
2398              * CALCULATE INTERACTIONS *
2399              **************************/
2400
2401             if (gmx_mm_any_lt(rsq31,rcutoff2))
2402             {
2403
2404             r31              = _mm_mul_ps(rsq31,rinv31);
2405             r31              = _mm_andnot_ps(dummy_mask,r31);
2406
2407             /* EWALD ELECTROSTATICS */
2408
2409             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2410             ewrt             = _mm_mul_ps(r31,ewtabscale);
2411             ewitab           = _mm_cvttps_epi32(ewrt);
2412             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2413             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2414                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2415                                          &ewtabF,&ewtabFn);
2416             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2417             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2418
2419             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2420
2421             fscal            = felec;
2422
2423             fscal            = _mm_and_ps(fscal,cutoff_mask);
2424
2425             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2426
2427             /* Calculate temporary vectorial force */
2428             tx               = _mm_mul_ps(fscal,dx31);
2429             ty               = _mm_mul_ps(fscal,dy31);
2430             tz               = _mm_mul_ps(fscal,dz31);
2431
2432             /* Update vectorial force */
2433             fix3             = _mm_add_ps(fix3,tx);
2434             fiy3             = _mm_add_ps(fiy3,ty);
2435             fiz3             = _mm_add_ps(fiz3,tz);
2436
2437             fjx1             = _mm_add_ps(fjx1,tx);
2438             fjy1             = _mm_add_ps(fjy1,ty);
2439             fjz1             = _mm_add_ps(fjz1,tz);
2440
2441             }
2442
2443             /**************************
2444              * CALCULATE INTERACTIONS *
2445              **************************/
2446
2447             if (gmx_mm_any_lt(rsq32,rcutoff2))
2448             {
2449
2450             r32              = _mm_mul_ps(rsq32,rinv32);
2451             r32              = _mm_andnot_ps(dummy_mask,r32);
2452
2453             /* EWALD ELECTROSTATICS */
2454
2455             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2456             ewrt             = _mm_mul_ps(r32,ewtabscale);
2457             ewitab           = _mm_cvttps_epi32(ewrt);
2458             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2459             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2460                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2461                                          &ewtabF,&ewtabFn);
2462             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2463             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2464
2465             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2466
2467             fscal            = felec;
2468
2469             fscal            = _mm_and_ps(fscal,cutoff_mask);
2470
2471             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2472
2473             /* Calculate temporary vectorial force */
2474             tx               = _mm_mul_ps(fscal,dx32);
2475             ty               = _mm_mul_ps(fscal,dy32);
2476             tz               = _mm_mul_ps(fscal,dz32);
2477
2478             /* Update vectorial force */
2479             fix3             = _mm_add_ps(fix3,tx);
2480             fiy3             = _mm_add_ps(fiy3,ty);
2481             fiz3             = _mm_add_ps(fiz3,tz);
2482
2483             fjx2             = _mm_add_ps(fjx2,tx);
2484             fjy2             = _mm_add_ps(fjy2,ty);
2485             fjz2             = _mm_add_ps(fjz2,tz);
2486
2487             }
2488
2489             /**************************
2490              * CALCULATE INTERACTIONS *
2491              **************************/
2492
2493             if (gmx_mm_any_lt(rsq33,rcutoff2))
2494             {
2495
2496             r33              = _mm_mul_ps(rsq33,rinv33);
2497             r33              = _mm_andnot_ps(dummy_mask,r33);
2498
2499             /* EWALD ELECTROSTATICS */
2500
2501             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2502             ewrt             = _mm_mul_ps(r33,ewtabscale);
2503             ewitab           = _mm_cvttps_epi32(ewrt);
2504             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2505             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2506                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2507                                          &ewtabF,&ewtabFn);
2508             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2509             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2510
2511             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2512
2513             fscal            = felec;
2514
2515             fscal            = _mm_and_ps(fscal,cutoff_mask);
2516
2517             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2518
2519             /* Calculate temporary vectorial force */
2520             tx               = _mm_mul_ps(fscal,dx33);
2521             ty               = _mm_mul_ps(fscal,dy33);
2522             tz               = _mm_mul_ps(fscal,dz33);
2523
2524             /* Update vectorial force */
2525             fix3             = _mm_add_ps(fix3,tx);
2526             fiy3             = _mm_add_ps(fiy3,ty);
2527             fiz3             = _mm_add_ps(fiz3,tz);
2528
2529             fjx3             = _mm_add_ps(fjx3,tx);
2530             fjy3             = _mm_add_ps(fjy3,ty);
2531             fjz3             = _mm_add_ps(fjz3,tz);
2532
2533             }
2534
2535             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
2536                                                    f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
2537                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2538
2539             /* Inner loop uses 360 flops */
2540         }
2541
2542         /* End of innermost loop */
2543
2544         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2545                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2546
2547         /* Increment number of inner iterations */
2548         inneriter                  += j_index_end - j_index_start;
2549
2550         /* Outer loop uses 27 flops */
2551     }
2552
2553     /* Increment number of outer iterations */
2554     outeriter        += nri;
2555
2556     /* Update outer/inner flops */
2557
2558     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
2559 }