Use full path for legacyheaders
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            None
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset1;
84     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     int              vdwioffset2;
86     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     int              vdwioffset3;
88     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
90     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
92     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
94     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
98     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
101     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
102     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
103     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     __m128i          ewitab;
107     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108     real             *ewtab;
109     __m128           dummy_mask,cutoff_mask;
110     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111     __m128           one     = _mm_set1_ps(1.0);
112     __m128           two     = _mm_set1_ps(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm_set1_ps(fr->epsfac);
125     charge           = mdatoms->chargeA;
126
127     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
128     ewtab            = fr->ic->tabq_coul_FDV0;
129     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
130     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
137
138     jq1              = _mm_set1_ps(charge[inr+1]);
139     jq2              = _mm_set1_ps(charge[inr+2]);
140     jq3              = _mm_set1_ps(charge[inr+3]);
141     qq11             = _mm_mul_ps(iq1,jq1);
142     qq12             = _mm_mul_ps(iq1,jq2);
143     qq13             = _mm_mul_ps(iq1,jq3);
144     qq21             = _mm_mul_ps(iq2,jq1);
145     qq22             = _mm_mul_ps(iq2,jq2);
146     qq23             = _mm_mul_ps(iq2,jq3);
147     qq31             = _mm_mul_ps(iq3,jq1);
148     qq32             = _mm_mul_ps(iq3,jq2);
149     qq33             = _mm_mul_ps(iq3,jq3);
150
151     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
152     rcutoff_scalar   = fr->rcoulomb;
153     rcutoff          = _mm_set1_ps(rcutoff_scalar);
154     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
155
156     /* Avoid stupid compiler warnings */
157     jnrA = jnrB = jnrC = jnrD = 0;
158     j_coord_offsetA = 0;
159     j_coord_offsetB = 0;
160     j_coord_offsetC = 0;
161     j_coord_offsetD = 0;
162
163     outeriter        = 0;
164     inneriter        = 0;
165
166     for(iidx=0;iidx<4*DIM;iidx++)
167     {
168         scratch[iidx] = 0.0;
169     }  
170
171     /* Start outer loop over neighborlists */
172     for(iidx=0; iidx<nri; iidx++)
173     {
174         /* Load shift vector for this list */
175         i_shift_offset   = DIM*shiftidx[iidx];
176
177         /* Load limits for loop over neighbors */
178         j_index_start    = jindex[iidx];
179         j_index_end      = jindex[iidx+1];
180
181         /* Get outer coordinate index */
182         inr              = iinr[iidx];
183         i_coord_offset   = DIM*inr;
184
185         /* Load i particle coords and add shift vector */
186         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
187                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
188         
189         fix1             = _mm_setzero_ps();
190         fiy1             = _mm_setzero_ps();
191         fiz1             = _mm_setzero_ps();
192         fix2             = _mm_setzero_ps();
193         fiy2             = _mm_setzero_ps();
194         fiz2             = _mm_setzero_ps();
195         fix3             = _mm_setzero_ps();
196         fiy3             = _mm_setzero_ps();
197         fiz3             = _mm_setzero_ps();
198
199         /* Reset potential sums */
200         velecsum         = _mm_setzero_ps();
201
202         /* Start inner kernel loop */
203         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
204         {
205
206             /* Get j neighbor index, and coordinate index */
207             jnrA             = jjnr[jidx];
208             jnrB             = jjnr[jidx+1];
209             jnrC             = jjnr[jidx+2];
210             jnrD             = jjnr[jidx+3];
211             j_coord_offsetA  = DIM*jnrA;
212             j_coord_offsetB  = DIM*jnrB;
213             j_coord_offsetC  = DIM*jnrC;
214             j_coord_offsetD  = DIM*jnrD;
215
216             /* load j atom coordinates */
217             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
218                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
219                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
220
221             /* Calculate displacement vector */
222             dx11             = _mm_sub_ps(ix1,jx1);
223             dy11             = _mm_sub_ps(iy1,jy1);
224             dz11             = _mm_sub_ps(iz1,jz1);
225             dx12             = _mm_sub_ps(ix1,jx2);
226             dy12             = _mm_sub_ps(iy1,jy2);
227             dz12             = _mm_sub_ps(iz1,jz2);
228             dx13             = _mm_sub_ps(ix1,jx3);
229             dy13             = _mm_sub_ps(iy1,jy3);
230             dz13             = _mm_sub_ps(iz1,jz3);
231             dx21             = _mm_sub_ps(ix2,jx1);
232             dy21             = _mm_sub_ps(iy2,jy1);
233             dz21             = _mm_sub_ps(iz2,jz1);
234             dx22             = _mm_sub_ps(ix2,jx2);
235             dy22             = _mm_sub_ps(iy2,jy2);
236             dz22             = _mm_sub_ps(iz2,jz2);
237             dx23             = _mm_sub_ps(ix2,jx3);
238             dy23             = _mm_sub_ps(iy2,jy3);
239             dz23             = _mm_sub_ps(iz2,jz3);
240             dx31             = _mm_sub_ps(ix3,jx1);
241             dy31             = _mm_sub_ps(iy3,jy1);
242             dz31             = _mm_sub_ps(iz3,jz1);
243             dx32             = _mm_sub_ps(ix3,jx2);
244             dy32             = _mm_sub_ps(iy3,jy2);
245             dz32             = _mm_sub_ps(iz3,jz2);
246             dx33             = _mm_sub_ps(ix3,jx3);
247             dy33             = _mm_sub_ps(iy3,jy3);
248             dz33             = _mm_sub_ps(iz3,jz3);
249
250             /* Calculate squared distance and things based on it */
251             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
252             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
253             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
254             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
255             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
256             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
257             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
258             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
259             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
260
261             rinv11           = gmx_mm_invsqrt_ps(rsq11);
262             rinv12           = gmx_mm_invsqrt_ps(rsq12);
263             rinv13           = gmx_mm_invsqrt_ps(rsq13);
264             rinv21           = gmx_mm_invsqrt_ps(rsq21);
265             rinv22           = gmx_mm_invsqrt_ps(rsq22);
266             rinv23           = gmx_mm_invsqrt_ps(rsq23);
267             rinv31           = gmx_mm_invsqrt_ps(rsq31);
268             rinv32           = gmx_mm_invsqrt_ps(rsq32);
269             rinv33           = gmx_mm_invsqrt_ps(rsq33);
270
271             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
272             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
273             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
274             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
275             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
276             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
277             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
278             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
279             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
280
281             fjx1             = _mm_setzero_ps();
282             fjy1             = _mm_setzero_ps();
283             fjz1             = _mm_setzero_ps();
284             fjx2             = _mm_setzero_ps();
285             fjy2             = _mm_setzero_ps();
286             fjz2             = _mm_setzero_ps();
287             fjx3             = _mm_setzero_ps();
288             fjy3             = _mm_setzero_ps();
289             fjz3             = _mm_setzero_ps();
290
291             /**************************
292              * CALCULATE INTERACTIONS *
293              **************************/
294
295             if (gmx_mm_any_lt(rsq11,rcutoff2))
296             {
297
298             r11              = _mm_mul_ps(rsq11,rinv11);
299
300             /* EWALD ELECTROSTATICS */
301
302             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
303             ewrt             = _mm_mul_ps(r11,ewtabscale);
304             ewitab           = _mm_cvttps_epi32(ewrt);
305             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
306             ewitab           = _mm_slli_epi32(ewitab,2);
307             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
308             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
309             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
310             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
311             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
312             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
313             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
314             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
315             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
316
317             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
318
319             /* Update potential sum for this i atom from the interaction with this j atom. */
320             velec            = _mm_and_ps(velec,cutoff_mask);
321             velecsum         = _mm_add_ps(velecsum,velec);
322
323             fscal            = felec;
324
325             fscal            = _mm_and_ps(fscal,cutoff_mask);
326
327             /* Calculate temporary vectorial force */
328             tx               = _mm_mul_ps(fscal,dx11);
329             ty               = _mm_mul_ps(fscal,dy11);
330             tz               = _mm_mul_ps(fscal,dz11);
331
332             /* Update vectorial force */
333             fix1             = _mm_add_ps(fix1,tx);
334             fiy1             = _mm_add_ps(fiy1,ty);
335             fiz1             = _mm_add_ps(fiz1,tz);
336
337             fjx1             = _mm_add_ps(fjx1,tx);
338             fjy1             = _mm_add_ps(fjy1,ty);
339             fjz1             = _mm_add_ps(fjz1,tz);
340             
341             }
342
343             /**************************
344              * CALCULATE INTERACTIONS *
345              **************************/
346
347             if (gmx_mm_any_lt(rsq12,rcutoff2))
348             {
349
350             r12              = _mm_mul_ps(rsq12,rinv12);
351
352             /* EWALD ELECTROSTATICS */
353
354             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
355             ewrt             = _mm_mul_ps(r12,ewtabscale);
356             ewitab           = _mm_cvttps_epi32(ewrt);
357             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
358             ewitab           = _mm_slli_epi32(ewitab,2);
359             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
360             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
361             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
362             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
363             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
364             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
365             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
366             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
367             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
368
369             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
370
371             /* Update potential sum for this i atom from the interaction with this j atom. */
372             velec            = _mm_and_ps(velec,cutoff_mask);
373             velecsum         = _mm_add_ps(velecsum,velec);
374
375             fscal            = felec;
376
377             fscal            = _mm_and_ps(fscal,cutoff_mask);
378
379             /* Calculate temporary vectorial force */
380             tx               = _mm_mul_ps(fscal,dx12);
381             ty               = _mm_mul_ps(fscal,dy12);
382             tz               = _mm_mul_ps(fscal,dz12);
383
384             /* Update vectorial force */
385             fix1             = _mm_add_ps(fix1,tx);
386             fiy1             = _mm_add_ps(fiy1,ty);
387             fiz1             = _mm_add_ps(fiz1,tz);
388
389             fjx2             = _mm_add_ps(fjx2,tx);
390             fjy2             = _mm_add_ps(fjy2,ty);
391             fjz2             = _mm_add_ps(fjz2,tz);
392             
393             }
394
395             /**************************
396              * CALCULATE INTERACTIONS *
397              **************************/
398
399             if (gmx_mm_any_lt(rsq13,rcutoff2))
400             {
401
402             r13              = _mm_mul_ps(rsq13,rinv13);
403
404             /* EWALD ELECTROSTATICS */
405
406             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
407             ewrt             = _mm_mul_ps(r13,ewtabscale);
408             ewitab           = _mm_cvttps_epi32(ewrt);
409             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
410             ewitab           = _mm_slli_epi32(ewitab,2);
411             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
412             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
413             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
414             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
415             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
416             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
417             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
418             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
419             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
420
421             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
422
423             /* Update potential sum for this i atom from the interaction with this j atom. */
424             velec            = _mm_and_ps(velec,cutoff_mask);
425             velecsum         = _mm_add_ps(velecsum,velec);
426
427             fscal            = felec;
428
429             fscal            = _mm_and_ps(fscal,cutoff_mask);
430
431             /* Calculate temporary vectorial force */
432             tx               = _mm_mul_ps(fscal,dx13);
433             ty               = _mm_mul_ps(fscal,dy13);
434             tz               = _mm_mul_ps(fscal,dz13);
435
436             /* Update vectorial force */
437             fix1             = _mm_add_ps(fix1,tx);
438             fiy1             = _mm_add_ps(fiy1,ty);
439             fiz1             = _mm_add_ps(fiz1,tz);
440
441             fjx3             = _mm_add_ps(fjx3,tx);
442             fjy3             = _mm_add_ps(fjy3,ty);
443             fjz3             = _mm_add_ps(fjz3,tz);
444             
445             }
446
447             /**************************
448              * CALCULATE INTERACTIONS *
449              **************************/
450
451             if (gmx_mm_any_lt(rsq21,rcutoff2))
452             {
453
454             r21              = _mm_mul_ps(rsq21,rinv21);
455
456             /* EWALD ELECTROSTATICS */
457
458             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
459             ewrt             = _mm_mul_ps(r21,ewtabscale);
460             ewitab           = _mm_cvttps_epi32(ewrt);
461             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
462             ewitab           = _mm_slli_epi32(ewitab,2);
463             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
464             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
465             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
466             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
467             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
468             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
469             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
470             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
471             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
472
473             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
474
475             /* Update potential sum for this i atom from the interaction with this j atom. */
476             velec            = _mm_and_ps(velec,cutoff_mask);
477             velecsum         = _mm_add_ps(velecsum,velec);
478
479             fscal            = felec;
480
481             fscal            = _mm_and_ps(fscal,cutoff_mask);
482
483             /* Calculate temporary vectorial force */
484             tx               = _mm_mul_ps(fscal,dx21);
485             ty               = _mm_mul_ps(fscal,dy21);
486             tz               = _mm_mul_ps(fscal,dz21);
487
488             /* Update vectorial force */
489             fix2             = _mm_add_ps(fix2,tx);
490             fiy2             = _mm_add_ps(fiy2,ty);
491             fiz2             = _mm_add_ps(fiz2,tz);
492
493             fjx1             = _mm_add_ps(fjx1,tx);
494             fjy1             = _mm_add_ps(fjy1,ty);
495             fjz1             = _mm_add_ps(fjz1,tz);
496             
497             }
498
499             /**************************
500              * CALCULATE INTERACTIONS *
501              **************************/
502
503             if (gmx_mm_any_lt(rsq22,rcutoff2))
504             {
505
506             r22              = _mm_mul_ps(rsq22,rinv22);
507
508             /* EWALD ELECTROSTATICS */
509
510             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511             ewrt             = _mm_mul_ps(r22,ewtabscale);
512             ewitab           = _mm_cvttps_epi32(ewrt);
513             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
514             ewitab           = _mm_slli_epi32(ewitab,2);
515             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
518             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
519             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
520             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
521             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
522             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
523             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
524
525             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
526
527             /* Update potential sum for this i atom from the interaction with this j atom. */
528             velec            = _mm_and_ps(velec,cutoff_mask);
529             velecsum         = _mm_add_ps(velecsum,velec);
530
531             fscal            = felec;
532
533             fscal            = _mm_and_ps(fscal,cutoff_mask);
534
535             /* Calculate temporary vectorial force */
536             tx               = _mm_mul_ps(fscal,dx22);
537             ty               = _mm_mul_ps(fscal,dy22);
538             tz               = _mm_mul_ps(fscal,dz22);
539
540             /* Update vectorial force */
541             fix2             = _mm_add_ps(fix2,tx);
542             fiy2             = _mm_add_ps(fiy2,ty);
543             fiz2             = _mm_add_ps(fiz2,tz);
544
545             fjx2             = _mm_add_ps(fjx2,tx);
546             fjy2             = _mm_add_ps(fjy2,ty);
547             fjz2             = _mm_add_ps(fjz2,tz);
548             
549             }
550
551             /**************************
552              * CALCULATE INTERACTIONS *
553              **************************/
554
555             if (gmx_mm_any_lt(rsq23,rcutoff2))
556             {
557
558             r23              = _mm_mul_ps(rsq23,rinv23);
559
560             /* EWALD ELECTROSTATICS */
561
562             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563             ewrt             = _mm_mul_ps(r23,ewtabscale);
564             ewitab           = _mm_cvttps_epi32(ewrt);
565             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
566             ewitab           = _mm_slli_epi32(ewitab,2);
567             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
568             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
569             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
570             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
571             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
572             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
573             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
574             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
575             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
576
577             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
578
579             /* Update potential sum for this i atom from the interaction with this j atom. */
580             velec            = _mm_and_ps(velec,cutoff_mask);
581             velecsum         = _mm_add_ps(velecsum,velec);
582
583             fscal            = felec;
584
585             fscal            = _mm_and_ps(fscal,cutoff_mask);
586
587             /* Calculate temporary vectorial force */
588             tx               = _mm_mul_ps(fscal,dx23);
589             ty               = _mm_mul_ps(fscal,dy23);
590             tz               = _mm_mul_ps(fscal,dz23);
591
592             /* Update vectorial force */
593             fix2             = _mm_add_ps(fix2,tx);
594             fiy2             = _mm_add_ps(fiy2,ty);
595             fiz2             = _mm_add_ps(fiz2,tz);
596
597             fjx3             = _mm_add_ps(fjx3,tx);
598             fjy3             = _mm_add_ps(fjy3,ty);
599             fjz3             = _mm_add_ps(fjz3,tz);
600             
601             }
602
603             /**************************
604              * CALCULATE INTERACTIONS *
605              **************************/
606
607             if (gmx_mm_any_lt(rsq31,rcutoff2))
608             {
609
610             r31              = _mm_mul_ps(rsq31,rinv31);
611
612             /* EWALD ELECTROSTATICS */
613
614             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
615             ewrt             = _mm_mul_ps(r31,ewtabscale);
616             ewitab           = _mm_cvttps_epi32(ewrt);
617             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
618             ewitab           = _mm_slli_epi32(ewitab,2);
619             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
620             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
621             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
622             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
623             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
624             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
625             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
626             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
627             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
628
629             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
630
631             /* Update potential sum for this i atom from the interaction with this j atom. */
632             velec            = _mm_and_ps(velec,cutoff_mask);
633             velecsum         = _mm_add_ps(velecsum,velec);
634
635             fscal            = felec;
636
637             fscal            = _mm_and_ps(fscal,cutoff_mask);
638
639             /* Calculate temporary vectorial force */
640             tx               = _mm_mul_ps(fscal,dx31);
641             ty               = _mm_mul_ps(fscal,dy31);
642             tz               = _mm_mul_ps(fscal,dz31);
643
644             /* Update vectorial force */
645             fix3             = _mm_add_ps(fix3,tx);
646             fiy3             = _mm_add_ps(fiy3,ty);
647             fiz3             = _mm_add_ps(fiz3,tz);
648
649             fjx1             = _mm_add_ps(fjx1,tx);
650             fjy1             = _mm_add_ps(fjy1,ty);
651             fjz1             = _mm_add_ps(fjz1,tz);
652             
653             }
654
655             /**************************
656              * CALCULATE INTERACTIONS *
657              **************************/
658
659             if (gmx_mm_any_lt(rsq32,rcutoff2))
660             {
661
662             r32              = _mm_mul_ps(rsq32,rinv32);
663
664             /* EWALD ELECTROSTATICS */
665
666             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
667             ewrt             = _mm_mul_ps(r32,ewtabscale);
668             ewitab           = _mm_cvttps_epi32(ewrt);
669             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
670             ewitab           = _mm_slli_epi32(ewitab,2);
671             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
672             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
673             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
674             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
675             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
676             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
677             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
678             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
679             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
680
681             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
682
683             /* Update potential sum for this i atom from the interaction with this j atom. */
684             velec            = _mm_and_ps(velec,cutoff_mask);
685             velecsum         = _mm_add_ps(velecsum,velec);
686
687             fscal            = felec;
688
689             fscal            = _mm_and_ps(fscal,cutoff_mask);
690
691             /* Calculate temporary vectorial force */
692             tx               = _mm_mul_ps(fscal,dx32);
693             ty               = _mm_mul_ps(fscal,dy32);
694             tz               = _mm_mul_ps(fscal,dz32);
695
696             /* Update vectorial force */
697             fix3             = _mm_add_ps(fix3,tx);
698             fiy3             = _mm_add_ps(fiy3,ty);
699             fiz3             = _mm_add_ps(fiz3,tz);
700
701             fjx2             = _mm_add_ps(fjx2,tx);
702             fjy2             = _mm_add_ps(fjy2,ty);
703             fjz2             = _mm_add_ps(fjz2,tz);
704             
705             }
706
707             /**************************
708              * CALCULATE INTERACTIONS *
709              **************************/
710
711             if (gmx_mm_any_lt(rsq33,rcutoff2))
712             {
713
714             r33              = _mm_mul_ps(rsq33,rinv33);
715
716             /* EWALD ELECTROSTATICS */
717
718             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
719             ewrt             = _mm_mul_ps(r33,ewtabscale);
720             ewitab           = _mm_cvttps_epi32(ewrt);
721             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
722             ewitab           = _mm_slli_epi32(ewitab,2);
723             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
724             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
725             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
726             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
727             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
728             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
729             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
730             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
731             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
732
733             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
734
735             /* Update potential sum for this i atom from the interaction with this j atom. */
736             velec            = _mm_and_ps(velec,cutoff_mask);
737             velecsum         = _mm_add_ps(velecsum,velec);
738
739             fscal            = felec;
740
741             fscal            = _mm_and_ps(fscal,cutoff_mask);
742
743             /* Calculate temporary vectorial force */
744             tx               = _mm_mul_ps(fscal,dx33);
745             ty               = _mm_mul_ps(fscal,dy33);
746             tz               = _mm_mul_ps(fscal,dz33);
747
748             /* Update vectorial force */
749             fix3             = _mm_add_ps(fix3,tx);
750             fiy3             = _mm_add_ps(fiy3,ty);
751             fiz3             = _mm_add_ps(fiz3,tz);
752
753             fjx3             = _mm_add_ps(fjx3,tx);
754             fjy3             = _mm_add_ps(fjy3,ty);
755             fjz3             = _mm_add_ps(fjz3,tz);
756             
757             }
758
759             fjptrA             = f+j_coord_offsetA;
760             fjptrB             = f+j_coord_offsetB;
761             fjptrC             = f+j_coord_offsetC;
762             fjptrD             = f+j_coord_offsetD;
763
764             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
765                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
766
767             /* Inner loop uses 414 flops */
768         }
769
770         if(jidx<j_index_end)
771         {
772
773             /* Get j neighbor index, and coordinate index */
774             jnrlistA         = jjnr[jidx];
775             jnrlistB         = jjnr[jidx+1];
776             jnrlistC         = jjnr[jidx+2];
777             jnrlistD         = jjnr[jidx+3];
778             /* Sign of each element will be negative for non-real atoms.
779              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
780              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
781              */
782             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
783             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
784             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
785             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
786             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
787             j_coord_offsetA  = DIM*jnrA;
788             j_coord_offsetB  = DIM*jnrB;
789             j_coord_offsetC  = DIM*jnrC;
790             j_coord_offsetD  = DIM*jnrD;
791
792             /* load j atom coordinates */
793             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
794                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
795                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
796
797             /* Calculate displacement vector */
798             dx11             = _mm_sub_ps(ix1,jx1);
799             dy11             = _mm_sub_ps(iy1,jy1);
800             dz11             = _mm_sub_ps(iz1,jz1);
801             dx12             = _mm_sub_ps(ix1,jx2);
802             dy12             = _mm_sub_ps(iy1,jy2);
803             dz12             = _mm_sub_ps(iz1,jz2);
804             dx13             = _mm_sub_ps(ix1,jx3);
805             dy13             = _mm_sub_ps(iy1,jy3);
806             dz13             = _mm_sub_ps(iz1,jz3);
807             dx21             = _mm_sub_ps(ix2,jx1);
808             dy21             = _mm_sub_ps(iy2,jy1);
809             dz21             = _mm_sub_ps(iz2,jz1);
810             dx22             = _mm_sub_ps(ix2,jx2);
811             dy22             = _mm_sub_ps(iy2,jy2);
812             dz22             = _mm_sub_ps(iz2,jz2);
813             dx23             = _mm_sub_ps(ix2,jx3);
814             dy23             = _mm_sub_ps(iy2,jy3);
815             dz23             = _mm_sub_ps(iz2,jz3);
816             dx31             = _mm_sub_ps(ix3,jx1);
817             dy31             = _mm_sub_ps(iy3,jy1);
818             dz31             = _mm_sub_ps(iz3,jz1);
819             dx32             = _mm_sub_ps(ix3,jx2);
820             dy32             = _mm_sub_ps(iy3,jy2);
821             dz32             = _mm_sub_ps(iz3,jz2);
822             dx33             = _mm_sub_ps(ix3,jx3);
823             dy33             = _mm_sub_ps(iy3,jy3);
824             dz33             = _mm_sub_ps(iz3,jz3);
825
826             /* Calculate squared distance and things based on it */
827             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
828             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
829             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
830             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
831             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
832             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
833             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
834             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
835             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
836
837             rinv11           = gmx_mm_invsqrt_ps(rsq11);
838             rinv12           = gmx_mm_invsqrt_ps(rsq12);
839             rinv13           = gmx_mm_invsqrt_ps(rsq13);
840             rinv21           = gmx_mm_invsqrt_ps(rsq21);
841             rinv22           = gmx_mm_invsqrt_ps(rsq22);
842             rinv23           = gmx_mm_invsqrt_ps(rsq23);
843             rinv31           = gmx_mm_invsqrt_ps(rsq31);
844             rinv32           = gmx_mm_invsqrt_ps(rsq32);
845             rinv33           = gmx_mm_invsqrt_ps(rsq33);
846
847             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
848             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
849             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
850             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
851             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
852             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
853             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
854             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
855             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
856
857             fjx1             = _mm_setzero_ps();
858             fjy1             = _mm_setzero_ps();
859             fjz1             = _mm_setzero_ps();
860             fjx2             = _mm_setzero_ps();
861             fjy2             = _mm_setzero_ps();
862             fjz2             = _mm_setzero_ps();
863             fjx3             = _mm_setzero_ps();
864             fjy3             = _mm_setzero_ps();
865             fjz3             = _mm_setzero_ps();
866
867             /**************************
868              * CALCULATE INTERACTIONS *
869              **************************/
870
871             if (gmx_mm_any_lt(rsq11,rcutoff2))
872             {
873
874             r11              = _mm_mul_ps(rsq11,rinv11);
875             r11              = _mm_andnot_ps(dummy_mask,r11);
876
877             /* EWALD ELECTROSTATICS */
878
879             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
880             ewrt             = _mm_mul_ps(r11,ewtabscale);
881             ewitab           = _mm_cvttps_epi32(ewrt);
882             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
883             ewitab           = _mm_slli_epi32(ewitab,2);
884             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
885             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
886             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
887             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
888             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
889             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
890             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
891             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
892             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
893
894             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
895
896             /* Update potential sum for this i atom from the interaction with this j atom. */
897             velec            = _mm_and_ps(velec,cutoff_mask);
898             velec            = _mm_andnot_ps(dummy_mask,velec);
899             velecsum         = _mm_add_ps(velecsum,velec);
900
901             fscal            = felec;
902
903             fscal            = _mm_and_ps(fscal,cutoff_mask);
904
905             fscal            = _mm_andnot_ps(dummy_mask,fscal);
906
907             /* Calculate temporary vectorial force */
908             tx               = _mm_mul_ps(fscal,dx11);
909             ty               = _mm_mul_ps(fscal,dy11);
910             tz               = _mm_mul_ps(fscal,dz11);
911
912             /* Update vectorial force */
913             fix1             = _mm_add_ps(fix1,tx);
914             fiy1             = _mm_add_ps(fiy1,ty);
915             fiz1             = _mm_add_ps(fiz1,tz);
916
917             fjx1             = _mm_add_ps(fjx1,tx);
918             fjy1             = _mm_add_ps(fjy1,ty);
919             fjz1             = _mm_add_ps(fjz1,tz);
920             
921             }
922
923             /**************************
924              * CALCULATE INTERACTIONS *
925              **************************/
926
927             if (gmx_mm_any_lt(rsq12,rcutoff2))
928             {
929
930             r12              = _mm_mul_ps(rsq12,rinv12);
931             r12              = _mm_andnot_ps(dummy_mask,r12);
932
933             /* EWALD ELECTROSTATICS */
934
935             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
936             ewrt             = _mm_mul_ps(r12,ewtabscale);
937             ewitab           = _mm_cvttps_epi32(ewrt);
938             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
939             ewitab           = _mm_slli_epi32(ewitab,2);
940             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
941             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
942             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
943             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
944             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
945             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
946             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
947             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
948             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
949
950             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
951
952             /* Update potential sum for this i atom from the interaction with this j atom. */
953             velec            = _mm_and_ps(velec,cutoff_mask);
954             velec            = _mm_andnot_ps(dummy_mask,velec);
955             velecsum         = _mm_add_ps(velecsum,velec);
956
957             fscal            = felec;
958
959             fscal            = _mm_and_ps(fscal,cutoff_mask);
960
961             fscal            = _mm_andnot_ps(dummy_mask,fscal);
962
963             /* Calculate temporary vectorial force */
964             tx               = _mm_mul_ps(fscal,dx12);
965             ty               = _mm_mul_ps(fscal,dy12);
966             tz               = _mm_mul_ps(fscal,dz12);
967
968             /* Update vectorial force */
969             fix1             = _mm_add_ps(fix1,tx);
970             fiy1             = _mm_add_ps(fiy1,ty);
971             fiz1             = _mm_add_ps(fiz1,tz);
972
973             fjx2             = _mm_add_ps(fjx2,tx);
974             fjy2             = _mm_add_ps(fjy2,ty);
975             fjz2             = _mm_add_ps(fjz2,tz);
976             
977             }
978
979             /**************************
980              * CALCULATE INTERACTIONS *
981              **************************/
982
983             if (gmx_mm_any_lt(rsq13,rcutoff2))
984             {
985
986             r13              = _mm_mul_ps(rsq13,rinv13);
987             r13              = _mm_andnot_ps(dummy_mask,r13);
988
989             /* EWALD ELECTROSTATICS */
990
991             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
992             ewrt             = _mm_mul_ps(r13,ewtabscale);
993             ewitab           = _mm_cvttps_epi32(ewrt);
994             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
995             ewitab           = _mm_slli_epi32(ewitab,2);
996             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
997             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
998             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
999             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1000             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1001             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1002             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1003             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
1004             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1005
1006             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1007
1008             /* Update potential sum for this i atom from the interaction with this j atom. */
1009             velec            = _mm_and_ps(velec,cutoff_mask);
1010             velec            = _mm_andnot_ps(dummy_mask,velec);
1011             velecsum         = _mm_add_ps(velecsum,velec);
1012
1013             fscal            = felec;
1014
1015             fscal            = _mm_and_ps(fscal,cutoff_mask);
1016
1017             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1018
1019             /* Calculate temporary vectorial force */
1020             tx               = _mm_mul_ps(fscal,dx13);
1021             ty               = _mm_mul_ps(fscal,dy13);
1022             tz               = _mm_mul_ps(fscal,dz13);
1023
1024             /* Update vectorial force */
1025             fix1             = _mm_add_ps(fix1,tx);
1026             fiy1             = _mm_add_ps(fiy1,ty);
1027             fiz1             = _mm_add_ps(fiz1,tz);
1028
1029             fjx3             = _mm_add_ps(fjx3,tx);
1030             fjy3             = _mm_add_ps(fjy3,ty);
1031             fjz3             = _mm_add_ps(fjz3,tz);
1032             
1033             }
1034
1035             /**************************
1036              * CALCULATE INTERACTIONS *
1037              **************************/
1038
1039             if (gmx_mm_any_lt(rsq21,rcutoff2))
1040             {
1041
1042             r21              = _mm_mul_ps(rsq21,rinv21);
1043             r21              = _mm_andnot_ps(dummy_mask,r21);
1044
1045             /* EWALD ELECTROSTATICS */
1046
1047             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1048             ewrt             = _mm_mul_ps(r21,ewtabscale);
1049             ewitab           = _mm_cvttps_epi32(ewrt);
1050             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1051             ewitab           = _mm_slli_epi32(ewitab,2);
1052             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1053             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1054             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1055             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1056             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1057             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1058             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1059             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1060             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1061
1062             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1063
1064             /* Update potential sum for this i atom from the interaction with this j atom. */
1065             velec            = _mm_and_ps(velec,cutoff_mask);
1066             velec            = _mm_andnot_ps(dummy_mask,velec);
1067             velecsum         = _mm_add_ps(velecsum,velec);
1068
1069             fscal            = felec;
1070
1071             fscal            = _mm_and_ps(fscal,cutoff_mask);
1072
1073             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1074
1075             /* Calculate temporary vectorial force */
1076             tx               = _mm_mul_ps(fscal,dx21);
1077             ty               = _mm_mul_ps(fscal,dy21);
1078             tz               = _mm_mul_ps(fscal,dz21);
1079
1080             /* Update vectorial force */
1081             fix2             = _mm_add_ps(fix2,tx);
1082             fiy2             = _mm_add_ps(fiy2,ty);
1083             fiz2             = _mm_add_ps(fiz2,tz);
1084
1085             fjx1             = _mm_add_ps(fjx1,tx);
1086             fjy1             = _mm_add_ps(fjy1,ty);
1087             fjz1             = _mm_add_ps(fjz1,tz);
1088             
1089             }
1090
1091             /**************************
1092              * CALCULATE INTERACTIONS *
1093              **************************/
1094
1095             if (gmx_mm_any_lt(rsq22,rcutoff2))
1096             {
1097
1098             r22              = _mm_mul_ps(rsq22,rinv22);
1099             r22              = _mm_andnot_ps(dummy_mask,r22);
1100
1101             /* EWALD ELECTROSTATICS */
1102
1103             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104             ewrt             = _mm_mul_ps(r22,ewtabscale);
1105             ewitab           = _mm_cvttps_epi32(ewrt);
1106             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1107             ewitab           = _mm_slli_epi32(ewitab,2);
1108             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1109             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1110             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1111             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1112             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1113             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1114             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1115             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1116             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1117
1118             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1119
1120             /* Update potential sum for this i atom from the interaction with this j atom. */
1121             velec            = _mm_and_ps(velec,cutoff_mask);
1122             velec            = _mm_andnot_ps(dummy_mask,velec);
1123             velecsum         = _mm_add_ps(velecsum,velec);
1124
1125             fscal            = felec;
1126
1127             fscal            = _mm_and_ps(fscal,cutoff_mask);
1128
1129             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1130
1131             /* Calculate temporary vectorial force */
1132             tx               = _mm_mul_ps(fscal,dx22);
1133             ty               = _mm_mul_ps(fscal,dy22);
1134             tz               = _mm_mul_ps(fscal,dz22);
1135
1136             /* Update vectorial force */
1137             fix2             = _mm_add_ps(fix2,tx);
1138             fiy2             = _mm_add_ps(fiy2,ty);
1139             fiz2             = _mm_add_ps(fiz2,tz);
1140
1141             fjx2             = _mm_add_ps(fjx2,tx);
1142             fjy2             = _mm_add_ps(fjy2,ty);
1143             fjz2             = _mm_add_ps(fjz2,tz);
1144             
1145             }
1146
1147             /**************************
1148              * CALCULATE INTERACTIONS *
1149              **************************/
1150
1151             if (gmx_mm_any_lt(rsq23,rcutoff2))
1152             {
1153
1154             r23              = _mm_mul_ps(rsq23,rinv23);
1155             r23              = _mm_andnot_ps(dummy_mask,r23);
1156
1157             /* EWALD ELECTROSTATICS */
1158
1159             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1160             ewrt             = _mm_mul_ps(r23,ewtabscale);
1161             ewitab           = _mm_cvttps_epi32(ewrt);
1162             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1163             ewitab           = _mm_slli_epi32(ewitab,2);
1164             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1165             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1166             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1167             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1168             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1169             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1170             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1171             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1172             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1173
1174             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1175
1176             /* Update potential sum for this i atom from the interaction with this j atom. */
1177             velec            = _mm_and_ps(velec,cutoff_mask);
1178             velec            = _mm_andnot_ps(dummy_mask,velec);
1179             velecsum         = _mm_add_ps(velecsum,velec);
1180
1181             fscal            = felec;
1182
1183             fscal            = _mm_and_ps(fscal,cutoff_mask);
1184
1185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1186
1187             /* Calculate temporary vectorial force */
1188             tx               = _mm_mul_ps(fscal,dx23);
1189             ty               = _mm_mul_ps(fscal,dy23);
1190             tz               = _mm_mul_ps(fscal,dz23);
1191
1192             /* Update vectorial force */
1193             fix2             = _mm_add_ps(fix2,tx);
1194             fiy2             = _mm_add_ps(fiy2,ty);
1195             fiz2             = _mm_add_ps(fiz2,tz);
1196
1197             fjx3             = _mm_add_ps(fjx3,tx);
1198             fjy3             = _mm_add_ps(fjy3,ty);
1199             fjz3             = _mm_add_ps(fjz3,tz);
1200             
1201             }
1202
1203             /**************************
1204              * CALCULATE INTERACTIONS *
1205              **************************/
1206
1207             if (gmx_mm_any_lt(rsq31,rcutoff2))
1208             {
1209
1210             r31              = _mm_mul_ps(rsq31,rinv31);
1211             r31              = _mm_andnot_ps(dummy_mask,r31);
1212
1213             /* EWALD ELECTROSTATICS */
1214
1215             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1216             ewrt             = _mm_mul_ps(r31,ewtabscale);
1217             ewitab           = _mm_cvttps_epi32(ewrt);
1218             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1219             ewitab           = _mm_slli_epi32(ewitab,2);
1220             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1221             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1222             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1223             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1224             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1225             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1226             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1227             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1228             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1229
1230             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1231
1232             /* Update potential sum for this i atom from the interaction with this j atom. */
1233             velec            = _mm_and_ps(velec,cutoff_mask);
1234             velec            = _mm_andnot_ps(dummy_mask,velec);
1235             velecsum         = _mm_add_ps(velecsum,velec);
1236
1237             fscal            = felec;
1238
1239             fscal            = _mm_and_ps(fscal,cutoff_mask);
1240
1241             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1242
1243             /* Calculate temporary vectorial force */
1244             tx               = _mm_mul_ps(fscal,dx31);
1245             ty               = _mm_mul_ps(fscal,dy31);
1246             tz               = _mm_mul_ps(fscal,dz31);
1247
1248             /* Update vectorial force */
1249             fix3             = _mm_add_ps(fix3,tx);
1250             fiy3             = _mm_add_ps(fiy3,ty);
1251             fiz3             = _mm_add_ps(fiz3,tz);
1252
1253             fjx1             = _mm_add_ps(fjx1,tx);
1254             fjy1             = _mm_add_ps(fjy1,ty);
1255             fjz1             = _mm_add_ps(fjz1,tz);
1256             
1257             }
1258
1259             /**************************
1260              * CALCULATE INTERACTIONS *
1261              **************************/
1262
1263             if (gmx_mm_any_lt(rsq32,rcutoff2))
1264             {
1265
1266             r32              = _mm_mul_ps(rsq32,rinv32);
1267             r32              = _mm_andnot_ps(dummy_mask,r32);
1268
1269             /* EWALD ELECTROSTATICS */
1270
1271             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1272             ewrt             = _mm_mul_ps(r32,ewtabscale);
1273             ewitab           = _mm_cvttps_epi32(ewrt);
1274             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1275             ewitab           = _mm_slli_epi32(ewitab,2);
1276             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1277             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1278             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1279             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1280             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1281             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1282             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1283             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1284             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1285
1286             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1287
1288             /* Update potential sum for this i atom from the interaction with this j atom. */
1289             velec            = _mm_and_ps(velec,cutoff_mask);
1290             velec            = _mm_andnot_ps(dummy_mask,velec);
1291             velecsum         = _mm_add_ps(velecsum,velec);
1292
1293             fscal            = felec;
1294
1295             fscal            = _mm_and_ps(fscal,cutoff_mask);
1296
1297             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1298
1299             /* Calculate temporary vectorial force */
1300             tx               = _mm_mul_ps(fscal,dx32);
1301             ty               = _mm_mul_ps(fscal,dy32);
1302             tz               = _mm_mul_ps(fscal,dz32);
1303
1304             /* Update vectorial force */
1305             fix3             = _mm_add_ps(fix3,tx);
1306             fiy3             = _mm_add_ps(fiy3,ty);
1307             fiz3             = _mm_add_ps(fiz3,tz);
1308
1309             fjx2             = _mm_add_ps(fjx2,tx);
1310             fjy2             = _mm_add_ps(fjy2,ty);
1311             fjz2             = _mm_add_ps(fjz2,tz);
1312             
1313             }
1314
1315             /**************************
1316              * CALCULATE INTERACTIONS *
1317              **************************/
1318
1319             if (gmx_mm_any_lt(rsq33,rcutoff2))
1320             {
1321
1322             r33              = _mm_mul_ps(rsq33,rinv33);
1323             r33              = _mm_andnot_ps(dummy_mask,r33);
1324
1325             /* EWALD ELECTROSTATICS */
1326
1327             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1328             ewrt             = _mm_mul_ps(r33,ewtabscale);
1329             ewitab           = _mm_cvttps_epi32(ewrt);
1330             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1331             ewitab           = _mm_slli_epi32(ewitab,2);
1332             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1333             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1334             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1335             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1336             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1337             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1338             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1339             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1340             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1341
1342             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1343
1344             /* Update potential sum for this i atom from the interaction with this j atom. */
1345             velec            = _mm_and_ps(velec,cutoff_mask);
1346             velec            = _mm_andnot_ps(dummy_mask,velec);
1347             velecsum         = _mm_add_ps(velecsum,velec);
1348
1349             fscal            = felec;
1350
1351             fscal            = _mm_and_ps(fscal,cutoff_mask);
1352
1353             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1354
1355             /* Calculate temporary vectorial force */
1356             tx               = _mm_mul_ps(fscal,dx33);
1357             ty               = _mm_mul_ps(fscal,dy33);
1358             tz               = _mm_mul_ps(fscal,dz33);
1359
1360             /* Update vectorial force */
1361             fix3             = _mm_add_ps(fix3,tx);
1362             fiy3             = _mm_add_ps(fiy3,ty);
1363             fiz3             = _mm_add_ps(fiz3,tz);
1364
1365             fjx3             = _mm_add_ps(fjx3,tx);
1366             fjy3             = _mm_add_ps(fjy3,ty);
1367             fjz3             = _mm_add_ps(fjz3,tz);
1368             
1369             }
1370
1371             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1372             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1373             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1374             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1375
1376             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1377                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1378
1379             /* Inner loop uses 423 flops */
1380         }
1381
1382         /* End of innermost loop */
1383
1384         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1385                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1386
1387         ggid                        = gid[iidx];
1388         /* Update potential energies */
1389         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1390
1391         /* Increment number of inner iterations */
1392         inneriter                  += j_index_end - j_index_start;
1393
1394         /* Outer loop uses 19 flops */
1395     }
1396
1397     /* Increment number of outer iterations */
1398     outeriter        += nri;
1399
1400     /* Update outer/inner flops */
1401
1402     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*423);
1403 }
1404 /*
1405  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
1406  * Electrostatics interaction: Ewald
1407  * VdW interaction:            None
1408  * Geometry:                   Water4-Water4
1409  * Calculate force/pot:        Force
1410  */
1411 void
1412 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
1413                     (t_nblist                    * gmx_restrict       nlist,
1414                      rvec                        * gmx_restrict          xx,
1415                      rvec                        * gmx_restrict          ff,
1416                      t_forcerec                  * gmx_restrict          fr,
1417                      t_mdatoms                   * gmx_restrict     mdatoms,
1418                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1419                      t_nrnb                      * gmx_restrict        nrnb)
1420 {
1421     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1422      * just 0 for non-waters.
1423      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1424      * jnr indices corresponding to data put in the four positions in the SIMD register.
1425      */
1426     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1427     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1428     int              jnrA,jnrB,jnrC,jnrD;
1429     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1430     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1431     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1432     real             rcutoff_scalar;
1433     real             *shiftvec,*fshift,*x,*f;
1434     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1435     real             scratch[4*DIM];
1436     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1437     int              vdwioffset1;
1438     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1439     int              vdwioffset2;
1440     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1441     int              vdwioffset3;
1442     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1443     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1444     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1445     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1446     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1447     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1448     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1449     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1450     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1451     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1452     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1453     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1454     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1455     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1456     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1457     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1458     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1459     real             *charge;
1460     __m128i          ewitab;
1461     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1462     real             *ewtab;
1463     __m128           dummy_mask,cutoff_mask;
1464     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1465     __m128           one     = _mm_set1_ps(1.0);
1466     __m128           two     = _mm_set1_ps(2.0);
1467     x                = xx[0];
1468     f                = ff[0];
1469
1470     nri              = nlist->nri;
1471     iinr             = nlist->iinr;
1472     jindex           = nlist->jindex;
1473     jjnr             = nlist->jjnr;
1474     shiftidx         = nlist->shift;
1475     gid              = nlist->gid;
1476     shiftvec         = fr->shift_vec[0];
1477     fshift           = fr->fshift[0];
1478     facel            = _mm_set1_ps(fr->epsfac);
1479     charge           = mdatoms->chargeA;
1480
1481     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1482     ewtab            = fr->ic->tabq_coul_F;
1483     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1484     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1485
1486     /* Setup water-specific parameters */
1487     inr              = nlist->iinr[0];
1488     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1489     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1490     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1491
1492     jq1              = _mm_set1_ps(charge[inr+1]);
1493     jq2              = _mm_set1_ps(charge[inr+2]);
1494     jq3              = _mm_set1_ps(charge[inr+3]);
1495     qq11             = _mm_mul_ps(iq1,jq1);
1496     qq12             = _mm_mul_ps(iq1,jq2);
1497     qq13             = _mm_mul_ps(iq1,jq3);
1498     qq21             = _mm_mul_ps(iq2,jq1);
1499     qq22             = _mm_mul_ps(iq2,jq2);
1500     qq23             = _mm_mul_ps(iq2,jq3);
1501     qq31             = _mm_mul_ps(iq3,jq1);
1502     qq32             = _mm_mul_ps(iq3,jq2);
1503     qq33             = _mm_mul_ps(iq3,jq3);
1504
1505     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1506     rcutoff_scalar   = fr->rcoulomb;
1507     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1508     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1509
1510     /* Avoid stupid compiler warnings */
1511     jnrA = jnrB = jnrC = jnrD = 0;
1512     j_coord_offsetA = 0;
1513     j_coord_offsetB = 0;
1514     j_coord_offsetC = 0;
1515     j_coord_offsetD = 0;
1516
1517     outeriter        = 0;
1518     inneriter        = 0;
1519
1520     for(iidx=0;iidx<4*DIM;iidx++)
1521     {
1522         scratch[iidx] = 0.0;
1523     }  
1524
1525     /* Start outer loop over neighborlists */
1526     for(iidx=0; iidx<nri; iidx++)
1527     {
1528         /* Load shift vector for this list */
1529         i_shift_offset   = DIM*shiftidx[iidx];
1530
1531         /* Load limits for loop over neighbors */
1532         j_index_start    = jindex[iidx];
1533         j_index_end      = jindex[iidx+1];
1534
1535         /* Get outer coordinate index */
1536         inr              = iinr[iidx];
1537         i_coord_offset   = DIM*inr;
1538
1539         /* Load i particle coords and add shift vector */
1540         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1541                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1542         
1543         fix1             = _mm_setzero_ps();
1544         fiy1             = _mm_setzero_ps();
1545         fiz1             = _mm_setzero_ps();
1546         fix2             = _mm_setzero_ps();
1547         fiy2             = _mm_setzero_ps();
1548         fiz2             = _mm_setzero_ps();
1549         fix3             = _mm_setzero_ps();
1550         fiy3             = _mm_setzero_ps();
1551         fiz3             = _mm_setzero_ps();
1552
1553         /* Start inner kernel loop */
1554         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1555         {
1556
1557             /* Get j neighbor index, and coordinate index */
1558             jnrA             = jjnr[jidx];
1559             jnrB             = jjnr[jidx+1];
1560             jnrC             = jjnr[jidx+2];
1561             jnrD             = jjnr[jidx+3];
1562             j_coord_offsetA  = DIM*jnrA;
1563             j_coord_offsetB  = DIM*jnrB;
1564             j_coord_offsetC  = DIM*jnrC;
1565             j_coord_offsetD  = DIM*jnrD;
1566
1567             /* load j atom coordinates */
1568             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1569                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1570                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1571
1572             /* Calculate displacement vector */
1573             dx11             = _mm_sub_ps(ix1,jx1);
1574             dy11             = _mm_sub_ps(iy1,jy1);
1575             dz11             = _mm_sub_ps(iz1,jz1);
1576             dx12             = _mm_sub_ps(ix1,jx2);
1577             dy12             = _mm_sub_ps(iy1,jy2);
1578             dz12             = _mm_sub_ps(iz1,jz2);
1579             dx13             = _mm_sub_ps(ix1,jx3);
1580             dy13             = _mm_sub_ps(iy1,jy3);
1581             dz13             = _mm_sub_ps(iz1,jz3);
1582             dx21             = _mm_sub_ps(ix2,jx1);
1583             dy21             = _mm_sub_ps(iy2,jy1);
1584             dz21             = _mm_sub_ps(iz2,jz1);
1585             dx22             = _mm_sub_ps(ix2,jx2);
1586             dy22             = _mm_sub_ps(iy2,jy2);
1587             dz22             = _mm_sub_ps(iz2,jz2);
1588             dx23             = _mm_sub_ps(ix2,jx3);
1589             dy23             = _mm_sub_ps(iy2,jy3);
1590             dz23             = _mm_sub_ps(iz2,jz3);
1591             dx31             = _mm_sub_ps(ix3,jx1);
1592             dy31             = _mm_sub_ps(iy3,jy1);
1593             dz31             = _mm_sub_ps(iz3,jz1);
1594             dx32             = _mm_sub_ps(ix3,jx2);
1595             dy32             = _mm_sub_ps(iy3,jy2);
1596             dz32             = _mm_sub_ps(iz3,jz2);
1597             dx33             = _mm_sub_ps(ix3,jx3);
1598             dy33             = _mm_sub_ps(iy3,jy3);
1599             dz33             = _mm_sub_ps(iz3,jz3);
1600
1601             /* Calculate squared distance and things based on it */
1602             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1603             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1604             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1605             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1606             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1607             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1608             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1609             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1610             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1611
1612             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1613             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1614             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1615             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1616             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1617             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1618             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1619             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1620             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1621
1622             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1623             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1624             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1625             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1626             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1627             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1628             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1629             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1630             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1631
1632             fjx1             = _mm_setzero_ps();
1633             fjy1             = _mm_setzero_ps();
1634             fjz1             = _mm_setzero_ps();
1635             fjx2             = _mm_setzero_ps();
1636             fjy2             = _mm_setzero_ps();
1637             fjz2             = _mm_setzero_ps();
1638             fjx3             = _mm_setzero_ps();
1639             fjy3             = _mm_setzero_ps();
1640             fjz3             = _mm_setzero_ps();
1641
1642             /**************************
1643              * CALCULATE INTERACTIONS *
1644              **************************/
1645
1646             if (gmx_mm_any_lt(rsq11,rcutoff2))
1647             {
1648
1649             r11              = _mm_mul_ps(rsq11,rinv11);
1650
1651             /* EWALD ELECTROSTATICS */
1652
1653             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1654             ewrt             = _mm_mul_ps(r11,ewtabscale);
1655             ewitab           = _mm_cvttps_epi32(ewrt);
1656             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1657             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1658                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1659                                          &ewtabF,&ewtabFn);
1660             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1661             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1662
1663             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1664
1665             fscal            = felec;
1666
1667             fscal            = _mm_and_ps(fscal,cutoff_mask);
1668
1669             /* Calculate temporary vectorial force */
1670             tx               = _mm_mul_ps(fscal,dx11);
1671             ty               = _mm_mul_ps(fscal,dy11);
1672             tz               = _mm_mul_ps(fscal,dz11);
1673
1674             /* Update vectorial force */
1675             fix1             = _mm_add_ps(fix1,tx);
1676             fiy1             = _mm_add_ps(fiy1,ty);
1677             fiz1             = _mm_add_ps(fiz1,tz);
1678
1679             fjx1             = _mm_add_ps(fjx1,tx);
1680             fjy1             = _mm_add_ps(fjy1,ty);
1681             fjz1             = _mm_add_ps(fjz1,tz);
1682             
1683             }
1684
1685             /**************************
1686              * CALCULATE INTERACTIONS *
1687              **************************/
1688
1689             if (gmx_mm_any_lt(rsq12,rcutoff2))
1690             {
1691
1692             r12              = _mm_mul_ps(rsq12,rinv12);
1693
1694             /* EWALD ELECTROSTATICS */
1695
1696             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1697             ewrt             = _mm_mul_ps(r12,ewtabscale);
1698             ewitab           = _mm_cvttps_epi32(ewrt);
1699             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1700             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1701                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1702                                          &ewtabF,&ewtabFn);
1703             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1704             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1705
1706             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1707
1708             fscal            = felec;
1709
1710             fscal            = _mm_and_ps(fscal,cutoff_mask);
1711
1712             /* Calculate temporary vectorial force */
1713             tx               = _mm_mul_ps(fscal,dx12);
1714             ty               = _mm_mul_ps(fscal,dy12);
1715             tz               = _mm_mul_ps(fscal,dz12);
1716
1717             /* Update vectorial force */
1718             fix1             = _mm_add_ps(fix1,tx);
1719             fiy1             = _mm_add_ps(fiy1,ty);
1720             fiz1             = _mm_add_ps(fiz1,tz);
1721
1722             fjx2             = _mm_add_ps(fjx2,tx);
1723             fjy2             = _mm_add_ps(fjy2,ty);
1724             fjz2             = _mm_add_ps(fjz2,tz);
1725             
1726             }
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             if (gmx_mm_any_lt(rsq13,rcutoff2))
1733             {
1734
1735             r13              = _mm_mul_ps(rsq13,rinv13);
1736
1737             /* EWALD ELECTROSTATICS */
1738
1739             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1740             ewrt             = _mm_mul_ps(r13,ewtabscale);
1741             ewitab           = _mm_cvttps_epi32(ewrt);
1742             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1743             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1744                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1745                                          &ewtabF,&ewtabFn);
1746             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1747             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1748
1749             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1750
1751             fscal            = felec;
1752
1753             fscal            = _mm_and_ps(fscal,cutoff_mask);
1754
1755             /* Calculate temporary vectorial force */
1756             tx               = _mm_mul_ps(fscal,dx13);
1757             ty               = _mm_mul_ps(fscal,dy13);
1758             tz               = _mm_mul_ps(fscal,dz13);
1759
1760             /* Update vectorial force */
1761             fix1             = _mm_add_ps(fix1,tx);
1762             fiy1             = _mm_add_ps(fiy1,ty);
1763             fiz1             = _mm_add_ps(fiz1,tz);
1764
1765             fjx3             = _mm_add_ps(fjx3,tx);
1766             fjy3             = _mm_add_ps(fjy3,ty);
1767             fjz3             = _mm_add_ps(fjz3,tz);
1768             
1769             }
1770
1771             /**************************
1772              * CALCULATE INTERACTIONS *
1773              **************************/
1774
1775             if (gmx_mm_any_lt(rsq21,rcutoff2))
1776             {
1777
1778             r21              = _mm_mul_ps(rsq21,rinv21);
1779
1780             /* EWALD ELECTROSTATICS */
1781
1782             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1783             ewrt             = _mm_mul_ps(r21,ewtabscale);
1784             ewitab           = _mm_cvttps_epi32(ewrt);
1785             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1786             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1787                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1788                                          &ewtabF,&ewtabFn);
1789             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1790             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1791
1792             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1793
1794             fscal            = felec;
1795
1796             fscal            = _mm_and_ps(fscal,cutoff_mask);
1797
1798             /* Calculate temporary vectorial force */
1799             tx               = _mm_mul_ps(fscal,dx21);
1800             ty               = _mm_mul_ps(fscal,dy21);
1801             tz               = _mm_mul_ps(fscal,dz21);
1802
1803             /* Update vectorial force */
1804             fix2             = _mm_add_ps(fix2,tx);
1805             fiy2             = _mm_add_ps(fiy2,ty);
1806             fiz2             = _mm_add_ps(fiz2,tz);
1807
1808             fjx1             = _mm_add_ps(fjx1,tx);
1809             fjy1             = _mm_add_ps(fjy1,ty);
1810             fjz1             = _mm_add_ps(fjz1,tz);
1811             
1812             }
1813
1814             /**************************
1815              * CALCULATE INTERACTIONS *
1816              **************************/
1817
1818             if (gmx_mm_any_lt(rsq22,rcutoff2))
1819             {
1820
1821             r22              = _mm_mul_ps(rsq22,rinv22);
1822
1823             /* EWALD ELECTROSTATICS */
1824
1825             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1826             ewrt             = _mm_mul_ps(r22,ewtabscale);
1827             ewitab           = _mm_cvttps_epi32(ewrt);
1828             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1829             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1830                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1831                                          &ewtabF,&ewtabFn);
1832             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1833             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1834
1835             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1836
1837             fscal            = felec;
1838
1839             fscal            = _mm_and_ps(fscal,cutoff_mask);
1840
1841             /* Calculate temporary vectorial force */
1842             tx               = _mm_mul_ps(fscal,dx22);
1843             ty               = _mm_mul_ps(fscal,dy22);
1844             tz               = _mm_mul_ps(fscal,dz22);
1845
1846             /* Update vectorial force */
1847             fix2             = _mm_add_ps(fix2,tx);
1848             fiy2             = _mm_add_ps(fiy2,ty);
1849             fiz2             = _mm_add_ps(fiz2,tz);
1850
1851             fjx2             = _mm_add_ps(fjx2,tx);
1852             fjy2             = _mm_add_ps(fjy2,ty);
1853             fjz2             = _mm_add_ps(fjz2,tz);
1854             
1855             }
1856
1857             /**************************
1858              * CALCULATE INTERACTIONS *
1859              **************************/
1860
1861             if (gmx_mm_any_lt(rsq23,rcutoff2))
1862             {
1863
1864             r23              = _mm_mul_ps(rsq23,rinv23);
1865
1866             /* EWALD ELECTROSTATICS */
1867
1868             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1869             ewrt             = _mm_mul_ps(r23,ewtabscale);
1870             ewitab           = _mm_cvttps_epi32(ewrt);
1871             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1872             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1873                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1874                                          &ewtabF,&ewtabFn);
1875             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1876             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1877
1878             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1879
1880             fscal            = felec;
1881
1882             fscal            = _mm_and_ps(fscal,cutoff_mask);
1883
1884             /* Calculate temporary vectorial force */
1885             tx               = _mm_mul_ps(fscal,dx23);
1886             ty               = _mm_mul_ps(fscal,dy23);
1887             tz               = _mm_mul_ps(fscal,dz23);
1888
1889             /* Update vectorial force */
1890             fix2             = _mm_add_ps(fix2,tx);
1891             fiy2             = _mm_add_ps(fiy2,ty);
1892             fiz2             = _mm_add_ps(fiz2,tz);
1893
1894             fjx3             = _mm_add_ps(fjx3,tx);
1895             fjy3             = _mm_add_ps(fjy3,ty);
1896             fjz3             = _mm_add_ps(fjz3,tz);
1897             
1898             }
1899
1900             /**************************
1901              * CALCULATE INTERACTIONS *
1902              **************************/
1903
1904             if (gmx_mm_any_lt(rsq31,rcutoff2))
1905             {
1906
1907             r31              = _mm_mul_ps(rsq31,rinv31);
1908
1909             /* EWALD ELECTROSTATICS */
1910
1911             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1912             ewrt             = _mm_mul_ps(r31,ewtabscale);
1913             ewitab           = _mm_cvttps_epi32(ewrt);
1914             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1915             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1916                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1917                                          &ewtabF,&ewtabFn);
1918             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1919             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1920
1921             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1922
1923             fscal            = felec;
1924
1925             fscal            = _mm_and_ps(fscal,cutoff_mask);
1926
1927             /* Calculate temporary vectorial force */
1928             tx               = _mm_mul_ps(fscal,dx31);
1929             ty               = _mm_mul_ps(fscal,dy31);
1930             tz               = _mm_mul_ps(fscal,dz31);
1931
1932             /* Update vectorial force */
1933             fix3             = _mm_add_ps(fix3,tx);
1934             fiy3             = _mm_add_ps(fiy3,ty);
1935             fiz3             = _mm_add_ps(fiz3,tz);
1936
1937             fjx1             = _mm_add_ps(fjx1,tx);
1938             fjy1             = _mm_add_ps(fjy1,ty);
1939             fjz1             = _mm_add_ps(fjz1,tz);
1940             
1941             }
1942
1943             /**************************
1944              * CALCULATE INTERACTIONS *
1945              **************************/
1946
1947             if (gmx_mm_any_lt(rsq32,rcutoff2))
1948             {
1949
1950             r32              = _mm_mul_ps(rsq32,rinv32);
1951
1952             /* EWALD ELECTROSTATICS */
1953
1954             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1955             ewrt             = _mm_mul_ps(r32,ewtabscale);
1956             ewitab           = _mm_cvttps_epi32(ewrt);
1957             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1958             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1959                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1960                                          &ewtabF,&ewtabFn);
1961             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1962             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1963
1964             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1965
1966             fscal            = felec;
1967
1968             fscal            = _mm_and_ps(fscal,cutoff_mask);
1969
1970             /* Calculate temporary vectorial force */
1971             tx               = _mm_mul_ps(fscal,dx32);
1972             ty               = _mm_mul_ps(fscal,dy32);
1973             tz               = _mm_mul_ps(fscal,dz32);
1974
1975             /* Update vectorial force */
1976             fix3             = _mm_add_ps(fix3,tx);
1977             fiy3             = _mm_add_ps(fiy3,ty);
1978             fiz3             = _mm_add_ps(fiz3,tz);
1979
1980             fjx2             = _mm_add_ps(fjx2,tx);
1981             fjy2             = _mm_add_ps(fjy2,ty);
1982             fjz2             = _mm_add_ps(fjz2,tz);
1983             
1984             }
1985
1986             /**************************
1987              * CALCULATE INTERACTIONS *
1988              **************************/
1989
1990             if (gmx_mm_any_lt(rsq33,rcutoff2))
1991             {
1992
1993             r33              = _mm_mul_ps(rsq33,rinv33);
1994
1995             /* EWALD ELECTROSTATICS */
1996
1997             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1998             ewrt             = _mm_mul_ps(r33,ewtabscale);
1999             ewitab           = _mm_cvttps_epi32(ewrt);
2000             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2001             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2002                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2003                                          &ewtabF,&ewtabFn);
2004             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2005             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2006
2007             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2008
2009             fscal            = felec;
2010
2011             fscal            = _mm_and_ps(fscal,cutoff_mask);
2012
2013             /* Calculate temporary vectorial force */
2014             tx               = _mm_mul_ps(fscal,dx33);
2015             ty               = _mm_mul_ps(fscal,dy33);
2016             tz               = _mm_mul_ps(fscal,dz33);
2017
2018             /* Update vectorial force */
2019             fix3             = _mm_add_ps(fix3,tx);
2020             fiy3             = _mm_add_ps(fiy3,ty);
2021             fiz3             = _mm_add_ps(fiz3,tz);
2022
2023             fjx3             = _mm_add_ps(fjx3,tx);
2024             fjy3             = _mm_add_ps(fjy3,ty);
2025             fjz3             = _mm_add_ps(fjz3,tz);
2026             
2027             }
2028
2029             fjptrA             = f+j_coord_offsetA;
2030             fjptrB             = f+j_coord_offsetB;
2031             fjptrC             = f+j_coord_offsetC;
2032             fjptrD             = f+j_coord_offsetD;
2033
2034             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2035                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2036
2037             /* Inner loop uses 351 flops */
2038         }
2039
2040         if(jidx<j_index_end)
2041         {
2042
2043             /* Get j neighbor index, and coordinate index */
2044             jnrlistA         = jjnr[jidx];
2045             jnrlistB         = jjnr[jidx+1];
2046             jnrlistC         = jjnr[jidx+2];
2047             jnrlistD         = jjnr[jidx+3];
2048             /* Sign of each element will be negative for non-real atoms.
2049              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2050              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2051              */
2052             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2053             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2054             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2055             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2056             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2057             j_coord_offsetA  = DIM*jnrA;
2058             j_coord_offsetB  = DIM*jnrB;
2059             j_coord_offsetC  = DIM*jnrC;
2060             j_coord_offsetD  = DIM*jnrD;
2061
2062             /* load j atom coordinates */
2063             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2064                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2065                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2066
2067             /* Calculate displacement vector */
2068             dx11             = _mm_sub_ps(ix1,jx1);
2069             dy11             = _mm_sub_ps(iy1,jy1);
2070             dz11             = _mm_sub_ps(iz1,jz1);
2071             dx12             = _mm_sub_ps(ix1,jx2);
2072             dy12             = _mm_sub_ps(iy1,jy2);
2073             dz12             = _mm_sub_ps(iz1,jz2);
2074             dx13             = _mm_sub_ps(ix1,jx3);
2075             dy13             = _mm_sub_ps(iy1,jy3);
2076             dz13             = _mm_sub_ps(iz1,jz3);
2077             dx21             = _mm_sub_ps(ix2,jx1);
2078             dy21             = _mm_sub_ps(iy2,jy1);
2079             dz21             = _mm_sub_ps(iz2,jz1);
2080             dx22             = _mm_sub_ps(ix2,jx2);
2081             dy22             = _mm_sub_ps(iy2,jy2);
2082             dz22             = _mm_sub_ps(iz2,jz2);
2083             dx23             = _mm_sub_ps(ix2,jx3);
2084             dy23             = _mm_sub_ps(iy2,jy3);
2085             dz23             = _mm_sub_ps(iz2,jz3);
2086             dx31             = _mm_sub_ps(ix3,jx1);
2087             dy31             = _mm_sub_ps(iy3,jy1);
2088             dz31             = _mm_sub_ps(iz3,jz1);
2089             dx32             = _mm_sub_ps(ix3,jx2);
2090             dy32             = _mm_sub_ps(iy3,jy2);
2091             dz32             = _mm_sub_ps(iz3,jz2);
2092             dx33             = _mm_sub_ps(ix3,jx3);
2093             dy33             = _mm_sub_ps(iy3,jy3);
2094             dz33             = _mm_sub_ps(iz3,jz3);
2095
2096             /* Calculate squared distance and things based on it */
2097             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2098             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2099             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2100             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2101             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2102             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2103             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2104             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2105             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2106
2107             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2108             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2109             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2110             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2111             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2112             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2113             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2114             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2115             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2116
2117             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2118             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2119             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2120             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2121             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2122             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2123             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2124             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2125             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2126
2127             fjx1             = _mm_setzero_ps();
2128             fjy1             = _mm_setzero_ps();
2129             fjz1             = _mm_setzero_ps();
2130             fjx2             = _mm_setzero_ps();
2131             fjy2             = _mm_setzero_ps();
2132             fjz2             = _mm_setzero_ps();
2133             fjx3             = _mm_setzero_ps();
2134             fjy3             = _mm_setzero_ps();
2135             fjz3             = _mm_setzero_ps();
2136
2137             /**************************
2138              * CALCULATE INTERACTIONS *
2139              **************************/
2140
2141             if (gmx_mm_any_lt(rsq11,rcutoff2))
2142             {
2143
2144             r11              = _mm_mul_ps(rsq11,rinv11);
2145             r11              = _mm_andnot_ps(dummy_mask,r11);
2146
2147             /* EWALD ELECTROSTATICS */
2148
2149             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2150             ewrt             = _mm_mul_ps(r11,ewtabscale);
2151             ewitab           = _mm_cvttps_epi32(ewrt);
2152             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2153             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2154                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2155                                          &ewtabF,&ewtabFn);
2156             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2157             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2158
2159             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2160
2161             fscal            = felec;
2162
2163             fscal            = _mm_and_ps(fscal,cutoff_mask);
2164
2165             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2166
2167             /* Calculate temporary vectorial force */
2168             tx               = _mm_mul_ps(fscal,dx11);
2169             ty               = _mm_mul_ps(fscal,dy11);
2170             tz               = _mm_mul_ps(fscal,dz11);
2171
2172             /* Update vectorial force */
2173             fix1             = _mm_add_ps(fix1,tx);
2174             fiy1             = _mm_add_ps(fiy1,ty);
2175             fiz1             = _mm_add_ps(fiz1,tz);
2176
2177             fjx1             = _mm_add_ps(fjx1,tx);
2178             fjy1             = _mm_add_ps(fjy1,ty);
2179             fjz1             = _mm_add_ps(fjz1,tz);
2180             
2181             }
2182
2183             /**************************
2184              * CALCULATE INTERACTIONS *
2185              **************************/
2186
2187             if (gmx_mm_any_lt(rsq12,rcutoff2))
2188             {
2189
2190             r12              = _mm_mul_ps(rsq12,rinv12);
2191             r12              = _mm_andnot_ps(dummy_mask,r12);
2192
2193             /* EWALD ELECTROSTATICS */
2194
2195             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2196             ewrt             = _mm_mul_ps(r12,ewtabscale);
2197             ewitab           = _mm_cvttps_epi32(ewrt);
2198             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2199             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2200                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2201                                          &ewtabF,&ewtabFn);
2202             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2203             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2204
2205             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2206
2207             fscal            = felec;
2208
2209             fscal            = _mm_and_ps(fscal,cutoff_mask);
2210
2211             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2212
2213             /* Calculate temporary vectorial force */
2214             tx               = _mm_mul_ps(fscal,dx12);
2215             ty               = _mm_mul_ps(fscal,dy12);
2216             tz               = _mm_mul_ps(fscal,dz12);
2217
2218             /* Update vectorial force */
2219             fix1             = _mm_add_ps(fix1,tx);
2220             fiy1             = _mm_add_ps(fiy1,ty);
2221             fiz1             = _mm_add_ps(fiz1,tz);
2222
2223             fjx2             = _mm_add_ps(fjx2,tx);
2224             fjy2             = _mm_add_ps(fjy2,ty);
2225             fjz2             = _mm_add_ps(fjz2,tz);
2226             
2227             }
2228
2229             /**************************
2230              * CALCULATE INTERACTIONS *
2231              **************************/
2232
2233             if (gmx_mm_any_lt(rsq13,rcutoff2))
2234             {
2235
2236             r13              = _mm_mul_ps(rsq13,rinv13);
2237             r13              = _mm_andnot_ps(dummy_mask,r13);
2238
2239             /* EWALD ELECTROSTATICS */
2240
2241             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2242             ewrt             = _mm_mul_ps(r13,ewtabscale);
2243             ewitab           = _mm_cvttps_epi32(ewrt);
2244             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2245             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2246                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2247                                          &ewtabF,&ewtabFn);
2248             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2249             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2250
2251             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2252
2253             fscal            = felec;
2254
2255             fscal            = _mm_and_ps(fscal,cutoff_mask);
2256
2257             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2258
2259             /* Calculate temporary vectorial force */
2260             tx               = _mm_mul_ps(fscal,dx13);
2261             ty               = _mm_mul_ps(fscal,dy13);
2262             tz               = _mm_mul_ps(fscal,dz13);
2263
2264             /* Update vectorial force */
2265             fix1             = _mm_add_ps(fix1,tx);
2266             fiy1             = _mm_add_ps(fiy1,ty);
2267             fiz1             = _mm_add_ps(fiz1,tz);
2268
2269             fjx3             = _mm_add_ps(fjx3,tx);
2270             fjy3             = _mm_add_ps(fjy3,ty);
2271             fjz3             = _mm_add_ps(fjz3,tz);
2272             
2273             }
2274
2275             /**************************
2276              * CALCULATE INTERACTIONS *
2277              **************************/
2278
2279             if (gmx_mm_any_lt(rsq21,rcutoff2))
2280             {
2281
2282             r21              = _mm_mul_ps(rsq21,rinv21);
2283             r21              = _mm_andnot_ps(dummy_mask,r21);
2284
2285             /* EWALD ELECTROSTATICS */
2286
2287             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288             ewrt             = _mm_mul_ps(r21,ewtabscale);
2289             ewitab           = _mm_cvttps_epi32(ewrt);
2290             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2291             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2292                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2293                                          &ewtabF,&ewtabFn);
2294             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2295             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2296
2297             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2298
2299             fscal            = felec;
2300
2301             fscal            = _mm_and_ps(fscal,cutoff_mask);
2302
2303             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2304
2305             /* Calculate temporary vectorial force */
2306             tx               = _mm_mul_ps(fscal,dx21);
2307             ty               = _mm_mul_ps(fscal,dy21);
2308             tz               = _mm_mul_ps(fscal,dz21);
2309
2310             /* Update vectorial force */
2311             fix2             = _mm_add_ps(fix2,tx);
2312             fiy2             = _mm_add_ps(fiy2,ty);
2313             fiz2             = _mm_add_ps(fiz2,tz);
2314
2315             fjx1             = _mm_add_ps(fjx1,tx);
2316             fjy1             = _mm_add_ps(fjy1,ty);
2317             fjz1             = _mm_add_ps(fjz1,tz);
2318             
2319             }
2320
2321             /**************************
2322              * CALCULATE INTERACTIONS *
2323              **************************/
2324
2325             if (gmx_mm_any_lt(rsq22,rcutoff2))
2326             {
2327
2328             r22              = _mm_mul_ps(rsq22,rinv22);
2329             r22              = _mm_andnot_ps(dummy_mask,r22);
2330
2331             /* EWALD ELECTROSTATICS */
2332
2333             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2334             ewrt             = _mm_mul_ps(r22,ewtabscale);
2335             ewitab           = _mm_cvttps_epi32(ewrt);
2336             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2337             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2338                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2339                                          &ewtabF,&ewtabFn);
2340             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2341             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2342
2343             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2344
2345             fscal            = felec;
2346
2347             fscal            = _mm_and_ps(fscal,cutoff_mask);
2348
2349             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2350
2351             /* Calculate temporary vectorial force */
2352             tx               = _mm_mul_ps(fscal,dx22);
2353             ty               = _mm_mul_ps(fscal,dy22);
2354             tz               = _mm_mul_ps(fscal,dz22);
2355
2356             /* Update vectorial force */
2357             fix2             = _mm_add_ps(fix2,tx);
2358             fiy2             = _mm_add_ps(fiy2,ty);
2359             fiz2             = _mm_add_ps(fiz2,tz);
2360
2361             fjx2             = _mm_add_ps(fjx2,tx);
2362             fjy2             = _mm_add_ps(fjy2,ty);
2363             fjz2             = _mm_add_ps(fjz2,tz);
2364             
2365             }
2366
2367             /**************************
2368              * CALCULATE INTERACTIONS *
2369              **************************/
2370
2371             if (gmx_mm_any_lt(rsq23,rcutoff2))
2372             {
2373
2374             r23              = _mm_mul_ps(rsq23,rinv23);
2375             r23              = _mm_andnot_ps(dummy_mask,r23);
2376
2377             /* EWALD ELECTROSTATICS */
2378
2379             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2380             ewrt             = _mm_mul_ps(r23,ewtabscale);
2381             ewitab           = _mm_cvttps_epi32(ewrt);
2382             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2383             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2384                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2385                                          &ewtabF,&ewtabFn);
2386             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2387             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2388
2389             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2390
2391             fscal            = felec;
2392
2393             fscal            = _mm_and_ps(fscal,cutoff_mask);
2394
2395             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2396
2397             /* Calculate temporary vectorial force */
2398             tx               = _mm_mul_ps(fscal,dx23);
2399             ty               = _mm_mul_ps(fscal,dy23);
2400             tz               = _mm_mul_ps(fscal,dz23);
2401
2402             /* Update vectorial force */
2403             fix2             = _mm_add_ps(fix2,tx);
2404             fiy2             = _mm_add_ps(fiy2,ty);
2405             fiz2             = _mm_add_ps(fiz2,tz);
2406
2407             fjx3             = _mm_add_ps(fjx3,tx);
2408             fjy3             = _mm_add_ps(fjy3,ty);
2409             fjz3             = _mm_add_ps(fjz3,tz);
2410             
2411             }
2412
2413             /**************************
2414              * CALCULATE INTERACTIONS *
2415              **************************/
2416
2417             if (gmx_mm_any_lt(rsq31,rcutoff2))
2418             {
2419
2420             r31              = _mm_mul_ps(rsq31,rinv31);
2421             r31              = _mm_andnot_ps(dummy_mask,r31);
2422
2423             /* EWALD ELECTROSTATICS */
2424
2425             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2426             ewrt             = _mm_mul_ps(r31,ewtabscale);
2427             ewitab           = _mm_cvttps_epi32(ewrt);
2428             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2429             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2430                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2431                                          &ewtabF,&ewtabFn);
2432             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2433             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2434
2435             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2436
2437             fscal            = felec;
2438
2439             fscal            = _mm_and_ps(fscal,cutoff_mask);
2440
2441             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2442
2443             /* Calculate temporary vectorial force */
2444             tx               = _mm_mul_ps(fscal,dx31);
2445             ty               = _mm_mul_ps(fscal,dy31);
2446             tz               = _mm_mul_ps(fscal,dz31);
2447
2448             /* Update vectorial force */
2449             fix3             = _mm_add_ps(fix3,tx);
2450             fiy3             = _mm_add_ps(fiy3,ty);
2451             fiz3             = _mm_add_ps(fiz3,tz);
2452
2453             fjx1             = _mm_add_ps(fjx1,tx);
2454             fjy1             = _mm_add_ps(fjy1,ty);
2455             fjz1             = _mm_add_ps(fjz1,tz);
2456             
2457             }
2458
2459             /**************************
2460              * CALCULATE INTERACTIONS *
2461              **************************/
2462
2463             if (gmx_mm_any_lt(rsq32,rcutoff2))
2464             {
2465
2466             r32              = _mm_mul_ps(rsq32,rinv32);
2467             r32              = _mm_andnot_ps(dummy_mask,r32);
2468
2469             /* EWALD ELECTROSTATICS */
2470
2471             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2472             ewrt             = _mm_mul_ps(r32,ewtabscale);
2473             ewitab           = _mm_cvttps_epi32(ewrt);
2474             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2475             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2476                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2477                                          &ewtabF,&ewtabFn);
2478             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2479             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2480
2481             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2482
2483             fscal            = felec;
2484
2485             fscal            = _mm_and_ps(fscal,cutoff_mask);
2486
2487             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2488
2489             /* Calculate temporary vectorial force */
2490             tx               = _mm_mul_ps(fscal,dx32);
2491             ty               = _mm_mul_ps(fscal,dy32);
2492             tz               = _mm_mul_ps(fscal,dz32);
2493
2494             /* Update vectorial force */
2495             fix3             = _mm_add_ps(fix3,tx);
2496             fiy3             = _mm_add_ps(fiy3,ty);
2497             fiz3             = _mm_add_ps(fiz3,tz);
2498
2499             fjx2             = _mm_add_ps(fjx2,tx);
2500             fjy2             = _mm_add_ps(fjy2,ty);
2501             fjz2             = _mm_add_ps(fjz2,tz);
2502             
2503             }
2504
2505             /**************************
2506              * CALCULATE INTERACTIONS *
2507              **************************/
2508
2509             if (gmx_mm_any_lt(rsq33,rcutoff2))
2510             {
2511
2512             r33              = _mm_mul_ps(rsq33,rinv33);
2513             r33              = _mm_andnot_ps(dummy_mask,r33);
2514
2515             /* EWALD ELECTROSTATICS */
2516
2517             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2518             ewrt             = _mm_mul_ps(r33,ewtabscale);
2519             ewitab           = _mm_cvttps_epi32(ewrt);
2520             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2521             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2522                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2523                                          &ewtabF,&ewtabFn);
2524             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2525             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2526
2527             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2528
2529             fscal            = felec;
2530
2531             fscal            = _mm_and_ps(fscal,cutoff_mask);
2532
2533             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2534
2535             /* Calculate temporary vectorial force */
2536             tx               = _mm_mul_ps(fscal,dx33);
2537             ty               = _mm_mul_ps(fscal,dy33);
2538             tz               = _mm_mul_ps(fscal,dz33);
2539
2540             /* Update vectorial force */
2541             fix3             = _mm_add_ps(fix3,tx);
2542             fiy3             = _mm_add_ps(fiy3,ty);
2543             fiz3             = _mm_add_ps(fiz3,tz);
2544
2545             fjx3             = _mm_add_ps(fjx3,tx);
2546             fjy3             = _mm_add_ps(fjy3,ty);
2547             fjz3             = _mm_add_ps(fjz3,tz);
2548             
2549             }
2550
2551             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2552             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2553             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2554             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2555
2556             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2557                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2558
2559             /* Inner loop uses 360 flops */
2560         }
2561
2562         /* End of innermost loop */
2563
2564         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2565                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2566
2567         /* Increment number of inner iterations */
2568         inneriter                  += j_index_end - j_index_start;
2569
2570         /* Outer loop uses 18 flops */
2571     }
2572
2573     /* Increment number of outer iterations */
2574     outeriter        += nri;
2575
2576     /* Update outer/inner flops */
2577
2578     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);
2579 }