e85efc2417ef79139ce23cc7f4663f21b8f3f993
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwNone_GeomW4W4_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            None
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset1;
84     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85     int              vdwioffset2;
86     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87     int              vdwioffset3;
88     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
90     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
92     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
94     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
98     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
101     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
102     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
103     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     __m128i          ewitab;
107     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108     real             *ewtab;
109     __m128           dummy_mask,cutoff_mask;
110     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111     __m128           one     = _mm_set1_ps(1.0);
112     __m128           two     = _mm_set1_ps(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm_set1_ps(fr->epsfac);
125     charge           = mdatoms->chargeA;
126
127     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
128     ewtab            = fr->ic->tabq_coul_FDV0;
129     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
130     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
137
138     jq1              = _mm_set1_ps(charge[inr+1]);
139     jq2              = _mm_set1_ps(charge[inr+2]);
140     jq3              = _mm_set1_ps(charge[inr+3]);
141     qq11             = _mm_mul_ps(iq1,jq1);
142     qq12             = _mm_mul_ps(iq1,jq2);
143     qq13             = _mm_mul_ps(iq1,jq3);
144     qq21             = _mm_mul_ps(iq2,jq1);
145     qq22             = _mm_mul_ps(iq2,jq2);
146     qq23             = _mm_mul_ps(iq2,jq3);
147     qq31             = _mm_mul_ps(iq3,jq1);
148     qq32             = _mm_mul_ps(iq3,jq2);
149     qq33             = _mm_mul_ps(iq3,jq3);
150
151     /* Avoid stupid compiler warnings */
152     jnrA = jnrB = jnrC = jnrD = 0;
153     j_coord_offsetA = 0;
154     j_coord_offsetB = 0;
155     j_coord_offsetC = 0;
156     j_coord_offsetD = 0;
157
158     outeriter        = 0;
159     inneriter        = 0;
160
161     for(iidx=0;iidx<4*DIM;iidx++)
162     {
163         scratch[iidx] = 0.0;
164     }  
165
166     /* Start outer loop over neighborlists */
167     for(iidx=0; iidx<nri; iidx++)
168     {
169         /* Load shift vector for this list */
170         i_shift_offset   = DIM*shiftidx[iidx];
171
172         /* Load limits for loop over neighbors */
173         j_index_start    = jindex[iidx];
174         j_index_end      = jindex[iidx+1];
175
176         /* Get outer coordinate index */
177         inr              = iinr[iidx];
178         i_coord_offset   = DIM*inr;
179
180         /* Load i particle coords and add shift vector */
181         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
182                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183         
184         fix1             = _mm_setzero_ps();
185         fiy1             = _mm_setzero_ps();
186         fiz1             = _mm_setzero_ps();
187         fix2             = _mm_setzero_ps();
188         fiy2             = _mm_setzero_ps();
189         fiz2             = _mm_setzero_ps();
190         fix3             = _mm_setzero_ps();
191         fiy3             = _mm_setzero_ps();
192         fiz3             = _mm_setzero_ps();
193
194         /* Reset potential sums */
195         velecsum         = _mm_setzero_ps();
196
197         /* Start inner kernel loop */
198         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
199         {
200
201             /* Get j neighbor index, and coordinate index */
202             jnrA             = jjnr[jidx];
203             jnrB             = jjnr[jidx+1];
204             jnrC             = jjnr[jidx+2];
205             jnrD             = jjnr[jidx+3];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208             j_coord_offsetC  = DIM*jnrC;
209             j_coord_offsetD  = DIM*jnrD;
210
211             /* load j atom coordinates */
212             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
213                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
214                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
215
216             /* Calculate displacement vector */
217             dx11             = _mm_sub_ps(ix1,jx1);
218             dy11             = _mm_sub_ps(iy1,jy1);
219             dz11             = _mm_sub_ps(iz1,jz1);
220             dx12             = _mm_sub_ps(ix1,jx2);
221             dy12             = _mm_sub_ps(iy1,jy2);
222             dz12             = _mm_sub_ps(iz1,jz2);
223             dx13             = _mm_sub_ps(ix1,jx3);
224             dy13             = _mm_sub_ps(iy1,jy3);
225             dz13             = _mm_sub_ps(iz1,jz3);
226             dx21             = _mm_sub_ps(ix2,jx1);
227             dy21             = _mm_sub_ps(iy2,jy1);
228             dz21             = _mm_sub_ps(iz2,jz1);
229             dx22             = _mm_sub_ps(ix2,jx2);
230             dy22             = _mm_sub_ps(iy2,jy2);
231             dz22             = _mm_sub_ps(iz2,jz2);
232             dx23             = _mm_sub_ps(ix2,jx3);
233             dy23             = _mm_sub_ps(iy2,jy3);
234             dz23             = _mm_sub_ps(iz2,jz3);
235             dx31             = _mm_sub_ps(ix3,jx1);
236             dy31             = _mm_sub_ps(iy3,jy1);
237             dz31             = _mm_sub_ps(iz3,jz1);
238             dx32             = _mm_sub_ps(ix3,jx2);
239             dy32             = _mm_sub_ps(iy3,jy2);
240             dz32             = _mm_sub_ps(iz3,jz2);
241             dx33             = _mm_sub_ps(ix3,jx3);
242             dy33             = _mm_sub_ps(iy3,jy3);
243             dz33             = _mm_sub_ps(iz3,jz3);
244
245             /* Calculate squared distance and things based on it */
246             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
247             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
248             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
249             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
250             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
251             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
252             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
253             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
254             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
255
256             rinv11           = gmx_mm_invsqrt_ps(rsq11);
257             rinv12           = gmx_mm_invsqrt_ps(rsq12);
258             rinv13           = gmx_mm_invsqrt_ps(rsq13);
259             rinv21           = gmx_mm_invsqrt_ps(rsq21);
260             rinv22           = gmx_mm_invsqrt_ps(rsq22);
261             rinv23           = gmx_mm_invsqrt_ps(rsq23);
262             rinv31           = gmx_mm_invsqrt_ps(rsq31);
263             rinv32           = gmx_mm_invsqrt_ps(rsq32);
264             rinv33           = gmx_mm_invsqrt_ps(rsq33);
265
266             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
267             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
268             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
269             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
270             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
271             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
272             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
273             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
274             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
275
276             fjx1             = _mm_setzero_ps();
277             fjy1             = _mm_setzero_ps();
278             fjz1             = _mm_setzero_ps();
279             fjx2             = _mm_setzero_ps();
280             fjy2             = _mm_setzero_ps();
281             fjz2             = _mm_setzero_ps();
282             fjx3             = _mm_setzero_ps();
283             fjy3             = _mm_setzero_ps();
284             fjz3             = _mm_setzero_ps();
285
286             /**************************
287              * CALCULATE INTERACTIONS *
288              **************************/
289
290             r11              = _mm_mul_ps(rsq11,rinv11);
291
292             /* EWALD ELECTROSTATICS */
293
294             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
295             ewrt             = _mm_mul_ps(r11,ewtabscale);
296             ewitab           = _mm_cvttps_epi32(ewrt);
297             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
298             ewitab           = _mm_slli_epi32(ewitab,2);
299             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
300             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
301             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
302             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
303             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
304             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
305             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
306             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
307             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
308
309             /* Update potential sum for this i atom from the interaction with this j atom. */
310             velecsum         = _mm_add_ps(velecsum,velec);
311
312             fscal            = felec;
313
314             /* Calculate temporary vectorial force */
315             tx               = _mm_mul_ps(fscal,dx11);
316             ty               = _mm_mul_ps(fscal,dy11);
317             tz               = _mm_mul_ps(fscal,dz11);
318
319             /* Update vectorial force */
320             fix1             = _mm_add_ps(fix1,tx);
321             fiy1             = _mm_add_ps(fiy1,ty);
322             fiz1             = _mm_add_ps(fiz1,tz);
323
324             fjx1             = _mm_add_ps(fjx1,tx);
325             fjy1             = _mm_add_ps(fjy1,ty);
326             fjz1             = _mm_add_ps(fjz1,tz);
327             
328             /**************************
329              * CALCULATE INTERACTIONS *
330              **************************/
331
332             r12              = _mm_mul_ps(rsq12,rinv12);
333
334             /* EWALD ELECTROSTATICS */
335
336             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
337             ewrt             = _mm_mul_ps(r12,ewtabscale);
338             ewitab           = _mm_cvttps_epi32(ewrt);
339             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
340             ewitab           = _mm_slli_epi32(ewitab,2);
341             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
342             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
343             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
344             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
345             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
346             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
347             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
348             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
349             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
350
351             /* Update potential sum for this i atom from the interaction with this j atom. */
352             velecsum         = _mm_add_ps(velecsum,velec);
353
354             fscal            = felec;
355
356             /* Calculate temporary vectorial force */
357             tx               = _mm_mul_ps(fscal,dx12);
358             ty               = _mm_mul_ps(fscal,dy12);
359             tz               = _mm_mul_ps(fscal,dz12);
360
361             /* Update vectorial force */
362             fix1             = _mm_add_ps(fix1,tx);
363             fiy1             = _mm_add_ps(fiy1,ty);
364             fiz1             = _mm_add_ps(fiz1,tz);
365
366             fjx2             = _mm_add_ps(fjx2,tx);
367             fjy2             = _mm_add_ps(fjy2,ty);
368             fjz2             = _mm_add_ps(fjz2,tz);
369             
370             /**************************
371              * CALCULATE INTERACTIONS *
372              **************************/
373
374             r13              = _mm_mul_ps(rsq13,rinv13);
375
376             /* EWALD ELECTROSTATICS */
377
378             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379             ewrt             = _mm_mul_ps(r13,ewtabscale);
380             ewitab           = _mm_cvttps_epi32(ewrt);
381             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
382             ewitab           = _mm_slli_epi32(ewitab,2);
383             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
384             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
385             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
386             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
387             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
388             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
389             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
390             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
391             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velecsum         = _mm_add_ps(velecsum,velec);
395
396             fscal            = felec;
397
398             /* Calculate temporary vectorial force */
399             tx               = _mm_mul_ps(fscal,dx13);
400             ty               = _mm_mul_ps(fscal,dy13);
401             tz               = _mm_mul_ps(fscal,dz13);
402
403             /* Update vectorial force */
404             fix1             = _mm_add_ps(fix1,tx);
405             fiy1             = _mm_add_ps(fiy1,ty);
406             fiz1             = _mm_add_ps(fiz1,tz);
407
408             fjx3             = _mm_add_ps(fjx3,tx);
409             fjy3             = _mm_add_ps(fjy3,ty);
410             fjz3             = _mm_add_ps(fjz3,tz);
411             
412             /**************************
413              * CALCULATE INTERACTIONS *
414              **************************/
415
416             r21              = _mm_mul_ps(rsq21,rinv21);
417
418             /* EWALD ELECTROSTATICS */
419
420             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
421             ewrt             = _mm_mul_ps(r21,ewtabscale);
422             ewitab           = _mm_cvttps_epi32(ewrt);
423             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
424             ewitab           = _mm_slli_epi32(ewitab,2);
425             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
426             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
427             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
428             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
429             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
430             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
431             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
432             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
433             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velecsum         = _mm_add_ps(velecsum,velec);
437
438             fscal            = felec;
439
440             /* Calculate temporary vectorial force */
441             tx               = _mm_mul_ps(fscal,dx21);
442             ty               = _mm_mul_ps(fscal,dy21);
443             tz               = _mm_mul_ps(fscal,dz21);
444
445             /* Update vectorial force */
446             fix2             = _mm_add_ps(fix2,tx);
447             fiy2             = _mm_add_ps(fiy2,ty);
448             fiz2             = _mm_add_ps(fiz2,tz);
449
450             fjx1             = _mm_add_ps(fjx1,tx);
451             fjy1             = _mm_add_ps(fjy1,ty);
452             fjz1             = _mm_add_ps(fjz1,tz);
453             
454             /**************************
455              * CALCULATE INTERACTIONS *
456              **************************/
457
458             r22              = _mm_mul_ps(rsq22,rinv22);
459
460             /* EWALD ELECTROSTATICS */
461
462             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
463             ewrt             = _mm_mul_ps(r22,ewtabscale);
464             ewitab           = _mm_cvttps_epi32(ewrt);
465             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
466             ewitab           = _mm_slli_epi32(ewitab,2);
467             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
468             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
469             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
470             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
471             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
472             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
473             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
474             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
475             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velecsum         = _mm_add_ps(velecsum,velec);
479
480             fscal            = felec;
481
482             /* Calculate temporary vectorial force */
483             tx               = _mm_mul_ps(fscal,dx22);
484             ty               = _mm_mul_ps(fscal,dy22);
485             tz               = _mm_mul_ps(fscal,dz22);
486
487             /* Update vectorial force */
488             fix2             = _mm_add_ps(fix2,tx);
489             fiy2             = _mm_add_ps(fiy2,ty);
490             fiz2             = _mm_add_ps(fiz2,tz);
491
492             fjx2             = _mm_add_ps(fjx2,tx);
493             fjy2             = _mm_add_ps(fjy2,ty);
494             fjz2             = _mm_add_ps(fjz2,tz);
495             
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             r23              = _mm_mul_ps(rsq23,rinv23);
501
502             /* EWALD ELECTROSTATICS */
503
504             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
505             ewrt             = _mm_mul_ps(r23,ewtabscale);
506             ewitab           = _mm_cvttps_epi32(ewrt);
507             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
508             ewitab           = _mm_slli_epi32(ewitab,2);
509             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
510             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
511             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
512             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
513             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
514             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
515             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
516             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
517             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
518
519             /* Update potential sum for this i atom from the interaction with this j atom. */
520             velecsum         = _mm_add_ps(velecsum,velec);
521
522             fscal            = felec;
523
524             /* Calculate temporary vectorial force */
525             tx               = _mm_mul_ps(fscal,dx23);
526             ty               = _mm_mul_ps(fscal,dy23);
527             tz               = _mm_mul_ps(fscal,dz23);
528
529             /* Update vectorial force */
530             fix2             = _mm_add_ps(fix2,tx);
531             fiy2             = _mm_add_ps(fiy2,ty);
532             fiz2             = _mm_add_ps(fiz2,tz);
533
534             fjx3             = _mm_add_ps(fjx3,tx);
535             fjy3             = _mm_add_ps(fjy3,ty);
536             fjz3             = _mm_add_ps(fjz3,tz);
537             
538             /**************************
539              * CALCULATE INTERACTIONS *
540              **************************/
541
542             r31              = _mm_mul_ps(rsq31,rinv31);
543
544             /* EWALD ELECTROSTATICS */
545
546             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
547             ewrt             = _mm_mul_ps(r31,ewtabscale);
548             ewitab           = _mm_cvttps_epi32(ewrt);
549             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
550             ewitab           = _mm_slli_epi32(ewitab,2);
551             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
552             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
553             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
554             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
555             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
556             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
557             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
558             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
559             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
560
561             /* Update potential sum for this i atom from the interaction with this j atom. */
562             velecsum         = _mm_add_ps(velecsum,velec);
563
564             fscal            = felec;
565
566             /* Calculate temporary vectorial force */
567             tx               = _mm_mul_ps(fscal,dx31);
568             ty               = _mm_mul_ps(fscal,dy31);
569             tz               = _mm_mul_ps(fscal,dz31);
570
571             /* Update vectorial force */
572             fix3             = _mm_add_ps(fix3,tx);
573             fiy3             = _mm_add_ps(fiy3,ty);
574             fiz3             = _mm_add_ps(fiz3,tz);
575
576             fjx1             = _mm_add_ps(fjx1,tx);
577             fjy1             = _mm_add_ps(fjy1,ty);
578             fjz1             = _mm_add_ps(fjz1,tz);
579             
580             /**************************
581              * CALCULATE INTERACTIONS *
582              **************************/
583
584             r32              = _mm_mul_ps(rsq32,rinv32);
585
586             /* EWALD ELECTROSTATICS */
587
588             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
589             ewrt             = _mm_mul_ps(r32,ewtabscale);
590             ewitab           = _mm_cvttps_epi32(ewrt);
591             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
592             ewitab           = _mm_slli_epi32(ewitab,2);
593             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
594             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
595             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
596             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
597             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
598             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
599             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
600             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
601             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
602
603             /* Update potential sum for this i atom from the interaction with this j atom. */
604             velecsum         = _mm_add_ps(velecsum,velec);
605
606             fscal            = felec;
607
608             /* Calculate temporary vectorial force */
609             tx               = _mm_mul_ps(fscal,dx32);
610             ty               = _mm_mul_ps(fscal,dy32);
611             tz               = _mm_mul_ps(fscal,dz32);
612
613             /* Update vectorial force */
614             fix3             = _mm_add_ps(fix3,tx);
615             fiy3             = _mm_add_ps(fiy3,ty);
616             fiz3             = _mm_add_ps(fiz3,tz);
617
618             fjx2             = _mm_add_ps(fjx2,tx);
619             fjy2             = _mm_add_ps(fjy2,ty);
620             fjz2             = _mm_add_ps(fjz2,tz);
621             
622             /**************************
623              * CALCULATE INTERACTIONS *
624              **************************/
625
626             r33              = _mm_mul_ps(rsq33,rinv33);
627
628             /* EWALD ELECTROSTATICS */
629
630             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
631             ewrt             = _mm_mul_ps(r33,ewtabscale);
632             ewitab           = _mm_cvttps_epi32(ewrt);
633             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
634             ewitab           = _mm_slli_epi32(ewitab,2);
635             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
636             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
637             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
638             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
639             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
640             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
641             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
642             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
643             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
644
645             /* Update potential sum for this i atom from the interaction with this j atom. */
646             velecsum         = _mm_add_ps(velecsum,velec);
647
648             fscal            = felec;
649
650             /* Calculate temporary vectorial force */
651             tx               = _mm_mul_ps(fscal,dx33);
652             ty               = _mm_mul_ps(fscal,dy33);
653             tz               = _mm_mul_ps(fscal,dz33);
654
655             /* Update vectorial force */
656             fix3             = _mm_add_ps(fix3,tx);
657             fiy3             = _mm_add_ps(fiy3,ty);
658             fiz3             = _mm_add_ps(fiz3,tz);
659
660             fjx3             = _mm_add_ps(fjx3,tx);
661             fjy3             = _mm_add_ps(fjy3,ty);
662             fjz3             = _mm_add_ps(fjz3,tz);
663             
664             fjptrA             = f+j_coord_offsetA;
665             fjptrB             = f+j_coord_offsetB;
666             fjptrC             = f+j_coord_offsetC;
667             fjptrD             = f+j_coord_offsetD;
668
669             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
670                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
671
672             /* Inner loop uses 369 flops */
673         }
674
675         if(jidx<j_index_end)
676         {
677
678             /* Get j neighbor index, and coordinate index */
679             jnrlistA         = jjnr[jidx];
680             jnrlistB         = jjnr[jidx+1];
681             jnrlistC         = jjnr[jidx+2];
682             jnrlistD         = jjnr[jidx+3];
683             /* Sign of each element will be negative for non-real atoms.
684              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
685              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
686              */
687             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
688             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
689             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
690             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
691             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
692             j_coord_offsetA  = DIM*jnrA;
693             j_coord_offsetB  = DIM*jnrB;
694             j_coord_offsetC  = DIM*jnrC;
695             j_coord_offsetD  = DIM*jnrD;
696
697             /* load j atom coordinates */
698             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
699                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
700                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
701
702             /* Calculate displacement vector */
703             dx11             = _mm_sub_ps(ix1,jx1);
704             dy11             = _mm_sub_ps(iy1,jy1);
705             dz11             = _mm_sub_ps(iz1,jz1);
706             dx12             = _mm_sub_ps(ix1,jx2);
707             dy12             = _mm_sub_ps(iy1,jy2);
708             dz12             = _mm_sub_ps(iz1,jz2);
709             dx13             = _mm_sub_ps(ix1,jx3);
710             dy13             = _mm_sub_ps(iy1,jy3);
711             dz13             = _mm_sub_ps(iz1,jz3);
712             dx21             = _mm_sub_ps(ix2,jx1);
713             dy21             = _mm_sub_ps(iy2,jy1);
714             dz21             = _mm_sub_ps(iz2,jz1);
715             dx22             = _mm_sub_ps(ix2,jx2);
716             dy22             = _mm_sub_ps(iy2,jy2);
717             dz22             = _mm_sub_ps(iz2,jz2);
718             dx23             = _mm_sub_ps(ix2,jx3);
719             dy23             = _mm_sub_ps(iy2,jy3);
720             dz23             = _mm_sub_ps(iz2,jz3);
721             dx31             = _mm_sub_ps(ix3,jx1);
722             dy31             = _mm_sub_ps(iy3,jy1);
723             dz31             = _mm_sub_ps(iz3,jz1);
724             dx32             = _mm_sub_ps(ix3,jx2);
725             dy32             = _mm_sub_ps(iy3,jy2);
726             dz32             = _mm_sub_ps(iz3,jz2);
727             dx33             = _mm_sub_ps(ix3,jx3);
728             dy33             = _mm_sub_ps(iy3,jy3);
729             dz33             = _mm_sub_ps(iz3,jz3);
730
731             /* Calculate squared distance and things based on it */
732             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
733             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
734             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
735             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
736             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
737             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
738             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
739             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
740             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
741
742             rinv11           = gmx_mm_invsqrt_ps(rsq11);
743             rinv12           = gmx_mm_invsqrt_ps(rsq12);
744             rinv13           = gmx_mm_invsqrt_ps(rsq13);
745             rinv21           = gmx_mm_invsqrt_ps(rsq21);
746             rinv22           = gmx_mm_invsqrt_ps(rsq22);
747             rinv23           = gmx_mm_invsqrt_ps(rsq23);
748             rinv31           = gmx_mm_invsqrt_ps(rsq31);
749             rinv32           = gmx_mm_invsqrt_ps(rsq32);
750             rinv33           = gmx_mm_invsqrt_ps(rsq33);
751
752             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
753             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
754             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
755             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
756             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
757             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
758             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
759             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
760             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
761
762             fjx1             = _mm_setzero_ps();
763             fjy1             = _mm_setzero_ps();
764             fjz1             = _mm_setzero_ps();
765             fjx2             = _mm_setzero_ps();
766             fjy2             = _mm_setzero_ps();
767             fjz2             = _mm_setzero_ps();
768             fjx3             = _mm_setzero_ps();
769             fjy3             = _mm_setzero_ps();
770             fjz3             = _mm_setzero_ps();
771
772             /**************************
773              * CALCULATE INTERACTIONS *
774              **************************/
775
776             r11              = _mm_mul_ps(rsq11,rinv11);
777             r11              = _mm_andnot_ps(dummy_mask,r11);
778
779             /* EWALD ELECTROSTATICS */
780
781             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
782             ewrt             = _mm_mul_ps(r11,ewtabscale);
783             ewitab           = _mm_cvttps_epi32(ewrt);
784             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
785             ewitab           = _mm_slli_epi32(ewitab,2);
786             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
787             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
788             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
789             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
790             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
791             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
792             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
793             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
794             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
795
796             /* Update potential sum for this i atom from the interaction with this j atom. */
797             velec            = _mm_andnot_ps(dummy_mask,velec);
798             velecsum         = _mm_add_ps(velecsum,velec);
799
800             fscal            = felec;
801
802             fscal            = _mm_andnot_ps(dummy_mask,fscal);
803
804             /* Calculate temporary vectorial force */
805             tx               = _mm_mul_ps(fscal,dx11);
806             ty               = _mm_mul_ps(fscal,dy11);
807             tz               = _mm_mul_ps(fscal,dz11);
808
809             /* Update vectorial force */
810             fix1             = _mm_add_ps(fix1,tx);
811             fiy1             = _mm_add_ps(fiy1,ty);
812             fiz1             = _mm_add_ps(fiz1,tz);
813
814             fjx1             = _mm_add_ps(fjx1,tx);
815             fjy1             = _mm_add_ps(fjy1,ty);
816             fjz1             = _mm_add_ps(fjz1,tz);
817             
818             /**************************
819              * CALCULATE INTERACTIONS *
820              **************************/
821
822             r12              = _mm_mul_ps(rsq12,rinv12);
823             r12              = _mm_andnot_ps(dummy_mask,r12);
824
825             /* EWALD ELECTROSTATICS */
826
827             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
828             ewrt             = _mm_mul_ps(r12,ewtabscale);
829             ewitab           = _mm_cvttps_epi32(ewrt);
830             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
831             ewitab           = _mm_slli_epi32(ewitab,2);
832             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
833             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
834             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
835             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
836             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
837             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
838             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
839             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
840             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
841
842             /* Update potential sum for this i atom from the interaction with this j atom. */
843             velec            = _mm_andnot_ps(dummy_mask,velec);
844             velecsum         = _mm_add_ps(velecsum,velec);
845
846             fscal            = felec;
847
848             fscal            = _mm_andnot_ps(dummy_mask,fscal);
849
850             /* Calculate temporary vectorial force */
851             tx               = _mm_mul_ps(fscal,dx12);
852             ty               = _mm_mul_ps(fscal,dy12);
853             tz               = _mm_mul_ps(fscal,dz12);
854
855             /* Update vectorial force */
856             fix1             = _mm_add_ps(fix1,tx);
857             fiy1             = _mm_add_ps(fiy1,ty);
858             fiz1             = _mm_add_ps(fiz1,tz);
859
860             fjx2             = _mm_add_ps(fjx2,tx);
861             fjy2             = _mm_add_ps(fjy2,ty);
862             fjz2             = _mm_add_ps(fjz2,tz);
863             
864             /**************************
865              * CALCULATE INTERACTIONS *
866              **************************/
867
868             r13              = _mm_mul_ps(rsq13,rinv13);
869             r13              = _mm_andnot_ps(dummy_mask,r13);
870
871             /* EWALD ELECTROSTATICS */
872
873             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
874             ewrt             = _mm_mul_ps(r13,ewtabscale);
875             ewitab           = _mm_cvttps_epi32(ewrt);
876             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
877             ewitab           = _mm_slli_epi32(ewitab,2);
878             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
879             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
880             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
881             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
882             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
883             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
884             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
885             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
886             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
887
888             /* Update potential sum for this i atom from the interaction with this j atom. */
889             velec            = _mm_andnot_ps(dummy_mask,velec);
890             velecsum         = _mm_add_ps(velecsum,velec);
891
892             fscal            = felec;
893
894             fscal            = _mm_andnot_ps(dummy_mask,fscal);
895
896             /* Calculate temporary vectorial force */
897             tx               = _mm_mul_ps(fscal,dx13);
898             ty               = _mm_mul_ps(fscal,dy13);
899             tz               = _mm_mul_ps(fscal,dz13);
900
901             /* Update vectorial force */
902             fix1             = _mm_add_ps(fix1,tx);
903             fiy1             = _mm_add_ps(fiy1,ty);
904             fiz1             = _mm_add_ps(fiz1,tz);
905
906             fjx3             = _mm_add_ps(fjx3,tx);
907             fjy3             = _mm_add_ps(fjy3,ty);
908             fjz3             = _mm_add_ps(fjz3,tz);
909             
910             /**************************
911              * CALCULATE INTERACTIONS *
912              **************************/
913
914             r21              = _mm_mul_ps(rsq21,rinv21);
915             r21              = _mm_andnot_ps(dummy_mask,r21);
916
917             /* EWALD ELECTROSTATICS */
918
919             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
920             ewrt             = _mm_mul_ps(r21,ewtabscale);
921             ewitab           = _mm_cvttps_epi32(ewrt);
922             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
923             ewitab           = _mm_slli_epi32(ewitab,2);
924             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
925             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
926             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
927             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
928             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
929             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
930             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
931             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
932             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
933
934             /* Update potential sum for this i atom from the interaction with this j atom. */
935             velec            = _mm_andnot_ps(dummy_mask,velec);
936             velecsum         = _mm_add_ps(velecsum,velec);
937
938             fscal            = felec;
939
940             fscal            = _mm_andnot_ps(dummy_mask,fscal);
941
942             /* Calculate temporary vectorial force */
943             tx               = _mm_mul_ps(fscal,dx21);
944             ty               = _mm_mul_ps(fscal,dy21);
945             tz               = _mm_mul_ps(fscal,dz21);
946
947             /* Update vectorial force */
948             fix2             = _mm_add_ps(fix2,tx);
949             fiy2             = _mm_add_ps(fiy2,ty);
950             fiz2             = _mm_add_ps(fiz2,tz);
951
952             fjx1             = _mm_add_ps(fjx1,tx);
953             fjy1             = _mm_add_ps(fjy1,ty);
954             fjz1             = _mm_add_ps(fjz1,tz);
955             
956             /**************************
957              * CALCULATE INTERACTIONS *
958              **************************/
959
960             r22              = _mm_mul_ps(rsq22,rinv22);
961             r22              = _mm_andnot_ps(dummy_mask,r22);
962
963             /* EWALD ELECTROSTATICS */
964
965             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
966             ewrt             = _mm_mul_ps(r22,ewtabscale);
967             ewitab           = _mm_cvttps_epi32(ewrt);
968             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
969             ewitab           = _mm_slli_epi32(ewitab,2);
970             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
971             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
972             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
973             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
974             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
975             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
976             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
977             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
978             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
979
980             /* Update potential sum for this i atom from the interaction with this j atom. */
981             velec            = _mm_andnot_ps(dummy_mask,velec);
982             velecsum         = _mm_add_ps(velecsum,velec);
983
984             fscal            = felec;
985
986             fscal            = _mm_andnot_ps(dummy_mask,fscal);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm_mul_ps(fscal,dx22);
990             ty               = _mm_mul_ps(fscal,dy22);
991             tz               = _mm_mul_ps(fscal,dz22);
992
993             /* Update vectorial force */
994             fix2             = _mm_add_ps(fix2,tx);
995             fiy2             = _mm_add_ps(fiy2,ty);
996             fiz2             = _mm_add_ps(fiz2,tz);
997
998             fjx2             = _mm_add_ps(fjx2,tx);
999             fjy2             = _mm_add_ps(fjy2,ty);
1000             fjz2             = _mm_add_ps(fjz2,tz);
1001             
1002             /**************************
1003              * CALCULATE INTERACTIONS *
1004              **************************/
1005
1006             r23              = _mm_mul_ps(rsq23,rinv23);
1007             r23              = _mm_andnot_ps(dummy_mask,r23);
1008
1009             /* EWALD ELECTROSTATICS */
1010
1011             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012             ewrt             = _mm_mul_ps(r23,ewtabscale);
1013             ewitab           = _mm_cvttps_epi32(ewrt);
1014             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1015             ewitab           = _mm_slli_epi32(ewitab,2);
1016             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1017             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1018             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1019             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1020             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1021             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1022             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1023             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1024             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1025
1026             /* Update potential sum for this i atom from the interaction with this j atom. */
1027             velec            = _mm_andnot_ps(dummy_mask,velec);
1028             velecsum         = _mm_add_ps(velecsum,velec);
1029
1030             fscal            = felec;
1031
1032             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1033
1034             /* Calculate temporary vectorial force */
1035             tx               = _mm_mul_ps(fscal,dx23);
1036             ty               = _mm_mul_ps(fscal,dy23);
1037             tz               = _mm_mul_ps(fscal,dz23);
1038
1039             /* Update vectorial force */
1040             fix2             = _mm_add_ps(fix2,tx);
1041             fiy2             = _mm_add_ps(fiy2,ty);
1042             fiz2             = _mm_add_ps(fiz2,tz);
1043
1044             fjx3             = _mm_add_ps(fjx3,tx);
1045             fjy3             = _mm_add_ps(fjy3,ty);
1046             fjz3             = _mm_add_ps(fjz3,tz);
1047             
1048             /**************************
1049              * CALCULATE INTERACTIONS *
1050              **************************/
1051
1052             r31              = _mm_mul_ps(rsq31,rinv31);
1053             r31              = _mm_andnot_ps(dummy_mask,r31);
1054
1055             /* EWALD ELECTROSTATICS */
1056
1057             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058             ewrt             = _mm_mul_ps(r31,ewtabscale);
1059             ewitab           = _mm_cvttps_epi32(ewrt);
1060             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1061             ewitab           = _mm_slli_epi32(ewitab,2);
1062             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1063             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1064             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1065             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1066             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1067             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1068             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1069             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1070             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1071
1072             /* Update potential sum for this i atom from the interaction with this j atom. */
1073             velec            = _mm_andnot_ps(dummy_mask,velec);
1074             velecsum         = _mm_add_ps(velecsum,velec);
1075
1076             fscal            = felec;
1077
1078             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1079
1080             /* Calculate temporary vectorial force */
1081             tx               = _mm_mul_ps(fscal,dx31);
1082             ty               = _mm_mul_ps(fscal,dy31);
1083             tz               = _mm_mul_ps(fscal,dz31);
1084
1085             /* Update vectorial force */
1086             fix3             = _mm_add_ps(fix3,tx);
1087             fiy3             = _mm_add_ps(fiy3,ty);
1088             fiz3             = _mm_add_ps(fiz3,tz);
1089
1090             fjx1             = _mm_add_ps(fjx1,tx);
1091             fjy1             = _mm_add_ps(fjy1,ty);
1092             fjz1             = _mm_add_ps(fjz1,tz);
1093             
1094             /**************************
1095              * CALCULATE INTERACTIONS *
1096              **************************/
1097
1098             r32              = _mm_mul_ps(rsq32,rinv32);
1099             r32              = _mm_andnot_ps(dummy_mask,r32);
1100
1101             /* EWALD ELECTROSTATICS */
1102
1103             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104             ewrt             = _mm_mul_ps(r32,ewtabscale);
1105             ewitab           = _mm_cvttps_epi32(ewrt);
1106             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1107             ewitab           = _mm_slli_epi32(ewitab,2);
1108             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1109             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1110             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1111             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1112             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1113             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1114             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1115             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1116             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1117
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _mm_andnot_ps(dummy_mask,velec);
1120             velecsum         = _mm_add_ps(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1125
1126             /* Calculate temporary vectorial force */
1127             tx               = _mm_mul_ps(fscal,dx32);
1128             ty               = _mm_mul_ps(fscal,dy32);
1129             tz               = _mm_mul_ps(fscal,dz32);
1130
1131             /* Update vectorial force */
1132             fix3             = _mm_add_ps(fix3,tx);
1133             fiy3             = _mm_add_ps(fiy3,ty);
1134             fiz3             = _mm_add_ps(fiz3,tz);
1135
1136             fjx2             = _mm_add_ps(fjx2,tx);
1137             fjy2             = _mm_add_ps(fjy2,ty);
1138             fjz2             = _mm_add_ps(fjz2,tz);
1139             
1140             /**************************
1141              * CALCULATE INTERACTIONS *
1142              **************************/
1143
1144             r33              = _mm_mul_ps(rsq33,rinv33);
1145             r33              = _mm_andnot_ps(dummy_mask,r33);
1146
1147             /* EWALD ELECTROSTATICS */
1148
1149             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150             ewrt             = _mm_mul_ps(r33,ewtabscale);
1151             ewitab           = _mm_cvttps_epi32(ewrt);
1152             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1153             ewitab           = _mm_slli_epi32(ewitab,2);
1154             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1155             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1156             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1157             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1158             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1159             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1160             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1161             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1162             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1163
1164             /* Update potential sum for this i atom from the interaction with this j atom. */
1165             velec            = _mm_andnot_ps(dummy_mask,velec);
1166             velecsum         = _mm_add_ps(velecsum,velec);
1167
1168             fscal            = felec;
1169
1170             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1171
1172             /* Calculate temporary vectorial force */
1173             tx               = _mm_mul_ps(fscal,dx33);
1174             ty               = _mm_mul_ps(fscal,dy33);
1175             tz               = _mm_mul_ps(fscal,dz33);
1176
1177             /* Update vectorial force */
1178             fix3             = _mm_add_ps(fix3,tx);
1179             fiy3             = _mm_add_ps(fiy3,ty);
1180             fiz3             = _mm_add_ps(fiz3,tz);
1181
1182             fjx3             = _mm_add_ps(fjx3,tx);
1183             fjy3             = _mm_add_ps(fjy3,ty);
1184             fjz3             = _mm_add_ps(fjz3,tz);
1185             
1186             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1187             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1188             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1189             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1190
1191             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1192                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1193
1194             /* Inner loop uses 378 flops */
1195         }
1196
1197         /* End of innermost loop */
1198
1199         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1200                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1201
1202         ggid                        = gid[iidx];
1203         /* Update potential energies */
1204         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1205
1206         /* Increment number of inner iterations */
1207         inneriter                  += j_index_end - j_index_start;
1208
1209         /* Outer loop uses 19 flops */
1210     }
1211
1212     /* Increment number of outer iterations */
1213     outeriter        += nri;
1214
1215     /* Update outer/inner flops */
1216
1217     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*378);
1218 }
1219 /*
1220  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
1221  * Electrostatics interaction: Ewald
1222  * VdW interaction:            None
1223  * Geometry:                   Water4-Water4
1224  * Calculate force/pot:        Force
1225  */
1226 void
1227 nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
1228                     (t_nblist                    * gmx_restrict       nlist,
1229                      rvec                        * gmx_restrict          xx,
1230                      rvec                        * gmx_restrict          ff,
1231                      t_forcerec                  * gmx_restrict          fr,
1232                      t_mdatoms                   * gmx_restrict     mdatoms,
1233                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1234                      t_nrnb                      * gmx_restrict        nrnb)
1235 {
1236     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1237      * just 0 for non-waters.
1238      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1239      * jnr indices corresponding to data put in the four positions in the SIMD register.
1240      */
1241     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1242     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1243     int              jnrA,jnrB,jnrC,jnrD;
1244     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1245     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1246     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1247     real             rcutoff_scalar;
1248     real             *shiftvec,*fshift,*x,*f;
1249     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1250     real             scratch[4*DIM];
1251     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1252     int              vdwioffset1;
1253     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1254     int              vdwioffset2;
1255     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1256     int              vdwioffset3;
1257     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1258     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1259     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1260     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1261     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1262     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1263     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1264     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1265     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1266     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1267     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1268     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1269     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1270     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1271     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1272     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1273     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1274     real             *charge;
1275     __m128i          ewitab;
1276     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1277     real             *ewtab;
1278     __m128           dummy_mask,cutoff_mask;
1279     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1280     __m128           one     = _mm_set1_ps(1.0);
1281     __m128           two     = _mm_set1_ps(2.0);
1282     x                = xx[0];
1283     f                = ff[0];
1284
1285     nri              = nlist->nri;
1286     iinr             = nlist->iinr;
1287     jindex           = nlist->jindex;
1288     jjnr             = nlist->jjnr;
1289     shiftidx         = nlist->shift;
1290     gid              = nlist->gid;
1291     shiftvec         = fr->shift_vec[0];
1292     fshift           = fr->fshift[0];
1293     facel            = _mm_set1_ps(fr->epsfac);
1294     charge           = mdatoms->chargeA;
1295
1296     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1297     ewtab            = fr->ic->tabq_coul_F;
1298     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1299     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1300
1301     /* Setup water-specific parameters */
1302     inr              = nlist->iinr[0];
1303     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1304     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1305     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1306
1307     jq1              = _mm_set1_ps(charge[inr+1]);
1308     jq2              = _mm_set1_ps(charge[inr+2]);
1309     jq3              = _mm_set1_ps(charge[inr+3]);
1310     qq11             = _mm_mul_ps(iq1,jq1);
1311     qq12             = _mm_mul_ps(iq1,jq2);
1312     qq13             = _mm_mul_ps(iq1,jq3);
1313     qq21             = _mm_mul_ps(iq2,jq1);
1314     qq22             = _mm_mul_ps(iq2,jq2);
1315     qq23             = _mm_mul_ps(iq2,jq3);
1316     qq31             = _mm_mul_ps(iq3,jq1);
1317     qq32             = _mm_mul_ps(iq3,jq2);
1318     qq33             = _mm_mul_ps(iq3,jq3);
1319
1320     /* Avoid stupid compiler warnings */
1321     jnrA = jnrB = jnrC = jnrD = 0;
1322     j_coord_offsetA = 0;
1323     j_coord_offsetB = 0;
1324     j_coord_offsetC = 0;
1325     j_coord_offsetD = 0;
1326
1327     outeriter        = 0;
1328     inneriter        = 0;
1329
1330     for(iidx=0;iidx<4*DIM;iidx++)
1331     {
1332         scratch[iidx] = 0.0;
1333     }  
1334
1335     /* Start outer loop over neighborlists */
1336     for(iidx=0; iidx<nri; iidx++)
1337     {
1338         /* Load shift vector for this list */
1339         i_shift_offset   = DIM*shiftidx[iidx];
1340
1341         /* Load limits for loop over neighbors */
1342         j_index_start    = jindex[iidx];
1343         j_index_end      = jindex[iidx+1];
1344
1345         /* Get outer coordinate index */
1346         inr              = iinr[iidx];
1347         i_coord_offset   = DIM*inr;
1348
1349         /* Load i particle coords and add shift vector */
1350         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1351                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1352         
1353         fix1             = _mm_setzero_ps();
1354         fiy1             = _mm_setzero_ps();
1355         fiz1             = _mm_setzero_ps();
1356         fix2             = _mm_setzero_ps();
1357         fiy2             = _mm_setzero_ps();
1358         fiz2             = _mm_setzero_ps();
1359         fix3             = _mm_setzero_ps();
1360         fiy3             = _mm_setzero_ps();
1361         fiz3             = _mm_setzero_ps();
1362
1363         /* Start inner kernel loop */
1364         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1365         {
1366
1367             /* Get j neighbor index, and coordinate index */
1368             jnrA             = jjnr[jidx];
1369             jnrB             = jjnr[jidx+1];
1370             jnrC             = jjnr[jidx+2];
1371             jnrD             = jjnr[jidx+3];
1372             j_coord_offsetA  = DIM*jnrA;
1373             j_coord_offsetB  = DIM*jnrB;
1374             j_coord_offsetC  = DIM*jnrC;
1375             j_coord_offsetD  = DIM*jnrD;
1376
1377             /* load j atom coordinates */
1378             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1379                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1380                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1381
1382             /* Calculate displacement vector */
1383             dx11             = _mm_sub_ps(ix1,jx1);
1384             dy11             = _mm_sub_ps(iy1,jy1);
1385             dz11             = _mm_sub_ps(iz1,jz1);
1386             dx12             = _mm_sub_ps(ix1,jx2);
1387             dy12             = _mm_sub_ps(iy1,jy2);
1388             dz12             = _mm_sub_ps(iz1,jz2);
1389             dx13             = _mm_sub_ps(ix1,jx3);
1390             dy13             = _mm_sub_ps(iy1,jy3);
1391             dz13             = _mm_sub_ps(iz1,jz3);
1392             dx21             = _mm_sub_ps(ix2,jx1);
1393             dy21             = _mm_sub_ps(iy2,jy1);
1394             dz21             = _mm_sub_ps(iz2,jz1);
1395             dx22             = _mm_sub_ps(ix2,jx2);
1396             dy22             = _mm_sub_ps(iy2,jy2);
1397             dz22             = _mm_sub_ps(iz2,jz2);
1398             dx23             = _mm_sub_ps(ix2,jx3);
1399             dy23             = _mm_sub_ps(iy2,jy3);
1400             dz23             = _mm_sub_ps(iz2,jz3);
1401             dx31             = _mm_sub_ps(ix3,jx1);
1402             dy31             = _mm_sub_ps(iy3,jy1);
1403             dz31             = _mm_sub_ps(iz3,jz1);
1404             dx32             = _mm_sub_ps(ix3,jx2);
1405             dy32             = _mm_sub_ps(iy3,jy2);
1406             dz32             = _mm_sub_ps(iz3,jz2);
1407             dx33             = _mm_sub_ps(ix3,jx3);
1408             dy33             = _mm_sub_ps(iy3,jy3);
1409             dz33             = _mm_sub_ps(iz3,jz3);
1410
1411             /* Calculate squared distance and things based on it */
1412             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1413             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1414             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1415             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1416             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1417             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1418             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1419             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1420             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1421
1422             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1423             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1424             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1425             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1426             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1427             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1428             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1429             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1430             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1431
1432             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1433             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1434             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1435             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1436             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1437             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1438             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1439             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1440             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1441
1442             fjx1             = _mm_setzero_ps();
1443             fjy1             = _mm_setzero_ps();
1444             fjz1             = _mm_setzero_ps();
1445             fjx2             = _mm_setzero_ps();
1446             fjy2             = _mm_setzero_ps();
1447             fjz2             = _mm_setzero_ps();
1448             fjx3             = _mm_setzero_ps();
1449             fjy3             = _mm_setzero_ps();
1450             fjz3             = _mm_setzero_ps();
1451
1452             /**************************
1453              * CALCULATE INTERACTIONS *
1454              **************************/
1455
1456             r11              = _mm_mul_ps(rsq11,rinv11);
1457
1458             /* EWALD ELECTROSTATICS */
1459
1460             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1461             ewrt             = _mm_mul_ps(r11,ewtabscale);
1462             ewitab           = _mm_cvttps_epi32(ewrt);
1463             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1464             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1465                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1466                                          &ewtabF,&ewtabFn);
1467             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1468             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1469
1470             fscal            = felec;
1471
1472             /* Calculate temporary vectorial force */
1473             tx               = _mm_mul_ps(fscal,dx11);
1474             ty               = _mm_mul_ps(fscal,dy11);
1475             tz               = _mm_mul_ps(fscal,dz11);
1476
1477             /* Update vectorial force */
1478             fix1             = _mm_add_ps(fix1,tx);
1479             fiy1             = _mm_add_ps(fiy1,ty);
1480             fiz1             = _mm_add_ps(fiz1,tz);
1481
1482             fjx1             = _mm_add_ps(fjx1,tx);
1483             fjy1             = _mm_add_ps(fjy1,ty);
1484             fjz1             = _mm_add_ps(fjz1,tz);
1485             
1486             /**************************
1487              * CALCULATE INTERACTIONS *
1488              **************************/
1489
1490             r12              = _mm_mul_ps(rsq12,rinv12);
1491
1492             /* EWALD ELECTROSTATICS */
1493
1494             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1495             ewrt             = _mm_mul_ps(r12,ewtabscale);
1496             ewitab           = _mm_cvttps_epi32(ewrt);
1497             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1498             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1499                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1500                                          &ewtabF,&ewtabFn);
1501             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1502             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1503
1504             fscal            = felec;
1505
1506             /* Calculate temporary vectorial force */
1507             tx               = _mm_mul_ps(fscal,dx12);
1508             ty               = _mm_mul_ps(fscal,dy12);
1509             tz               = _mm_mul_ps(fscal,dz12);
1510
1511             /* Update vectorial force */
1512             fix1             = _mm_add_ps(fix1,tx);
1513             fiy1             = _mm_add_ps(fiy1,ty);
1514             fiz1             = _mm_add_ps(fiz1,tz);
1515
1516             fjx2             = _mm_add_ps(fjx2,tx);
1517             fjy2             = _mm_add_ps(fjy2,ty);
1518             fjz2             = _mm_add_ps(fjz2,tz);
1519             
1520             /**************************
1521              * CALCULATE INTERACTIONS *
1522              **************************/
1523
1524             r13              = _mm_mul_ps(rsq13,rinv13);
1525
1526             /* EWALD ELECTROSTATICS */
1527
1528             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1529             ewrt             = _mm_mul_ps(r13,ewtabscale);
1530             ewitab           = _mm_cvttps_epi32(ewrt);
1531             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1532             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1533                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1534                                          &ewtabF,&ewtabFn);
1535             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1536             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1537
1538             fscal            = felec;
1539
1540             /* Calculate temporary vectorial force */
1541             tx               = _mm_mul_ps(fscal,dx13);
1542             ty               = _mm_mul_ps(fscal,dy13);
1543             tz               = _mm_mul_ps(fscal,dz13);
1544
1545             /* Update vectorial force */
1546             fix1             = _mm_add_ps(fix1,tx);
1547             fiy1             = _mm_add_ps(fiy1,ty);
1548             fiz1             = _mm_add_ps(fiz1,tz);
1549
1550             fjx3             = _mm_add_ps(fjx3,tx);
1551             fjy3             = _mm_add_ps(fjy3,ty);
1552             fjz3             = _mm_add_ps(fjz3,tz);
1553             
1554             /**************************
1555              * CALCULATE INTERACTIONS *
1556              **************************/
1557
1558             r21              = _mm_mul_ps(rsq21,rinv21);
1559
1560             /* EWALD ELECTROSTATICS */
1561
1562             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1563             ewrt             = _mm_mul_ps(r21,ewtabscale);
1564             ewitab           = _mm_cvttps_epi32(ewrt);
1565             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1566             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1567                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1568                                          &ewtabF,&ewtabFn);
1569             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1570             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1571
1572             fscal            = felec;
1573
1574             /* Calculate temporary vectorial force */
1575             tx               = _mm_mul_ps(fscal,dx21);
1576             ty               = _mm_mul_ps(fscal,dy21);
1577             tz               = _mm_mul_ps(fscal,dz21);
1578
1579             /* Update vectorial force */
1580             fix2             = _mm_add_ps(fix2,tx);
1581             fiy2             = _mm_add_ps(fiy2,ty);
1582             fiz2             = _mm_add_ps(fiz2,tz);
1583
1584             fjx1             = _mm_add_ps(fjx1,tx);
1585             fjy1             = _mm_add_ps(fjy1,ty);
1586             fjz1             = _mm_add_ps(fjz1,tz);
1587             
1588             /**************************
1589              * CALCULATE INTERACTIONS *
1590              **************************/
1591
1592             r22              = _mm_mul_ps(rsq22,rinv22);
1593
1594             /* EWALD ELECTROSTATICS */
1595
1596             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1597             ewrt             = _mm_mul_ps(r22,ewtabscale);
1598             ewitab           = _mm_cvttps_epi32(ewrt);
1599             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1600             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1601                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1602                                          &ewtabF,&ewtabFn);
1603             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1604             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1605
1606             fscal            = felec;
1607
1608             /* Calculate temporary vectorial force */
1609             tx               = _mm_mul_ps(fscal,dx22);
1610             ty               = _mm_mul_ps(fscal,dy22);
1611             tz               = _mm_mul_ps(fscal,dz22);
1612
1613             /* Update vectorial force */
1614             fix2             = _mm_add_ps(fix2,tx);
1615             fiy2             = _mm_add_ps(fiy2,ty);
1616             fiz2             = _mm_add_ps(fiz2,tz);
1617
1618             fjx2             = _mm_add_ps(fjx2,tx);
1619             fjy2             = _mm_add_ps(fjy2,ty);
1620             fjz2             = _mm_add_ps(fjz2,tz);
1621             
1622             /**************************
1623              * CALCULATE INTERACTIONS *
1624              **************************/
1625
1626             r23              = _mm_mul_ps(rsq23,rinv23);
1627
1628             /* EWALD ELECTROSTATICS */
1629
1630             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1631             ewrt             = _mm_mul_ps(r23,ewtabscale);
1632             ewitab           = _mm_cvttps_epi32(ewrt);
1633             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1634             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1635                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1636                                          &ewtabF,&ewtabFn);
1637             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1638             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1639
1640             fscal            = felec;
1641
1642             /* Calculate temporary vectorial force */
1643             tx               = _mm_mul_ps(fscal,dx23);
1644             ty               = _mm_mul_ps(fscal,dy23);
1645             tz               = _mm_mul_ps(fscal,dz23);
1646
1647             /* Update vectorial force */
1648             fix2             = _mm_add_ps(fix2,tx);
1649             fiy2             = _mm_add_ps(fiy2,ty);
1650             fiz2             = _mm_add_ps(fiz2,tz);
1651
1652             fjx3             = _mm_add_ps(fjx3,tx);
1653             fjy3             = _mm_add_ps(fjy3,ty);
1654             fjz3             = _mm_add_ps(fjz3,tz);
1655             
1656             /**************************
1657              * CALCULATE INTERACTIONS *
1658              **************************/
1659
1660             r31              = _mm_mul_ps(rsq31,rinv31);
1661
1662             /* EWALD ELECTROSTATICS */
1663
1664             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1665             ewrt             = _mm_mul_ps(r31,ewtabscale);
1666             ewitab           = _mm_cvttps_epi32(ewrt);
1667             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1668             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1669                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1670                                          &ewtabF,&ewtabFn);
1671             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1672             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1673
1674             fscal            = felec;
1675
1676             /* Calculate temporary vectorial force */
1677             tx               = _mm_mul_ps(fscal,dx31);
1678             ty               = _mm_mul_ps(fscal,dy31);
1679             tz               = _mm_mul_ps(fscal,dz31);
1680
1681             /* Update vectorial force */
1682             fix3             = _mm_add_ps(fix3,tx);
1683             fiy3             = _mm_add_ps(fiy3,ty);
1684             fiz3             = _mm_add_ps(fiz3,tz);
1685
1686             fjx1             = _mm_add_ps(fjx1,tx);
1687             fjy1             = _mm_add_ps(fjy1,ty);
1688             fjz1             = _mm_add_ps(fjz1,tz);
1689             
1690             /**************************
1691              * CALCULATE INTERACTIONS *
1692              **************************/
1693
1694             r32              = _mm_mul_ps(rsq32,rinv32);
1695
1696             /* EWALD ELECTROSTATICS */
1697
1698             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1699             ewrt             = _mm_mul_ps(r32,ewtabscale);
1700             ewitab           = _mm_cvttps_epi32(ewrt);
1701             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1702             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1703                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1704                                          &ewtabF,&ewtabFn);
1705             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1706             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1707
1708             fscal            = felec;
1709
1710             /* Calculate temporary vectorial force */
1711             tx               = _mm_mul_ps(fscal,dx32);
1712             ty               = _mm_mul_ps(fscal,dy32);
1713             tz               = _mm_mul_ps(fscal,dz32);
1714
1715             /* Update vectorial force */
1716             fix3             = _mm_add_ps(fix3,tx);
1717             fiy3             = _mm_add_ps(fiy3,ty);
1718             fiz3             = _mm_add_ps(fiz3,tz);
1719
1720             fjx2             = _mm_add_ps(fjx2,tx);
1721             fjy2             = _mm_add_ps(fjy2,ty);
1722             fjz2             = _mm_add_ps(fjz2,tz);
1723             
1724             /**************************
1725              * CALCULATE INTERACTIONS *
1726              **************************/
1727
1728             r33              = _mm_mul_ps(rsq33,rinv33);
1729
1730             /* EWALD ELECTROSTATICS */
1731
1732             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1733             ewrt             = _mm_mul_ps(r33,ewtabscale);
1734             ewitab           = _mm_cvttps_epi32(ewrt);
1735             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1736             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1737                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1738                                          &ewtabF,&ewtabFn);
1739             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1740             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1741
1742             fscal            = felec;
1743
1744             /* Calculate temporary vectorial force */
1745             tx               = _mm_mul_ps(fscal,dx33);
1746             ty               = _mm_mul_ps(fscal,dy33);
1747             tz               = _mm_mul_ps(fscal,dz33);
1748
1749             /* Update vectorial force */
1750             fix3             = _mm_add_ps(fix3,tx);
1751             fiy3             = _mm_add_ps(fiy3,ty);
1752             fiz3             = _mm_add_ps(fiz3,tz);
1753
1754             fjx3             = _mm_add_ps(fjx3,tx);
1755             fjy3             = _mm_add_ps(fjy3,ty);
1756             fjz3             = _mm_add_ps(fjz3,tz);
1757             
1758             fjptrA             = f+j_coord_offsetA;
1759             fjptrB             = f+j_coord_offsetB;
1760             fjptrC             = f+j_coord_offsetC;
1761             fjptrD             = f+j_coord_offsetD;
1762
1763             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1764                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1765
1766             /* Inner loop uses 324 flops */
1767         }
1768
1769         if(jidx<j_index_end)
1770         {
1771
1772             /* Get j neighbor index, and coordinate index */
1773             jnrlistA         = jjnr[jidx];
1774             jnrlistB         = jjnr[jidx+1];
1775             jnrlistC         = jjnr[jidx+2];
1776             jnrlistD         = jjnr[jidx+3];
1777             /* Sign of each element will be negative for non-real atoms.
1778              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1779              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1780              */
1781             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1782             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1783             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1784             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1785             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1786             j_coord_offsetA  = DIM*jnrA;
1787             j_coord_offsetB  = DIM*jnrB;
1788             j_coord_offsetC  = DIM*jnrC;
1789             j_coord_offsetD  = DIM*jnrD;
1790
1791             /* load j atom coordinates */
1792             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1793                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1794                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1795
1796             /* Calculate displacement vector */
1797             dx11             = _mm_sub_ps(ix1,jx1);
1798             dy11             = _mm_sub_ps(iy1,jy1);
1799             dz11             = _mm_sub_ps(iz1,jz1);
1800             dx12             = _mm_sub_ps(ix1,jx2);
1801             dy12             = _mm_sub_ps(iy1,jy2);
1802             dz12             = _mm_sub_ps(iz1,jz2);
1803             dx13             = _mm_sub_ps(ix1,jx3);
1804             dy13             = _mm_sub_ps(iy1,jy3);
1805             dz13             = _mm_sub_ps(iz1,jz3);
1806             dx21             = _mm_sub_ps(ix2,jx1);
1807             dy21             = _mm_sub_ps(iy2,jy1);
1808             dz21             = _mm_sub_ps(iz2,jz1);
1809             dx22             = _mm_sub_ps(ix2,jx2);
1810             dy22             = _mm_sub_ps(iy2,jy2);
1811             dz22             = _mm_sub_ps(iz2,jz2);
1812             dx23             = _mm_sub_ps(ix2,jx3);
1813             dy23             = _mm_sub_ps(iy2,jy3);
1814             dz23             = _mm_sub_ps(iz2,jz3);
1815             dx31             = _mm_sub_ps(ix3,jx1);
1816             dy31             = _mm_sub_ps(iy3,jy1);
1817             dz31             = _mm_sub_ps(iz3,jz1);
1818             dx32             = _mm_sub_ps(ix3,jx2);
1819             dy32             = _mm_sub_ps(iy3,jy2);
1820             dz32             = _mm_sub_ps(iz3,jz2);
1821             dx33             = _mm_sub_ps(ix3,jx3);
1822             dy33             = _mm_sub_ps(iy3,jy3);
1823             dz33             = _mm_sub_ps(iz3,jz3);
1824
1825             /* Calculate squared distance and things based on it */
1826             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1827             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1828             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1829             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1830             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1831             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1832             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1833             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1834             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1835
1836             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1837             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1838             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1839             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1840             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1841             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1842             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1843             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1844             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1845
1846             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1847             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1848             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1849             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1850             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1851             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1852             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1853             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1854             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1855
1856             fjx1             = _mm_setzero_ps();
1857             fjy1             = _mm_setzero_ps();
1858             fjz1             = _mm_setzero_ps();
1859             fjx2             = _mm_setzero_ps();
1860             fjy2             = _mm_setzero_ps();
1861             fjz2             = _mm_setzero_ps();
1862             fjx3             = _mm_setzero_ps();
1863             fjy3             = _mm_setzero_ps();
1864             fjz3             = _mm_setzero_ps();
1865
1866             /**************************
1867              * CALCULATE INTERACTIONS *
1868              **************************/
1869
1870             r11              = _mm_mul_ps(rsq11,rinv11);
1871             r11              = _mm_andnot_ps(dummy_mask,r11);
1872
1873             /* EWALD ELECTROSTATICS */
1874
1875             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1876             ewrt             = _mm_mul_ps(r11,ewtabscale);
1877             ewitab           = _mm_cvttps_epi32(ewrt);
1878             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1879             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1880                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1881                                          &ewtabF,&ewtabFn);
1882             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1883             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1884
1885             fscal            = felec;
1886
1887             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1888
1889             /* Calculate temporary vectorial force */
1890             tx               = _mm_mul_ps(fscal,dx11);
1891             ty               = _mm_mul_ps(fscal,dy11);
1892             tz               = _mm_mul_ps(fscal,dz11);
1893
1894             /* Update vectorial force */
1895             fix1             = _mm_add_ps(fix1,tx);
1896             fiy1             = _mm_add_ps(fiy1,ty);
1897             fiz1             = _mm_add_ps(fiz1,tz);
1898
1899             fjx1             = _mm_add_ps(fjx1,tx);
1900             fjy1             = _mm_add_ps(fjy1,ty);
1901             fjz1             = _mm_add_ps(fjz1,tz);
1902             
1903             /**************************
1904              * CALCULATE INTERACTIONS *
1905              **************************/
1906
1907             r12              = _mm_mul_ps(rsq12,rinv12);
1908             r12              = _mm_andnot_ps(dummy_mask,r12);
1909
1910             /* EWALD ELECTROSTATICS */
1911
1912             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1913             ewrt             = _mm_mul_ps(r12,ewtabscale);
1914             ewitab           = _mm_cvttps_epi32(ewrt);
1915             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1916             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1917                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1918                                          &ewtabF,&ewtabFn);
1919             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1920             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1921
1922             fscal            = felec;
1923
1924             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1925
1926             /* Calculate temporary vectorial force */
1927             tx               = _mm_mul_ps(fscal,dx12);
1928             ty               = _mm_mul_ps(fscal,dy12);
1929             tz               = _mm_mul_ps(fscal,dz12);
1930
1931             /* Update vectorial force */
1932             fix1             = _mm_add_ps(fix1,tx);
1933             fiy1             = _mm_add_ps(fiy1,ty);
1934             fiz1             = _mm_add_ps(fiz1,tz);
1935
1936             fjx2             = _mm_add_ps(fjx2,tx);
1937             fjy2             = _mm_add_ps(fjy2,ty);
1938             fjz2             = _mm_add_ps(fjz2,tz);
1939             
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             r13              = _mm_mul_ps(rsq13,rinv13);
1945             r13              = _mm_andnot_ps(dummy_mask,r13);
1946
1947             /* EWALD ELECTROSTATICS */
1948
1949             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1950             ewrt             = _mm_mul_ps(r13,ewtabscale);
1951             ewitab           = _mm_cvttps_epi32(ewrt);
1952             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1953             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1954                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1955                                          &ewtabF,&ewtabFn);
1956             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1957             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1958
1959             fscal            = felec;
1960
1961             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1962
1963             /* Calculate temporary vectorial force */
1964             tx               = _mm_mul_ps(fscal,dx13);
1965             ty               = _mm_mul_ps(fscal,dy13);
1966             tz               = _mm_mul_ps(fscal,dz13);
1967
1968             /* Update vectorial force */
1969             fix1             = _mm_add_ps(fix1,tx);
1970             fiy1             = _mm_add_ps(fiy1,ty);
1971             fiz1             = _mm_add_ps(fiz1,tz);
1972
1973             fjx3             = _mm_add_ps(fjx3,tx);
1974             fjy3             = _mm_add_ps(fjy3,ty);
1975             fjz3             = _mm_add_ps(fjz3,tz);
1976             
1977             /**************************
1978              * CALCULATE INTERACTIONS *
1979              **************************/
1980
1981             r21              = _mm_mul_ps(rsq21,rinv21);
1982             r21              = _mm_andnot_ps(dummy_mask,r21);
1983
1984             /* EWALD ELECTROSTATICS */
1985
1986             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1987             ewrt             = _mm_mul_ps(r21,ewtabscale);
1988             ewitab           = _mm_cvttps_epi32(ewrt);
1989             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1990             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1991                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1992                                          &ewtabF,&ewtabFn);
1993             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1994             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1995
1996             fscal            = felec;
1997
1998             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1999
2000             /* Calculate temporary vectorial force */
2001             tx               = _mm_mul_ps(fscal,dx21);
2002             ty               = _mm_mul_ps(fscal,dy21);
2003             tz               = _mm_mul_ps(fscal,dz21);
2004
2005             /* Update vectorial force */
2006             fix2             = _mm_add_ps(fix2,tx);
2007             fiy2             = _mm_add_ps(fiy2,ty);
2008             fiz2             = _mm_add_ps(fiz2,tz);
2009
2010             fjx1             = _mm_add_ps(fjx1,tx);
2011             fjy1             = _mm_add_ps(fjy1,ty);
2012             fjz1             = _mm_add_ps(fjz1,tz);
2013             
2014             /**************************
2015              * CALCULATE INTERACTIONS *
2016              **************************/
2017
2018             r22              = _mm_mul_ps(rsq22,rinv22);
2019             r22              = _mm_andnot_ps(dummy_mask,r22);
2020
2021             /* EWALD ELECTROSTATICS */
2022
2023             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2024             ewrt             = _mm_mul_ps(r22,ewtabscale);
2025             ewitab           = _mm_cvttps_epi32(ewrt);
2026             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2027             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2028                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2029                                          &ewtabF,&ewtabFn);
2030             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2031             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2032
2033             fscal            = felec;
2034
2035             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2036
2037             /* Calculate temporary vectorial force */
2038             tx               = _mm_mul_ps(fscal,dx22);
2039             ty               = _mm_mul_ps(fscal,dy22);
2040             tz               = _mm_mul_ps(fscal,dz22);
2041
2042             /* Update vectorial force */
2043             fix2             = _mm_add_ps(fix2,tx);
2044             fiy2             = _mm_add_ps(fiy2,ty);
2045             fiz2             = _mm_add_ps(fiz2,tz);
2046
2047             fjx2             = _mm_add_ps(fjx2,tx);
2048             fjy2             = _mm_add_ps(fjy2,ty);
2049             fjz2             = _mm_add_ps(fjz2,tz);
2050             
2051             /**************************
2052              * CALCULATE INTERACTIONS *
2053              **************************/
2054
2055             r23              = _mm_mul_ps(rsq23,rinv23);
2056             r23              = _mm_andnot_ps(dummy_mask,r23);
2057
2058             /* EWALD ELECTROSTATICS */
2059
2060             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2061             ewrt             = _mm_mul_ps(r23,ewtabscale);
2062             ewitab           = _mm_cvttps_epi32(ewrt);
2063             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2064             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2065                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2066                                          &ewtabF,&ewtabFn);
2067             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2068             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2069
2070             fscal            = felec;
2071
2072             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2073
2074             /* Calculate temporary vectorial force */
2075             tx               = _mm_mul_ps(fscal,dx23);
2076             ty               = _mm_mul_ps(fscal,dy23);
2077             tz               = _mm_mul_ps(fscal,dz23);
2078
2079             /* Update vectorial force */
2080             fix2             = _mm_add_ps(fix2,tx);
2081             fiy2             = _mm_add_ps(fiy2,ty);
2082             fiz2             = _mm_add_ps(fiz2,tz);
2083
2084             fjx3             = _mm_add_ps(fjx3,tx);
2085             fjy3             = _mm_add_ps(fjy3,ty);
2086             fjz3             = _mm_add_ps(fjz3,tz);
2087             
2088             /**************************
2089              * CALCULATE INTERACTIONS *
2090              **************************/
2091
2092             r31              = _mm_mul_ps(rsq31,rinv31);
2093             r31              = _mm_andnot_ps(dummy_mask,r31);
2094
2095             /* EWALD ELECTROSTATICS */
2096
2097             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098             ewrt             = _mm_mul_ps(r31,ewtabscale);
2099             ewitab           = _mm_cvttps_epi32(ewrt);
2100             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2101             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2102                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2103                                          &ewtabF,&ewtabFn);
2104             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2105             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2106
2107             fscal            = felec;
2108
2109             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2110
2111             /* Calculate temporary vectorial force */
2112             tx               = _mm_mul_ps(fscal,dx31);
2113             ty               = _mm_mul_ps(fscal,dy31);
2114             tz               = _mm_mul_ps(fscal,dz31);
2115
2116             /* Update vectorial force */
2117             fix3             = _mm_add_ps(fix3,tx);
2118             fiy3             = _mm_add_ps(fiy3,ty);
2119             fiz3             = _mm_add_ps(fiz3,tz);
2120
2121             fjx1             = _mm_add_ps(fjx1,tx);
2122             fjy1             = _mm_add_ps(fjy1,ty);
2123             fjz1             = _mm_add_ps(fjz1,tz);
2124             
2125             /**************************
2126              * CALCULATE INTERACTIONS *
2127              **************************/
2128
2129             r32              = _mm_mul_ps(rsq32,rinv32);
2130             r32              = _mm_andnot_ps(dummy_mask,r32);
2131
2132             /* EWALD ELECTROSTATICS */
2133
2134             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2135             ewrt             = _mm_mul_ps(r32,ewtabscale);
2136             ewitab           = _mm_cvttps_epi32(ewrt);
2137             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2138             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2139                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2140                                          &ewtabF,&ewtabFn);
2141             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2142             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2143
2144             fscal            = felec;
2145
2146             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2147
2148             /* Calculate temporary vectorial force */
2149             tx               = _mm_mul_ps(fscal,dx32);
2150             ty               = _mm_mul_ps(fscal,dy32);
2151             tz               = _mm_mul_ps(fscal,dz32);
2152
2153             /* Update vectorial force */
2154             fix3             = _mm_add_ps(fix3,tx);
2155             fiy3             = _mm_add_ps(fiy3,ty);
2156             fiz3             = _mm_add_ps(fiz3,tz);
2157
2158             fjx2             = _mm_add_ps(fjx2,tx);
2159             fjy2             = _mm_add_ps(fjy2,ty);
2160             fjz2             = _mm_add_ps(fjz2,tz);
2161             
2162             /**************************
2163              * CALCULATE INTERACTIONS *
2164              **************************/
2165
2166             r33              = _mm_mul_ps(rsq33,rinv33);
2167             r33              = _mm_andnot_ps(dummy_mask,r33);
2168
2169             /* EWALD ELECTROSTATICS */
2170
2171             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2172             ewrt             = _mm_mul_ps(r33,ewtabscale);
2173             ewitab           = _mm_cvttps_epi32(ewrt);
2174             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2175             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2176                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2177                                          &ewtabF,&ewtabFn);
2178             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2179             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2180
2181             fscal            = felec;
2182
2183             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2184
2185             /* Calculate temporary vectorial force */
2186             tx               = _mm_mul_ps(fscal,dx33);
2187             ty               = _mm_mul_ps(fscal,dy33);
2188             tz               = _mm_mul_ps(fscal,dz33);
2189
2190             /* Update vectorial force */
2191             fix3             = _mm_add_ps(fix3,tx);
2192             fiy3             = _mm_add_ps(fiy3,ty);
2193             fiz3             = _mm_add_ps(fiz3,tz);
2194
2195             fjx3             = _mm_add_ps(fjx3,tx);
2196             fjy3             = _mm_add_ps(fjy3,ty);
2197             fjz3             = _mm_add_ps(fjz3,tz);
2198             
2199             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2200             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2201             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2202             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2203
2204             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2205                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2206
2207             /* Inner loop uses 333 flops */
2208         }
2209
2210         /* End of innermost loop */
2211
2212         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2213                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2214
2215         /* Increment number of inner iterations */
2216         inneriter                  += j_index_end - j_index_start;
2217
2218         /* Outer loop uses 18 flops */
2219     }
2220
2221     /* Increment number of outer iterations */
2222     outeriter        += nri;
2223
2224     /* Update outer/inner flops */
2225
2226     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*333);
2227 }