Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwNone_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            None
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     __m128i          ewitab;
107     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108     real             *ewtab;
109     __m128           dummy_mask,cutoff_mask;
110     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111     __m128           one     = _mm_set1_ps(1.0);
112     __m128           two     = _mm_set1_ps(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm_set1_ps(fr->epsfac);
125     charge           = mdatoms->chargeA;
126
127     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
128     ewtab            = fr->ic->tabq_coul_FDV0;
129     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
130     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
135     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
136     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
137
138     jq0              = _mm_set1_ps(charge[inr+0]);
139     jq1              = _mm_set1_ps(charge[inr+1]);
140     jq2              = _mm_set1_ps(charge[inr+2]);
141     qq00             = _mm_mul_ps(iq0,jq0);
142     qq01             = _mm_mul_ps(iq0,jq1);
143     qq02             = _mm_mul_ps(iq0,jq2);
144     qq10             = _mm_mul_ps(iq1,jq0);
145     qq11             = _mm_mul_ps(iq1,jq1);
146     qq12             = _mm_mul_ps(iq1,jq2);
147     qq20             = _mm_mul_ps(iq2,jq0);
148     qq21             = _mm_mul_ps(iq2,jq1);
149     qq22             = _mm_mul_ps(iq2,jq2);
150
151     /* Avoid stupid compiler warnings */
152     jnrA = jnrB = jnrC = jnrD = 0;
153     j_coord_offsetA = 0;
154     j_coord_offsetB = 0;
155     j_coord_offsetC = 0;
156     j_coord_offsetD = 0;
157
158     outeriter        = 0;
159     inneriter        = 0;
160
161     for(iidx=0;iidx<4*DIM;iidx++)
162     {
163         scratch[iidx] = 0.0;
164     }  
165
166     /* Start outer loop over neighborlists */
167     for(iidx=0; iidx<nri; iidx++)
168     {
169         /* Load shift vector for this list */
170         i_shift_offset   = DIM*shiftidx[iidx];
171
172         /* Load limits for loop over neighbors */
173         j_index_start    = jindex[iidx];
174         j_index_end      = jindex[iidx+1];
175
176         /* Get outer coordinate index */
177         inr              = iinr[iidx];
178         i_coord_offset   = DIM*inr;
179
180         /* Load i particle coords and add shift vector */
181         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
182                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
183         
184         fix0             = _mm_setzero_ps();
185         fiy0             = _mm_setzero_ps();
186         fiz0             = _mm_setzero_ps();
187         fix1             = _mm_setzero_ps();
188         fiy1             = _mm_setzero_ps();
189         fiz1             = _mm_setzero_ps();
190         fix2             = _mm_setzero_ps();
191         fiy2             = _mm_setzero_ps();
192         fiz2             = _mm_setzero_ps();
193
194         /* Reset potential sums */
195         velecsum         = _mm_setzero_ps();
196
197         /* Start inner kernel loop */
198         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
199         {
200
201             /* Get j neighbor index, and coordinate index */
202             jnrA             = jjnr[jidx];
203             jnrB             = jjnr[jidx+1];
204             jnrC             = jjnr[jidx+2];
205             jnrD             = jjnr[jidx+3];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208             j_coord_offsetC  = DIM*jnrC;
209             j_coord_offsetD  = DIM*jnrD;
210
211             /* load j atom coordinates */
212             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
213                                               x+j_coord_offsetC,x+j_coord_offsetD,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215
216             /* Calculate displacement vector */
217             dx00             = _mm_sub_ps(ix0,jx0);
218             dy00             = _mm_sub_ps(iy0,jy0);
219             dz00             = _mm_sub_ps(iz0,jz0);
220             dx01             = _mm_sub_ps(ix0,jx1);
221             dy01             = _mm_sub_ps(iy0,jy1);
222             dz01             = _mm_sub_ps(iz0,jz1);
223             dx02             = _mm_sub_ps(ix0,jx2);
224             dy02             = _mm_sub_ps(iy0,jy2);
225             dz02             = _mm_sub_ps(iz0,jz2);
226             dx10             = _mm_sub_ps(ix1,jx0);
227             dy10             = _mm_sub_ps(iy1,jy0);
228             dz10             = _mm_sub_ps(iz1,jz0);
229             dx11             = _mm_sub_ps(ix1,jx1);
230             dy11             = _mm_sub_ps(iy1,jy1);
231             dz11             = _mm_sub_ps(iz1,jz1);
232             dx12             = _mm_sub_ps(ix1,jx2);
233             dy12             = _mm_sub_ps(iy1,jy2);
234             dz12             = _mm_sub_ps(iz1,jz2);
235             dx20             = _mm_sub_ps(ix2,jx0);
236             dy20             = _mm_sub_ps(iy2,jy0);
237             dz20             = _mm_sub_ps(iz2,jz0);
238             dx21             = _mm_sub_ps(ix2,jx1);
239             dy21             = _mm_sub_ps(iy2,jy1);
240             dz21             = _mm_sub_ps(iz2,jz1);
241             dx22             = _mm_sub_ps(ix2,jx2);
242             dy22             = _mm_sub_ps(iy2,jy2);
243             dz22             = _mm_sub_ps(iz2,jz2);
244
245             /* Calculate squared distance and things based on it */
246             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
247             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
248             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
249             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
250             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
251             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
252             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
253             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
254             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
255
256             rinv00           = gmx_mm_invsqrt_ps(rsq00);
257             rinv01           = gmx_mm_invsqrt_ps(rsq01);
258             rinv02           = gmx_mm_invsqrt_ps(rsq02);
259             rinv10           = gmx_mm_invsqrt_ps(rsq10);
260             rinv11           = gmx_mm_invsqrt_ps(rsq11);
261             rinv12           = gmx_mm_invsqrt_ps(rsq12);
262             rinv20           = gmx_mm_invsqrt_ps(rsq20);
263             rinv21           = gmx_mm_invsqrt_ps(rsq21);
264             rinv22           = gmx_mm_invsqrt_ps(rsq22);
265
266             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
267             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
268             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
269             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
270             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
271             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
272             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
273             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
274             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
275
276             fjx0             = _mm_setzero_ps();
277             fjy0             = _mm_setzero_ps();
278             fjz0             = _mm_setzero_ps();
279             fjx1             = _mm_setzero_ps();
280             fjy1             = _mm_setzero_ps();
281             fjz1             = _mm_setzero_ps();
282             fjx2             = _mm_setzero_ps();
283             fjy2             = _mm_setzero_ps();
284             fjz2             = _mm_setzero_ps();
285
286             /**************************
287              * CALCULATE INTERACTIONS *
288              **************************/
289
290             r00              = _mm_mul_ps(rsq00,rinv00);
291
292             /* EWALD ELECTROSTATICS */
293
294             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
295             ewrt             = _mm_mul_ps(r00,ewtabscale);
296             ewitab           = _mm_cvttps_epi32(ewrt);
297             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
298             ewitab           = _mm_slli_epi32(ewitab,2);
299             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
300             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
301             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
302             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
303             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
304             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
305             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
306             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
307             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
308
309             /* Update potential sum for this i atom from the interaction with this j atom. */
310             velecsum         = _mm_add_ps(velecsum,velec);
311
312             fscal            = felec;
313
314             /* Calculate temporary vectorial force */
315             tx               = _mm_mul_ps(fscal,dx00);
316             ty               = _mm_mul_ps(fscal,dy00);
317             tz               = _mm_mul_ps(fscal,dz00);
318
319             /* Update vectorial force */
320             fix0             = _mm_add_ps(fix0,tx);
321             fiy0             = _mm_add_ps(fiy0,ty);
322             fiz0             = _mm_add_ps(fiz0,tz);
323
324             fjx0             = _mm_add_ps(fjx0,tx);
325             fjy0             = _mm_add_ps(fjy0,ty);
326             fjz0             = _mm_add_ps(fjz0,tz);
327             
328             /**************************
329              * CALCULATE INTERACTIONS *
330              **************************/
331
332             r01              = _mm_mul_ps(rsq01,rinv01);
333
334             /* EWALD ELECTROSTATICS */
335
336             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
337             ewrt             = _mm_mul_ps(r01,ewtabscale);
338             ewitab           = _mm_cvttps_epi32(ewrt);
339             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
340             ewitab           = _mm_slli_epi32(ewitab,2);
341             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
342             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
343             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
344             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
345             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
346             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
347             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
348             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
349             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
350
351             /* Update potential sum for this i atom from the interaction with this j atom. */
352             velecsum         = _mm_add_ps(velecsum,velec);
353
354             fscal            = felec;
355
356             /* Calculate temporary vectorial force */
357             tx               = _mm_mul_ps(fscal,dx01);
358             ty               = _mm_mul_ps(fscal,dy01);
359             tz               = _mm_mul_ps(fscal,dz01);
360
361             /* Update vectorial force */
362             fix0             = _mm_add_ps(fix0,tx);
363             fiy0             = _mm_add_ps(fiy0,ty);
364             fiz0             = _mm_add_ps(fiz0,tz);
365
366             fjx1             = _mm_add_ps(fjx1,tx);
367             fjy1             = _mm_add_ps(fjy1,ty);
368             fjz1             = _mm_add_ps(fjz1,tz);
369             
370             /**************************
371              * CALCULATE INTERACTIONS *
372              **************************/
373
374             r02              = _mm_mul_ps(rsq02,rinv02);
375
376             /* EWALD ELECTROSTATICS */
377
378             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379             ewrt             = _mm_mul_ps(r02,ewtabscale);
380             ewitab           = _mm_cvttps_epi32(ewrt);
381             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
382             ewitab           = _mm_slli_epi32(ewitab,2);
383             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
384             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
385             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
386             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
387             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
388             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
389             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
390             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
391             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velecsum         = _mm_add_ps(velecsum,velec);
395
396             fscal            = felec;
397
398             /* Calculate temporary vectorial force */
399             tx               = _mm_mul_ps(fscal,dx02);
400             ty               = _mm_mul_ps(fscal,dy02);
401             tz               = _mm_mul_ps(fscal,dz02);
402
403             /* Update vectorial force */
404             fix0             = _mm_add_ps(fix0,tx);
405             fiy0             = _mm_add_ps(fiy0,ty);
406             fiz0             = _mm_add_ps(fiz0,tz);
407
408             fjx2             = _mm_add_ps(fjx2,tx);
409             fjy2             = _mm_add_ps(fjy2,ty);
410             fjz2             = _mm_add_ps(fjz2,tz);
411             
412             /**************************
413              * CALCULATE INTERACTIONS *
414              **************************/
415
416             r10              = _mm_mul_ps(rsq10,rinv10);
417
418             /* EWALD ELECTROSTATICS */
419
420             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
421             ewrt             = _mm_mul_ps(r10,ewtabscale);
422             ewitab           = _mm_cvttps_epi32(ewrt);
423             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
424             ewitab           = _mm_slli_epi32(ewitab,2);
425             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
426             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
427             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
428             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
429             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
430             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
431             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
432             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
433             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velecsum         = _mm_add_ps(velecsum,velec);
437
438             fscal            = felec;
439
440             /* Calculate temporary vectorial force */
441             tx               = _mm_mul_ps(fscal,dx10);
442             ty               = _mm_mul_ps(fscal,dy10);
443             tz               = _mm_mul_ps(fscal,dz10);
444
445             /* Update vectorial force */
446             fix1             = _mm_add_ps(fix1,tx);
447             fiy1             = _mm_add_ps(fiy1,ty);
448             fiz1             = _mm_add_ps(fiz1,tz);
449
450             fjx0             = _mm_add_ps(fjx0,tx);
451             fjy0             = _mm_add_ps(fjy0,ty);
452             fjz0             = _mm_add_ps(fjz0,tz);
453             
454             /**************************
455              * CALCULATE INTERACTIONS *
456              **************************/
457
458             r11              = _mm_mul_ps(rsq11,rinv11);
459
460             /* EWALD ELECTROSTATICS */
461
462             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
463             ewrt             = _mm_mul_ps(r11,ewtabscale);
464             ewitab           = _mm_cvttps_epi32(ewrt);
465             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
466             ewitab           = _mm_slli_epi32(ewitab,2);
467             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
468             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
469             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
470             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
471             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
472             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
473             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
474             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
475             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velecsum         = _mm_add_ps(velecsum,velec);
479
480             fscal            = felec;
481
482             /* Calculate temporary vectorial force */
483             tx               = _mm_mul_ps(fscal,dx11);
484             ty               = _mm_mul_ps(fscal,dy11);
485             tz               = _mm_mul_ps(fscal,dz11);
486
487             /* Update vectorial force */
488             fix1             = _mm_add_ps(fix1,tx);
489             fiy1             = _mm_add_ps(fiy1,ty);
490             fiz1             = _mm_add_ps(fiz1,tz);
491
492             fjx1             = _mm_add_ps(fjx1,tx);
493             fjy1             = _mm_add_ps(fjy1,ty);
494             fjz1             = _mm_add_ps(fjz1,tz);
495             
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             r12              = _mm_mul_ps(rsq12,rinv12);
501
502             /* EWALD ELECTROSTATICS */
503
504             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
505             ewrt             = _mm_mul_ps(r12,ewtabscale);
506             ewitab           = _mm_cvttps_epi32(ewrt);
507             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
508             ewitab           = _mm_slli_epi32(ewitab,2);
509             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
510             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
511             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
512             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
513             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
514             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
515             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
516             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
517             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
518
519             /* Update potential sum for this i atom from the interaction with this j atom. */
520             velecsum         = _mm_add_ps(velecsum,velec);
521
522             fscal            = felec;
523
524             /* Calculate temporary vectorial force */
525             tx               = _mm_mul_ps(fscal,dx12);
526             ty               = _mm_mul_ps(fscal,dy12);
527             tz               = _mm_mul_ps(fscal,dz12);
528
529             /* Update vectorial force */
530             fix1             = _mm_add_ps(fix1,tx);
531             fiy1             = _mm_add_ps(fiy1,ty);
532             fiz1             = _mm_add_ps(fiz1,tz);
533
534             fjx2             = _mm_add_ps(fjx2,tx);
535             fjy2             = _mm_add_ps(fjy2,ty);
536             fjz2             = _mm_add_ps(fjz2,tz);
537             
538             /**************************
539              * CALCULATE INTERACTIONS *
540              **************************/
541
542             r20              = _mm_mul_ps(rsq20,rinv20);
543
544             /* EWALD ELECTROSTATICS */
545
546             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
547             ewrt             = _mm_mul_ps(r20,ewtabscale);
548             ewitab           = _mm_cvttps_epi32(ewrt);
549             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
550             ewitab           = _mm_slli_epi32(ewitab,2);
551             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
552             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
553             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
554             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
555             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
556             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
557             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
558             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
559             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
560
561             /* Update potential sum for this i atom from the interaction with this j atom. */
562             velecsum         = _mm_add_ps(velecsum,velec);
563
564             fscal            = felec;
565
566             /* Calculate temporary vectorial force */
567             tx               = _mm_mul_ps(fscal,dx20);
568             ty               = _mm_mul_ps(fscal,dy20);
569             tz               = _mm_mul_ps(fscal,dz20);
570
571             /* Update vectorial force */
572             fix2             = _mm_add_ps(fix2,tx);
573             fiy2             = _mm_add_ps(fiy2,ty);
574             fiz2             = _mm_add_ps(fiz2,tz);
575
576             fjx0             = _mm_add_ps(fjx0,tx);
577             fjy0             = _mm_add_ps(fjy0,ty);
578             fjz0             = _mm_add_ps(fjz0,tz);
579             
580             /**************************
581              * CALCULATE INTERACTIONS *
582              **************************/
583
584             r21              = _mm_mul_ps(rsq21,rinv21);
585
586             /* EWALD ELECTROSTATICS */
587
588             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
589             ewrt             = _mm_mul_ps(r21,ewtabscale);
590             ewitab           = _mm_cvttps_epi32(ewrt);
591             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
592             ewitab           = _mm_slli_epi32(ewitab,2);
593             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
594             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
595             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
596             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
597             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
598             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
599             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
600             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
601             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
602
603             /* Update potential sum for this i atom from the interaction with this j atom. */
604             velecsum         = _mm_add_ps(velecsum,velec);
605
606             fscal            = felec;
607
608             /* Calculate temporary vectorial force */
609             tx               = _mm_mul_ps(fscal,dx21);
610             ty               = _mm_mul_ps(fscal,dy21);
611             tz               = _mm_mul_ps(fscal,dz21);
612
613             /* Update vectorial force */
614             fix2             = _mm_add_ps(fix2,tx);
615             fiy2             = _mm_add_ps(fiy2,ty);
616             fiz2             = _mm_add_ps(fiz2,tz);
617
618             fjx1             = _mm_add_ps(fjx1,tx);
619             fjy1             = _mm_add_ps(fjy1,ty);
620             fjz1             = _mm_add_ps(fjz1,tz);
621             
622             /**************************
623              * CALCULATE INTERACTIONS *
624              **************************/
625
626             r22              = _mm_mul_ps(rsq22,rinv22);
627
628             /* EWALD ELECTROSTATICS */
629
630             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
631             ewrt             = _mm_mul_ps(r22,ewtabscale);
632             ewitab           = _mm_cvttps_epi32(ewrt);
633             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
634             ewitab           = _mm_slli_epi32(ewitab,2);
635             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
636             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
637             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
638             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
639             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
640             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
641             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
642             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
643             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
644
645             /* Update potential sum for this i atom from the interaction with this j atom. */
646             velecsum         = _mm_add_ps(velecsum,velec);
647
648             fscal            = felec;
649
650             /* Calculate temporary vectorial force */
651             tx               = _mm_mul_ps(fscal,dx22);
652             ty               = _mm_mul_ps(fscal,dy22);
653             tz               = _mm_mul_ps(fscal,dz22);
654
655             /* Update vectorial force */
656             fix2             = _mm_add_ps(fix2,tx);
657             fiy2             = _mm_add_ps(fiy2,ty);
658             fiz2             = _mm_add_ps(fiz2,tz);
659
660             fjx2             = _mm_add_ps(fjx2,tx);
661             fjy2             = _mm_add_ps(fjy2,ty);
662             fjz2             = _mm_add_ps(fjz2,tz);
663             
664             fjptrA             = f+j_coord_offsetA;
665             fjptrB             = f+j_coord_offsetB;
666             fjptrC             = f+j_coord_offsetC;
667             fjptrD             = f+j_coord_offsetD;
668
669             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
670                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
671
672             /* Inner loop uses 369 flops */
673         }
674
675         if(jidx<j_index_end)
676         {
677
678             /* Get j neighbor index, and coordinate index */
679             jnrlistA         = jjnr[jidx];
680             jnrlistB         = jjnr[jidx+1];
681             jnrlistC         = jjnr[jidx+2];
682             jnrlistD         = jjnr[jidx+3];
683             /* Sign of each element will be negative for non-real atoms.
684              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
685              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
686              */
687             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
688             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
689             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
690             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
691             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
692             j_coord_offsetA  = DIM*jnrA;
693             j_coord_offsetB  = DIM*jnrB;
694             j_coord_offsetC  = DIM*jnrC;
695             j_coord_offsetD  = DIM*jnrD;
696
697             /* load j atom coordinates */
698             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
699                                               x+j_coord_offsetC,x+j_coord_offsetD,
700                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
701
702             /* Calculate displacement vector */
703             dx00             = _mm_sub_ps(ix0,jx0);
704             dy00             = _mm_sub_ps(iy0,jy0);
705             dz00             = _mm_sub_ps(iz0,jz0);
706             dx01             = _mm_sub_ps(ix0,jx1);
707             dy01             = _mm_sub_ps(iy0,jy1);
708             dz01             = _mm_sub_ps(iz0,jz1);
709             dx02             = _mm_sub_ps(ix0,jx2);
710             dy02             = _mm_sub_ps(iy0,jy2);
711             dz02             = _mm_sub_ps(iz0,jz2);
712             dx10             = _mm_sub_ps(ix1,jx0);
713             dy10             = _mm_sub_ps(iy1,jy0);
714             dz10             = _mm_sub_ps(iz1,jz0);
715             dx11             = _mm_sub_ps(ix1,jx1);
716             dy11             = _mm_sub_ps(iy1,jy1);
717             dz11             = _mm_sub_ps(iz1,jz1);
718             dx12             = _mm_sub_ps(ix1,jx2);
719             dy12             = _mm_sub_ps(iy1,jy2);
720             dz12             = _mm_sub_ps(iz1,jz2);
721             dx20             = _mm_sub_ps(ix2,jx0);
722             dy20             = _mm_sub_ps(iy2,jy0);
723             dz20             = _mm_sub_ps(iz2,jz0);
724             dx21             = _mm_sub_ps(ix2,jx1);
725             dy21             = _mm_sub_ps(iy2,jy1);
726             dz21             = _mm_sub_ps(iz2,jz1);
727             dx22             = _mm_sub_ps(ix2,jx2);
728             dy22             = _mm_sub_ps(iy2,jy2);
729             dz22             = _mm_sub_ps(iz2,jz2);
730
731             /* Calculate squared distance and things based on it */
732             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
733             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
734             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
735             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
736             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
737             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
738             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
739             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
740             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
741
742             rinv00           = gmx_mm_invsqrt_ps(rsq00);
743             rinv01           = gmx_mm_invsqrt_ps(rsq01);
744             rinv02           = gmx_mm_invsqrt_ps(rsq02);
745             rinv10           = gmx_mm_invsqrt_ps(rsq10);
746             rinv11           = gmx_mm_invsqrt_ps(rsq11);
747             rinv12           = gmx_mm_invsqrt_ps(rsq12);
748             rinv20           = gmx_mm_invsqrt_ps(rsq20);
749             rinv21           = gmx_mm_invsqrt_ps(rsq21);
750             rinv22           = gmx_mm_invsqrt_ps(rsq22);
751
752             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
753             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
754             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
755             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
756             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
757             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
758             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
759             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
760             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
761
762             fjx0             = _mm_setzero_ps();
763             fjy0             = _mm_setzero_ps();
764             fjz0             = _mm_setzero_ps();
765             fjx1             = _mm_setzero_ps();
766             fjy1             = _mm_setzero_ps();
767             fjz1             = _mm_setzero_ps();
768             fjx2             = _mm_setzero_ps();
769             fjy2             = _mm_setzero_ps();
770             fjz2             = _mm_setzero_ps();
771
772             /**************************
773              * CALCULATE INTERACTIONS *
774              **************************/
775
776             r00              = _mm_mul_ps(rsq00,rinv00);
777             r00              = _mm_andnot_ps(dummy_mask,r00);
778
779             /* EWALD ELECTROSTATICS */
780
781             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
782             ewrt             = _mm_mul_ps(r00,ewtabscale);
783             ewitab           = _mm_cvttps_epi32(ewrt);
784             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
785             ewitab           = _mm_slli_epi32(ewitab,2);
786             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
787             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
788             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
789             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
790             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
791             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
792             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
793             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
794             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
795
796             /* Update potential sum for this i atom from the interaction with this j atom. */
797             velec            = _mm_andnot_ps(dummy_mask,velec);
798             velecsum         = _mm_add_ps(velecsum,velec);
799
800             fscal            = felec;
801
802             fscal            = _mm_andnot_ps(dummy_mask,fscal);
803
804             /* Calculate temporary vectorial force */
805             tx               = _mm_mul_ps(fscal,dx00);
806             ty               = _mm_mul_ps(fscal,dy00);
807             tz               = _mm_mul_ps(fscal,dz00);
808
809             /* Update vectorial force */
810             fix0             = _mm_add_ps(fix0,tx);
811             fiy0             = _mm_add_ps(fiy0,ty);
812             fiz0             = _mm_add_ps(fiz0,tz);
813
814             fjx0             = _mm_add_ps(fjx0,tx);
815             fjy0             = _mm_add_ps(fjy0,ty);
816             fjz0             = _mm_add_ps(fjz0,tz);
817             
818             /**************************
819              * CALCULATE INTERACTIONS *
820              **************************/
821
822             r01              = _mm_mul_ps(rsq01,rinv01);
823             r01              = _mm_andnot_ps(dummy_mask,r01);
824
825             /* EWALD ELECTROSTATICS */
826
827             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
828             ewrt             = _mm_mul_ps(r01,ewtabscale);
829             ewitab           = _mm_cvttps_epi32(ewrt);
830             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
831             ewitab           = _mm_slli_epi32(ewitab,2);
832             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
833             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
834             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
835             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
836             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
837             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
838             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
839             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
840             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
841
842             /* Update potential sum for this i atom from the interaction with this j atom. */
843             velec            = _mm_andnot_ps(dummy_mask,velec);
844             velecsum         = _mm_add_ps(velecsum,velec);
845
846             fscal            = felec;
847
848             fscal            = _mm_andnot_ps(dummy_mask,fscal);
849
850             /* Calculate temporary vectorial force */
851             tx               = _mm_mul_ps(fscal,dx01);
852             ty               = _mm_mul_ps(fscal,dy01);
853             tz               = _mm_mul_ps(fscal,dz01);
854
855             /* Update vectorial force */
856             fix0             = _mm_add_ps(fix0,tx);
857             fiy0             = _mm_add_ps(fiy0,ty);
858             fiz0             = _mm_add_ps(fiz0,tz);
859
860             fjx1             = _mm_add_ps(fjx1,tx);
861             fjy1             = _mm_add_ps(fjy1,ty);
862             fjz1             = _mm_add_ps(fjz1,tz);
863             
864             /**************************
865              * CALCULATE INTERACTIONS *
866              **************************/
867
868             r02              = _mm_mul_ps(rsq02,rinv02);
869             r02              = _mm_andnot_ps(dummy_mask,r02);
870
871             /* EWALD ELECTROSTATICS */
872
873             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
874             ewrt             = _mm_mul_ps(r02,ewtabscale);
875             ewitab           = _mm_cvttps_epi32(ewrt);
876             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
877             ewitab           = _mm_slli_epi32(ewitab,2);
878             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
879             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
880             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
881             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
882             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
883             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
884             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
885             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
886             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
887
888             /* Update potential sum for this i atom from the interaction with this j atom. */
889             velec            = _mm_andnot_ps(dummy_mask,velec);
890             velecsum         = _mm_add_ps(velecsum,velec);
891
892             fscal            = felec;
893
894             fscal            = _mm_andnot_ps(dummy_mask,fscal);
895
896             /* Calculate temporary vectorial force */
897             tx               = _mm_mul_ps(fscal,dx02);
898             ty               = _mm_mul_ps(fscal,dy02);
899             tz               = _mm_mul_ps(fscal,dz02);
900
901             /* Update vectorial force */
902             fix0             = _mm_add_ps(fix0,tx);
903             fiy0             = _mm_add_ps(fiy0,ty);
904             fiz0             = _mm_add_ps(fiz0,tz);
905
906             fjx2             = _mm_add_ps(fjx2,tx);
907             fjy2             = _mm_add_ps(fjy2,ty);
908             fjz2             = _mm_add_ps(fjz2,tz);
909             
910             /**************************
911              * CALCULATE INTERACTIONS *
912              **************************/
913
914             r10              = _mm_mul_ps(rsq10,rinv10);
915             r10              = _mm_andnot_ps(dummy_mask,r10);
916
917             /* EWALD ELECTROSTATICS */
918
919             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
920             ewrt             = _mm_mul_ps(r10,ewtabscale);
921             ewitab           = _mm_cvttps_epi32(ewrt);
922             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
923             ewitab           = _mm_slli_epi32(ewitab,2);
924             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
925             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
926             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
927             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
928             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
929             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
930             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
931             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
932             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
933
934             /* Update potential sum for this i atom from the interaction with this j atom. */
935             velec            = _mm_andnot_ps(dummy_mask,velec);
936             velecsum         = _mm_add_ps(velecsum,velec);
937
938             fscal            = felec;
939
940             fscal            = _mm_andnot_ps(dummy_mask,fscal);
941
942             /* Calculate temporary vectorial force */
943             tx               = _mm_mul_ps(fscal,dx10);
944             ty               = _mm_mul_ps(fscal,dy10);
945             tz               = _mm_mul_ps(fscal,dz10);
946
947             /* Update vectorial force */
948             fix1             = _mm_add_ps(fix1,tx);
949             fiy1             = _mm_add_ps(fiy1,ty);
950             fiz1             = _mm_add_ps(fiz1,tz);
951
952             fjx0             = _mm_add_ps(fjx0,tx);
953             fjy0             = _mm_add_ps(fjy0,ty);
954             fjz0             = _mm_add_ps(fjz0,tz);
955             
956             /**************************
957              * CALCULATE INTERACTIONS *
958              **************************/
959
960             r11              = _mm_mul_ps(rsq11,rinv11);
961             r11              = _mm_andnot_ps(dummy_mask,r11);
962
963             /* EWALD ELECTROSTATICS */
964
965             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
966             ewrt             = _mm_mul_ps(r11,ewtabscale);
967             ewitab           = _mm_cvttps_epi32(ewrt);
968             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
969             ewitab           = _mm_slli_epi32(ewitab,2);
970             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
971             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
972             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
973             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
974             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
975             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
976             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
977             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
978             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
979
980             /* Update potential sum for this i atom from the interaction with this j atom. */
981             velec            = _mm_andnot_ps(dummy_mask,velec);
982             velecsum         = _mm_add_ps(velecsum,velec);
983
984             fscal            = felec;
985
986             fscal            = _mm_andnot_ps(dummy_mask,fscal);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm_mul_ps(fscal,dx11);
990             ty               = _mm_mul_ps(fscal,dy11);
991             tz               = _mm_mul_ps(fscal,dz11);
992
993             /* Update vectorial force */
994             fix1             = _mm_add_ps(fix1,tx);
995             fiy1             = _mm_add_ps(fiy1,ty);
996             fiz1             = _mm_add_ps(fiz1,tz);
997
998             fjx1             = _mm_add_ps(fjx1,tx);
999             fjy1             = _mm_add_ps(fjy1,ty);
1000             fjz1             = _mm_add_ps(fjz1,tz);
1001             
1002             /**************************
1003              * CALCULATE INTERACTIONS *
1004              **************************/
1005
1006             r12              = _mm_mul_ps(rsq12,rinv12);
1007             r12              = _mm_andnot_ps(dummy_mask,r12);
1008
1009             /* EWALD ELECTROSTATICS */
1010
1011             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012             ewrt             = _mm_mul_ps(r12,ewtabscale);
1013             ewitab           = _mm_cvttps_epi32(ewrt);
1014             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1015             ewitab           = _mm_slli_epi32(ewitab,2);
1016             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1017             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1018             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1019             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1020             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1021             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1022             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1023             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1024             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1025
1026             /* Update potential sum for this i atom from the interaction with this j atom. */
1027             velec            = _mm_andnot_ps(dummy_mask,velec);
1028             velecsum         = _mm_add_ps(velecsum,velec);
1029
1030             fscal            = felec;
1031
1032             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1033
1034             /* Calculate temporary vectorial force */
1035             tx               = _mm_mul_ps(fscal,dx12);
1036             ty               = _mm_mul_ps(fscal,dy12);
1037             tz               = _mm_mul_ps(fscal,dz12);
1038
1039             /* Update vectorial force */
1040             fix1             = _mm_add_ps(fix1,tx);
1041             fiy1             = _mm_add_ps(fiy1,ty);
1042             fiz1             = _mm_add_ps(fiz1,tz);
1043
1044             fjx2             = _mm_add_ps(fjx2,tx);
1045             fjy2             = _mm_add_ps(fjy2,ty);
1046             fjz2             = _mm_add_ps(fjz2,tz);
1047             
1048             /**************************
1049              * CALCULATE INTERACTIONS *
1050              **************************/
1051
1052             r20              = _mm_mul_ps(rsq20,rinv20);
1053             r20              = _mm_andnot_ps(dummy_mask,r20);
1054
1055             /* EWALD ELECTROSTATICS */
1056
1057             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058             ewrt             = _mm_mul_ps(r20,ewtabscale);
1059             ewitab           = _mm_cvttps_epi32(ewrt);
1060             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1061             ewitab           = _mm_slli_epi32(ewitab,2);
1062             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1063             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1064             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1065             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1066             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1067             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1068             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1069             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1070             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1071
1072             /* Update potential sum for this i atom from the interaction with this j atom. */
1073             velec            = _mm_andnot_ps(dummy_mask,velec);
1074             velecsum         = _mm_add_ps(velecsum,velec);
1075
1076             fscal            = felec;
1077
1078             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1079
1080             /* Calculate temporary vectorial force */
1081             tx               = _mm_mul_ps(fscal,dx20);
1082             ty               = _mm_mul_ps(fscal,dy20);
1083             tz               = _mm_mul_ps(fscal,dz20);
1084
1085             /* Update vectorial force */
1086             fix2             = _mm_add_ps(fix2,tx);
1087             fiy2             = _mm_add_ps(fiy2,ty);
1088             fiz2             = _mm_add_ps(fiz2,tz);
1089
1090             fjx0             = _mm_add_ps(fjx0,tx);
1091             fjy0             = _mm_add_ps(fjy0,ty);
1092             fjz0             = _mm_add_ps(fjz0,tz);
1093             
1094             /**************************
1095              * CALCULATE INTERACTIONS *
1096              **************************/
1097
1098             r21              = _mm_mul_ps(rsq21,rinv21);
1099             r21              = _mm_andnot_ps(dummy_mask,r21);
1100
1101             /* EWALD ELECTROSTATICS */
1102
1103             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104             ewrt             = _mm_mul_ps(r21,ewtabscale);
1105             ewitab           = _mm_cvttps_epi32(ewrt);
1106             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1107             ewitab           = _mm_slli_epi32(ewitab,2);
1108             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1109             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1110             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1111             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1112             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1113             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1114             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1115             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1116             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1117
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _mm_andnot_ps(dummy_mask,velec);
1120             velecsum         = _mm_add_ps(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1125
1126             /* Calculate temporary vectorial force */
1127             tx               = _mm_mul_ps(fscal,dx21);
1128             ty               = _mm_mul_ps(fscal,dy21);
1129             tz               = _mm_mul_ps(fscal,dz21);
1130
1131             /* Update vectorial force */
1132             fix2             = _mm_add_ps(fix2,tx);
1133             fiy2             = _mm_add_ps(fiy2,ty);
1134             fiz2             = _mm_add_ps(fiz2,tz);
1135
1136             fjx1             = _mm_add_ps(fjx1,tx);
1137             fjy1             = _mm_add_ps(fjy1,ty);
1138             fjz1             = _mm_add_ps(fjz1,tz);
1139             
1140             /**************************
1141              * CALCULATE INTERACTIONS *
1142              **************************/
1143
1144             r22              = _mm_mul_ps(rsq22,rinv22);
1145             r22              = _mm_andnot_ps(dummy_mask,r22);
1146
1147             /* EWALD ELECTROSTATICS */
1148
1149             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150             ewrt             = _mm_mul_ps(r22,ewtabscale);
1151             ewitab           = _mm_cvttps_epi32(ewrt);
1152             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1153             ewitab           = _mm_slli_epi32(ewitab,2);
1154             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1155             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1156             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1157             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1158             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1159             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1160             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1161             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1162             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1163
1164             /* Update potential sum for this i atom from the interaction with this j atom. */
1165             velec            = _mm_andnot_ps(dummy_mask,velec);
1166             velecsum         = _mm_add_ps(velecsum,velec);
1167
1168             fscal            = felec;
1169
1170             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1171
1172             /* Calculate temporary vectorial force */
1173             tx               = _mm_mul_ps(fscal,dx22);
1174             ty               = _mm_mul_ps(fscal,dy22);
1175             tz               = _mm_mul_ps(fscal,dz22);
1176
1177             /* Update vectorial force */
1178             fix2             = _mm_add_ps(fix2,tx);
1179             fiy2             = _mm_add_ps(fiy2,ty);
1180             fiz2             = _mm_add_ps(fiz2,tz);
1181
1182             fjx2             = _mm_add_ps(fjx2,tx);
1183             fjy2             = _mm_add_ps(fjy2,ty);
1184             fjz2             = _mm_add_ps(fjz2,tz);
1185             
1186             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1187             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1188             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1189             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1190
1191             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1192                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1193
1194             /* Inner loop uses 378 flops */
1195         }
1196
1197         /* End of innermost loop */
1198
1199         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1200                                               f+i_coord_offset,fshift+i_shift_offset);
1201
1202         ggid                        = gid[iidx];
1203         /* Update potential energies */
1204         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1205
1206         /* Increment number of inner iterations */
1207         inneriter                  += j_index_end - j_index_start;
1208
1209         /* Outer loop uses 19 flops */
1210     }
1211
1212     /* Increment number of outer iterations */
1213     outeriter        += nri;
1214
1215     /* Update outer/inner flops */
1216
1217     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*378);
1218 }
1219 /*
1220  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single
1221  * Electrostatics interaction: Ewald
1222  * VdW interaction:            None
1223  * Geometry:                   Water3-Water3
1224  * Calculate force/pot:        Force
1225  */
1226 void
1227 nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single
1228                     (t_nblist                    * gmx_restrict       nlist,
1229                      rvec                        * gmx_restrict          xx,
1230                      rvec                        * gmx_restrict          ff,
1231                      t_forcerec                  * gmx_restrict          fr,
1232                      t_mdatoms                   * gmx_restrict     mdatoms,
1233                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1234                      t_nrnb                      * gmx_restrict        nrnb)
1235 {
1236     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1237      * just 0 for non-waters.
1238      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1239      * jnr indices corresponding to data put in the four positions in the SIMD register.
1240      */
1241     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1242     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1243     int              jnrA,jnrB,jnrC,jnrD;
1244     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1245     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1246     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1247     real             rcutoff_scalar;
1248     real             *shiftvec,*fshift,*x,*f;
1249     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1250     real             scratch[4*DIM];
1251     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1252     int              vdwioffset0;
1253     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1254     int              vdwioffset1;
1255     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1256     int              vdwioffset2;
1257     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1258     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1259     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1260     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1261     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1262     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1263     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1264     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1265     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1266     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1267     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1268     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1269     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1270     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1271     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1272     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1273     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1274     real             *charge;
1275     __m128i          ewitab;
1276     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1277     real             *ewtab;
1278     __m128           dummy_mask,cutoff_mask;
1279     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1280     __m128           one     = _mm_set1_ps(1.0);
1281     __m128           two     = _mm_set1_ps(2.0);
1282     x                = xx[0];
1283     f                = ff[0];
1284
1285     nri              = nlist->nri;
1286     iinr             = nlist->iinr;
1287     jindex           = nlist->jindex;
1288     jjnr             = nlist->jjnr;
1289     shiftidx         = nlist->shift;
1290     gid              = nlist->gid;
1291     shiftvec         = fr->shift_vec[0];
1292     fshift           = fr->fshift[0];
1293     facel            = _mm_set1_ps(fr->epsfac);
1294     charge           = mdatoms->chargeA;
1295
1296     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1297     ewtab            = fr->ic->tabq_coul_F;
1298     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1299     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1300
1301     /* Setup water-specific parameters */
1302     inr              = nlist->iinr[0];
1303     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1304     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1305     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1306
1307     jq0              = _mm_set1_ps(charge[inr+0]);
1308     jq1              = _mm_set1_ps(charge[inr+1]);
1309     jq2              = _mm_set1_ps(charge[inr+2]);
1310     qq00             = _mm_mul_ps(iq0,jq0);
1311     qq01             = _mm_mul_ps(iq0,jq1);
1312     qq02             = _mm_mul_ps(iq0,jq2);
1313     qq10             = _mm_mul_ps(iq1,jq0);
1314     qq11             = _mm_mul_ps(iq1,jq1);
1315     qq12             = _mm_mul_ps(iq1,jq2);
1316     qq20             = _mm_mul_ps(iq2,jq0);
1317     qq21             = _mm_mul_ps(iq2,jq1);
1318     qq22             = _mm_mul_ps(iq2,jq2);
1319
1320     /* Avoid stupid compiler warnings */
1321     jnrA = jnrB = jnrC = jnrD = 0;
1322     j_coord_offsetA = 0;
1323     j_coord_offsetB = 0;
1324     j_coord_offsetC = 0;
1325     j_coord_offsetD = 0;
1326
1327     outeriter        = 0;
1328     inneriter        = 0;
1329
1330     for(iidx=0;iidx<4*DIM;iidx++)
1331     {
1332         scratch[iidx] = 0.0;
1333     }  
1334
1335     /* Start outer loop over neighborlists */
1336     for(iidx=0; iidx<nri; iidx++)
1337     {
1338         /* Load shift vector for this list */
1339         i_shift_offset   = DIM*shiftidx[iidx];
1340
1341         /* Load limits for loop over neighbors */
1342         j_index_start    = jindex[iidx];
1343         j_index_end      = jindex[iidx+1];
1344
1345         /* Get outer coordinate index */
1346         inr              = iinr[iidx];
1347         i_coord_offset   = DIM*inr;
1348
1349         /* Load i particle coords and add shift vector */
1350         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1351                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1352         
1353         fix0             = _mm_setzero_ps();
1354         fiy0             = _mm_setzero_ps();
1355         fiz0             = _mm_setzero_ps();
1356         fix1             = _mm_setzero_ps();
1357         fiy1             = _mm_setzero_ps();
1358         fiz1             = _mm_setzero_ps();
1359         fix2             = _mm_setzero_ps();
1360         fiy2             = _mm_setzero_ps();
1361         fiz2             = _mm_setzero_ps();
1362
1363         /* Start inner kernel loop */
1364         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1365         {
1366
1367             /* Get j neighbor index, and coordinate index */
1368             jnrA             = jjnr[jidx];
1369             jnrB             = jjnr[jidx+1];
1370             jnrC             = jjnr[jidx+2];
1371             jnrD             = jjnr[jidx+3];
1372             j_coord_offsetA  = DIM*jnrA;
1373             j_coord_offsetB  = DIM*jnrB;
1374             j_coord_offsetC  = DIM*jnrC;
1375             j_coord_offsetD  = DIM*jnrD;
1376
1377             /* load j atom coordinates */
1378             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1379                                               x+j_coord_offsetC,x+j_coord_offsetD,
1380                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1381
1382             /* Calculate displacement vector */
1383             dx00             = _mm_sub_ps(ix0,jx0);
1384             dy00             = _mm_sub_ps(iy0,jy0);
1385             dz00             = _mm_sub_ps(iz0,jz0);
1386             dx01             = _mm_sub_ps(ix0,jx1);
1387             dy01             = _mm_sub_ps(iy0,jy1);
1388             dz01             = _mm_sub_ps(iz0,jz1);
1389             dx02             = _mm_sub_ps(ix0,jx2);
1390             dy02             = _mm_sub_ps(iy0,jy2);
1391             dz02             = _mm_sub_ps(iz0,jz2);
1392             dx10             = _mm_sub_ps(ix1,jx0);
1393             dy10             = _mm_sub_ps(iy1,jy0);
1394             dz10             = _mm_sub_ps(iz1,jz0);
1395             dx11             = _mm_sub_ps(ix1,jx1);
1396             dy11             = _mm_sub_ps(iy1,jy1);
1397             dz11             = _mm_sub_ps(iz1,jz1);
1398             dx12             = _mm_sub_ps(ix1,jx2);
1399             dy12             = _mm_sub_ps(iy1,jy2);
1400             dz12             = _mm_sub_ps(iz1,jz2);
1401             dx20             = _mm_sub_ps(ix2,jx0);
1402             dy20             = _mm_sub_ps(iy2,jy0);
1403             dz20             = _mm_sub_ps(iz2,jz0);
1404             dx21             = _mm_sub_ps(ix2,jx1);
1405             dy21             = _mm_sub_ps(iy2,jy1);
1406             dz21             = _mm_sub_ps(iz2,jz1);
1407             dx22             = _mm_sub_ps(ix2,jx2);
1408             dy22             = _mm_sub_ps(iy2,jy2);
1409             dz22             = _mm_sub_ps(iz2,jz2);
1410
1411             /* Calculate squared distance and things based on it */
1412             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1413             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1414             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1415             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1416             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1417             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1418             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1419             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1420             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1421
1422             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1423             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1424             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1425             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1426             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1427             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1428             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1429             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1430             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1431
1432             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1433             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1434             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1435             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1436             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1437             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1438             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1439             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1440             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1441
1442             fjx0             = _mm_setzero_ps();
1443             fjy0             = _mm_setzero_ps();
1444             fjz0             = _mm_setzero_ps();
1445             fjx1             = _mm_setzero_ps();
1446             fjy1             = _mm_setzero_ps();
1447             fjz1             = _mm_setzero_ps();
1448             fjx2             = _mm_setzero_ps();
1449             fjy2             = _mm_setzero_ps();
1450             fjz2             = _mm_setzero_ps();
1451
1452             /**************************
1453              * CALCULATE INTERACTIONS *
1454              **************************/
1455
1456             r00              = _mm_mul_ps(rsq00,rinv00);
1457
1458             /* EWALD ELECTROSTATICS */
1459
1460             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1461             ewrt             = _mm_mul_ps(r00,ewtabscale);
1462             ewitab           = _mm_cvttps_epi32(ewrt);
1463             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1464             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1465                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1466                                          &ewtabF,&ewtabFn);
1467             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1468             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1469
1470             fscal            = felec;
1471
1472             /* Calculate temporary vectorial force */
1473             tx               = _mm_mul_ps(fscal,dx00);
1474             ty               = _mm_mul_ps(fscal,dy00);
1475             tz               = _mm_mul_ps(fscal,dz00);
1476
1477             /* Update vectorial force */
1478             fix0             = _mm_add_ps(fix0,tx);
1479             fiy0             = _mm_add_ps(fiy0,ty);
1480             fiz0             = _mm_add_ps(fiz0,tz);
1481
1482             fjx0             = _mm_add_ps(fjx0,tx);
1483             fjy0             = _mm_add_ps(fjy0,ty);
1484             fjz0             = _mm_add_ps(fjz0,tz);
1485             
1486             /**************************
1487              * CALCULATE INTERACTIONS *
1488              **************************/
1489
1490             r01              = _mm_mul_ps(rsq01,rinv01);
1491
1492             /* EWALD ELECTROSTATICS */
1493
1494             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1495             ewrt             = _mm_mul_ps(r01,ewtabscale);
1496             ewitab           = _mm_cvttps_epi32(ewrt);
1497             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1498             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1499                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1500                                          &ewtabF,&ewtabFn);
1501             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1502             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1503
1504             fscal            = felec;
1505
1506             /* Calculate temporary vectorial force */
1507             tx               = _mm_mul_ps(fscal,dx01);
1508             ty               = _mm_mul_ps(fscal,dy01);
1509             tz               = _mm_mul_ps(fscal,dz01);
1510
1511             /* Update vectorial force */
1512             fix0             = _mm_add_ps(fix0,tx);
1513             fiy0             = _mm_add_ps(fiy0,ty);
1514             fiz0             = _mm_add_ps(fiz0,tz);
1515
1516             fjx1             = _mm_add_ps(fjx1,tx);
1517             fjy1             = _mm_add_ps(fjy1,ty);
1518             fjz1             = _mm_add_ps(fjz1,tz);
1519             
1520             /**************************
1521              * CALCULATE INTERACTIONS *
1522              **************************/
1523
1524             r02              = _mm_mul_ps(rsq02,rinv02);
1525
1526             /* EWALD ELECTROSTATICS */
1527
1528             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1529             ewrt             = _mm_mul_ps(r02,ewtabscale);
1530             ewitab           = _mm_cvttps_epi32(ewrt);
1531             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1532             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1533                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1534                                          &ewtabF,&ewtabFn);
1535             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1536             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1537
1538             fscal            = felec;
1539
1540             /* Calculate temporary vectorial force */
1541             tx               = _mm_mul_ps(fscal,dx02);
1542             ty               = _mm_mul_ps(fscal,dy02);
1543             tz               = _mm_mul_ps(fscal,dz02);
1544
1545             /* Update vectorial force */
1546             fix0             = _mm_add_ps(fix0,tx);
1547             fiy0             = _mm_add_ps(fiy0,ty);
1548             fiz0             = _mm_add_ps(fiz0,tz);
1549
1550             fjx2             = _mm_add_ps(fjx2,tx);
1551             fjy2             = _mm_add_ps(fjy2,ty);
1552             fjz2             = _mm_add_ps(fjz2,tz);
1553             
1554             /**************************
1555              * CALCULATE INTERACTIONS *
1556              **************************/
1557
1558             r10              = _mm_mul_ps(rsq10,rinv10);
1559
1560             /* EWALD ELECTROSTATICS */
1561
1562             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1563             ewrt             = _mm_mul_ps(r10,ewtabscale);
1564             ewitab           = _mm_cvttps_epi32(ewrt);
1565             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1566             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1567                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1568                                          &ewtabF,&ewtabFn);
1569             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1570             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1571
1572             fscal            = felec;
1573
1574             /* Calculate temporary vectorial force */
1575             tx               = _mm_mul_ps(fscal,dx10);
1576             ty               = _mm_mul_ps(fscal,dy10);
1577             tz               = _mm_mul_ps(fscal,dz10);
1578
1579             /* Update vectorial force */
1580             fix1             = _mm_add_ps(fix1,tx);
1581             fiy1             = _mm_add_ps(fiy1,ty);
1582             fiz1             = _mm_add_ps(fiz1,tz);
1583
1584             fjx0             = _mm_add_ps(fjx0,tx);
1585             fjy0             = _mm_add_ps(fjy0,ty);
1586             fjz0             = _mm_add_ps(fjz0,tz);
1587             
1588             /**************************
1589              * CALCULATE INTERACTIONS *
1590              **************************/
1591
1592             r11              = _mm_mul_ps(rsq11,rinv11);
1593
1594             /* EWALD ELECTROSTATICS */
1595
1596             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1597             ewrt             = _mm_mul_ps(r11,ewtabscale);
1598             ewitab           = _mm_cvttps_epi32(ewrt);
1599             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1600             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1601                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1602                                          &ewtabF,&ewtabFn);
1603             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1604             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1605
1606             fscal            = felec;
1607
1608             /* Calculate temporary vectorial force */
1609             tx               = _mm_mul_ps(fscal,dx11);
1610             ty               = _mm_mul_ps(fscal,dy11);
1611             tz               = _mm_mul_ps(fscal,dz11);
1612
1613             /* Update vectorial force */
1614             fix1             = _mm_add_ps(fix1,tx);
1615             fiy1             = _mm_add_ps(fiy1,ty);
1616             fiz1             = _mm_add_ps(fiz1,tz);
1617
1618             fjx1             = _mm_add_ps(fjx1,tx);
1619             fjy1             = _mm_add_ps(fjy1,ty);
1620             fjz1             = _mm_add_ps(fjz1,tz);
1621             
1622             /**************************
1623              * CALCULATE INTERACTIONS *
1624              **************************/
1625
1626             r12              = _mm_mul_ps(rsq12,rinv12);
1627
1628             /* EWALD ELECTROSTATICS */
1629
1630             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1631             ewrt             = _mm_mul_ps(r12,ewtabscale);
1632             ewitab           = _mm_cvttps_epi32(ewrt);
1633             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1634             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1635                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1636                                          &ewtabF,&ewtabFn);
1637             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1638             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1639
1640             fscal            = felec;
1641
1642             /* Calculate temporary vectorial force */
1643             tx               = _mm_mul_ps(fscal,dx12);
1644             ty               = _mm_mul_ps(fscal,dy12);
1645             tz               = _mm_mul_ps(fscal,dz12);
1646
1647             /* Update vectorial force */
1648             fix1             = _mm_add_ps(fix1,tx);
1649             fiy1             = _mm_add_ps(fiy1,ty);
1650             fiz1             = _mm_add_ps(fiz1,tz);
1651
1652             fjx2             = _mm_add_ps(fjx2,tx);
1653             fjy2             = _mm_add_ps(fjy2,ty);
1654             fjz2             = _mm_add_ps(fjz2,tz);
1655             
1656             /**************************
1657              * CALCULATE INTERACTIONS *
1658              **************************/
1659
1660             r20              = _mm_mul_ps(rsq20,rinv20);
1661
1662             /* EWALD ELECTROSTATICS */
1663
1664             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1665             ewrt             = _mm_mul_ps(r20,ewtabscale);
1666             ewitab           = _mm_cvttps_epi32(ewrt);
1667             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1668             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1669                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1670                                          &ewtabF,&ewtabFn);
1671             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1672             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1673
1674             fscal            = felec;
1675
1676             /* Calculate temporary vectorial force */
1677             tx               = _mm_mul_ps(fscal,dx20);
1678             ty               = _mm_mul_ps(fscal,dy20);
1679             tz               = _mm_mul_ps(fscal,dz20);
1680
1681             /* Update vectorial force */
1682             fix2             = _mm_add_ps(fix2,tx);
1683             fiy2             = _mm_add_ps(fiy2,ty);
1684             fiz2             = _mm_add_ps(fiz2,tz);
1685
1686             fjx0             = _mm_add_ps(fjx0,tx);
1687             fjy0             = _mm_add_ps(fjy0,ty);
1688             fjz0             = _mm_add_ps(fjz0,tz);
1689             
1690             /**************************
1691              * CALCULATE INTERACTIONS *
1692              **************************/
1693
1694             r21              = _mm_mul_ps(rsq21,rinv21);
1695
1696             /* EWALD ELECTROSTATICS */
1697
1698             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1699             ewrt             = _mm_mul_ps(r21,ewtabscale);
1700             ewitab           = _mm_cvttps_epi32(ewrt);
1701             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1702             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1703                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1704                                          &ewtabF,&ewtabFn);
1705             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1706             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1707
1708             fscal            = felec;
1709
1710             /* Calculate temporary vectorial force */
1711             tx               = _mm_mul_ps(fscal,dx21);
1712             ty               = _mm_mul_ps(fscal,dy21);
1713             tz               = _mm_mul_ps(fscal,dz21);
1714
1715             /* Update vectorial force */
1716             fix2             = _mm_add_ps(fix2,tx);
1717             fiy2             = _mm_add_ps(fiy2,ty);
1718             fiz2             = _mm_add_ps(fiz2,tz);
1719
1720             fjx1             = _mm_add_ps(fjx1,tx);
1721             fjy1             = _mm_add_ps(fjy1,ty);
1722             fjz1             = _mm_add_ps(fjz1,tz);
1723             
1724             /**************************
1725              * CALCULATE INTERACTIONS *
1726              **************************/
1727
1728             r22              = _mm_mul_ps(rsq22,rinv22);
1729
1730             /* EWALD ELECTROSTATICS */
1731
1732             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1733             ewrt             = _mm_mul_ps(r22,ewtabscale);
1734             ewitab           = _mm_cvttps_epi32(ewrt);
1735             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1736             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1737                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1738                                          &ewtabF,&ewtabFn);
1739             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1740             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1741
1742             fscal            = felec;
1743
1744             /* Calculate temporary vectorial force */
1745             tx               = _mm_mul_ps(fscal,dx22);
1746             ty               = _mm_mul_ps(fscal,dy22);
1747             tz               = _mm_mul_ps(fscal,dz22);
1748
1749             /* Update vectorial force */
1750             fix2             = _mm_add_ps(fix2,tx);
1751             fiy2             = _mm_add_ps(fiy2,ty);
1752             fiz2             = _mm_add_ps(fiz2,tz);
1753
1754             fjx2             = _mm_add_ps(fjx2,tx);
1755             fjy2             = _mm_add_ps(fjy2,ty);
1756             fjz2             = _mm_add_ps(fjz2,tz);
1757             
1758             fjptrA             = f+j_coord_offsetA;
1759             fjptrB             = f+j_coord_offsetB;
1760             fjptrC             = f+j_coord_offsetC;
1761             fjptrD             = f+j_coord_offsetD;
1762
1763             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1764                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1765
1766             /* Inner loop uses 324 flops */
1767         }
1768
1769         if(jidx<j_index_end)
1770         {
1771
1772             /* Get j neighbor index, and coordinate index */
1773             jnrlistA         = jjnr[jidx];
1774             jnrlistB         = jjnr[jidx+1];
1775             jnrlistC         = jjnr[jidx+2];
1776             jnrlistD         = jjnr[jidx+3];
1777             /* Sign of each element will be negative for non-real atoms.
1778              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1779              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1780              */
1781             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1782             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1783             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1784             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1785             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1786             j_coord_offsetA  = DIM*jnrA;
1787             j_coord_offsetB  = DIM*jnrB;
1788             j_coord_offsetC  = DIM*jnrC;
1789             j_coord_offsetD  = DIM*jnrD;
1790
1791             /* load j atom coordinates */
1792             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1793                                               x+j_coord_offsetC,x+j_coord_offsetD,
1794                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1795
1796             /* Calculate displacement vector */
1797             dx00             = _mm_sub_ps(ix0,jx0);
1798             dy00             = _mm_sub_ps(iy0,jy0);
1799             dz00             = _mm_sub_ps(iz0,jz0);
1800             dx01             = _mm_sub_ps(ix0,jx1);
1801             dy01             = _mm_sub_ps(iy0,jy1);
1802             dz01             = _mm_sub_ps(iz0,jz1);
1803             dx02             = _mm_sub_ps(ix0,jx2);
1804             dy02             = _mm_sub_ps(iy0,jy2);
1805             dz02             = _mm_sub_ps(iz0,jz2);
1806             dx10             = _mm_sub_ps(ix1,jx0);
1807             dy10             = _mm_sub_ps(iy1,jy0);
1808             dz10             = _mm_sub_ps(iz1,jz0);
1809             dx11             = _mm_sub_ps(ix1,jx1);
1810             dy11             = _mm_sub_ps(iy1,jy1);
1811             dz11             = _mm_sub_ps(iz1,jz1);
1812             dx12             = _mm_sub_ps(ix1,jx2);
1813             dy12             = _mm_sub_ps(iy1,jy2);
1814             dz12             = _mm_sub_ps(iz1,jz2);
1815             dx20             = _mm_sub_ps(ix2,jx0);
1816             dy20             = _mm_sub_ps(iy2,jy0);
1817             dz20             = _mm_sub_ps(iz2,jz0);
1818             dx21             = _mm_sub_ps(ix2,jx1);
1819             dy21             = _mm_sub_ps(iy2,jy1);
1820             dz21             = _mm_sub_ps(iz2,jz1);
1821             dx22             = _mm_sub_ps(ix2,jx2);
1822             dy22             = _mm_sub_ps(iy2,jy2);
1823             dz22             = _mm_sub_ps(iz2,jz2);
1824
1825             /* Calculate squared distance and things based on it */
1826             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1827             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1828             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1829             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1830             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1831             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1832             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1833             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1834             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1835
1836             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1837             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1838             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1839             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1840             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1841             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1842             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1843             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1844             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1845
1846             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1847             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1848             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1849             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1850             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1851             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1852             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1853             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1854             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1855
1856             fjx0             = _mm_setzero_ps();
1857             fjy0             = _mm_setzero_ps();
1858             fjz0             = _mm_setzero_ps();
1859             fjx1             = _mm_setzero_ps();
1860             fjy1             = _mm_setzero_ps();
1861             fjz1             = _mm_setzero_ps();
1862             fjx2             = _mm_setzero_ps();
1863             fjy2             = _mm_setzero_ps();
1864             fjz2             = _mm_setzero_ps();
1865
1866             /**************************
1867              * CALCULATE INTERACTIONS *
1868              **************************/
1869
1870             r00              = _mm_mul_ps(rsq00,rinv00);
1871             r00              = _mm_andnot_ps(dummy_mask,r00);
1872
1873             /* EWALD ELECTROSTATICS */
1874
1875             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1876             ewrt             = _mm_mul_ps(r00,ewtabscale);
1877             ewitab           = _mm_cvttps_epi32(ewrt);
1878             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1879             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1880                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1881                                          &ewtabF,&ewtabFn);
1882             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1883             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1884
1885             fscal            = felec;
1886
1887             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1888
1889             /* Calculate temporary vectorial force */
1890             tx               = _mm_mul_ps(fscal,dx00);
1891             ty               = _mm_mul_ps(fscal,dy00);
1892             tz               = _mm_mul_ps(fscal,dz00);
1893
1894             /* Update vectorial force */
1895             fix0             = _mm_add_ps(fix0,tx);
1896             fiy0             = _mm_add_ps(fiy0,ty);
1897             fiz0             = _mm_add_ps(fiz0,tz);
1898
1899             fjx0             = _mm_add_ps(fjx0,tx);
1900             fjy0             = _mm_add_ps(fjy0,ty);
1901             fjz0             = _mm_add_ps(fjz0,tz);
1902             
1903             /**************************
1904              * CALCULATE INTERACTIONS *
1905              **************************/
1906
1907             r01              = _mm_mul_ps(rsq01,rinv01);
1908             r01              = _mm_andnot_ps(dummy_mask,r01);
1909
1910             /* EWALD ELECTROSTATICS */
1911
1912             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1913             ewrt             = _mm_mul_ps(r01,ewtabscale);
1914             ewitab           = _mm_cvttps_epi32(ewrt);
1915             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1916             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1917                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1918                                          &ewtabF,&ewtabFn);
1919             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1920             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1921
1922             fscal            = felec;
1923
1924             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1925
1926             /* Calculate temporary vectorial force */
1927             tx               = _mm_mul_ps(fscal,dx01);
1928             ty               = _mm_mul_ps(fscal,dy01);
1929             tz               = _mm_mul_ps(fscal,dz01);
1930
1931             /* Update vectorial force */
1932             fix0             = _mm_add_ps(fix0,tx);
1933             fiy0             = _mm_add_ps(fiy0,ty);
1934             fiz0             = _mm_add_ps(fiz0,tz);
1935
1936             fjx1             = _mm_add_ps(fjx1,tx);
1937             fjy1             = _mm_add_ps(fjy1,ty);
1938             fjz1             = _mm_add_ps(fjz1,tz);
1939             
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             r02              = _mm_mul_ps(rsq02,rinv02);
1945             r02              = _mm_andnot_ps(dummy_mask,r02);
1946
1947             /* EWALD ELECTROSTATICS */
1948
1949             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1950             ewrt             = _mm_mul_ps(r02,ewtabscale);
1951             ewitab           = _mm_cvttps_epi32(ewrt);
1952             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1953             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1954                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1955                                          &ewtabF,&ewtabFn);
1956             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1957             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1958
1959             fscal            = felec;
1960
1961             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1962
1963             /* Calculate temporary vectorial force */
1964             tx               = _mm_mul_ps(fscal,dx02);
1965             ty               = _mm_mul_ps(fscal,dy02);
1966             tz               = _mm_mul_ps(fscal,dz02);
1967
1968             /* Update vectorial force */
1969             fix0             = _mm_add_ps(fix0,tx);
1970             fiy0             = _mm_add_ps(fiy0,ty);
1971             fiz0             = _mm_add_ps(fiz0,tz);
1972
1973             fjx2             = _mm_add_ps(fjx2,tx);
1974             fjy2             = _mm_add_ps(fjy2,ty);
1975             fjz2             = _mm_add_ps(fjz2,tz);
1976             
1977             /**************************
1978              * CALCULATE INTERACTIONS *
1979              **************************/
1980
1981             r10              = _mm_mul_ps(rsq10,rinv10);
1982             r10              = _mm_andnot_ps(dummy_mask,r10);
1983
1984             /* EWALD ELECTROSTATICS */
1985
1986             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1987             ewrt             = _mm_mul_ps(r10,ewtabscale);
1988             ewitab           = _mm_cvttps_epi32(ewrt);
1989             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1990             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1991                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1992                                          &ewtabF,&ewtabFn);
1993             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1994             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1995
1996             fscal            = felec;
1997
1998             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1999
2000             /* Calculate temporary vectorial force */
2001             tx               = _mm_mul_ps(fscal,dx10);
2002             ty               = _mm_mul_ps(fscal,dy10);
2003             tz               = _mm_mul_ps(fscal,dz10);
2004
2005             /* Update vectorial force */
2006             fix1             = _mm_add_ps(fix1,tx);
2007             fiy1             = _mm_add_ps(fiy1,ty);
2008             fiz1             = _mm_add_ps(fiz1,tz);
2009
2010             fjx0             = _mm_add_ps(fjx0,tx);
2011             fjy0             = _mm_add_ps(fjy0,ty);
2012             fjz0             = _mm_add_ps(fjz0,tz);
2013             
2014             /**************************
2015              * CALCULATE INTERACTIONS *
2016              **************************/
2017
2018             r11              = _mm_mul_ps(rsq11,rinv11);
2019             r11              = _mm_andnot_ps(dummy_mask,r11);
2020
2021             /* EWALD ELECTROSTATICS */
2022
2023             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2024             ewrt             = _mm_mul_ps(r11,ewtabscale);
2025             ewitab           = _mm_cvttps_epi32(ewrt);
2026             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2027             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2028                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2029                                          &ewtabF,&ewtabFn);
2030             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2031             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2032
2033             fscal            = felec;
2034
2035             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2036
2037             /* Calculate temporary vectorial force */
2038             tx               = _mm_mul_ps(fscal,dx11);
2039             ty               = _mm_mul_ps(fscal,dy11);
2040             tz               = _mm_mul_ps(fscal,dz11);
2041
2042             /* Update vectorial force */
2043             fix1             = _mm_add_ps(fix1,tx);
2044             fiy1             = _mm_add_ps(fiy1,ty);
2045             fiz1             = _mm_add_ps(fiz1,tz);
2046
2047             fjx1             = _mm_add_ps(fjx1,tx);
2048             fjy1             = _mm_add_ps(fjy1,ty);
2049             fjz1             = _mm_add_ps(fjz1,tz);
2050             
2051             /**************************
2052              * CALCULATE INTERACTIONS *
2053              **************************/
2054
2055             r12              = _mm_mul_ps(rsq12,rinv12);
2056             r12              = _mm_andnot_ps(dummy_mask,r12);
2057
2058             /* EWALD ELECTROSTATICS */
2059
2060             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2061             ewrt             = _mm_mul_ps(r12,ewtabscale);
2062             ewitab           = _mm_cvttps_epi32(ewrt);
2063             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2064             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2065                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2066                                          &ewtabF,&ewtabFn);
2067             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2068             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2069
2070             fscal            = felec;
2071
2072             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2073
2074             /* Calculate temporary vectorial force */
2075             tx               = _mm_mul_ps(fscal,dx12);
2076             ty               = _mm_mul_ps(fscal,dy12);
2077             tz               = _mm_mul_ps(fscal,dz12);
2078
2079             /* Update vectorial force */
2080             fix1             = _mm_add_ps(fix1,tx);
2081             fiy1             = _mm_add_ps(fiy1,ty);
2082             fiz1             = _mm_add_ps(fiz1,tz);
2083
2084             fjx2             = _mm_add_ps(fjx2,tx);
2085             fjy2             = _mm_add_ps(fjy2,ty);
2086             fjz2             = _mm_add_ps(fjz2,tz);
2087             
2088             /**************************
2089              * CALCULATE INTERACTIONS *
2090              **************************/
2091
2092             r20              = _mm_mul_ps(rsq20,rinv20);
2093             r20              = _mm_andnot_ps(dummy_mask,r20);
2094
2095             /* EWALD ELECTROSTATICS */
2096
2097             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098             ewrt             = _mm_mul_ps(r20,ewtabscale);
2099             ewitab           = _mm_cvttps_epi32(ewrt);
2100             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2101             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2102                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2103                                          &ewtabF,&ewtabFn);
2104             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2105             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2106
2107             fscal            = felec;
2108
2109             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2110
2111             /* Calculate temporary vectorial force */
2112             tx               = _mm_mul_ps(fscal,dx20);
2113             ty               = _mm_mul_ps(fscal,dy20);
2114             tz               = _mm_mul_ps(fscal,dz20);
2115
2116             /* Update vectorial force */
2117             fix2             = _mm_add_ps(fix2,tx);
2118             fiy2             = _mm_add_ps(fiy2,ty);
2119             fiz2             = _mm_add_ps(fiz2,tz);
2120
2121             fjx0             = _mm_add_ps(fjx0,tx);
2122             fjy0             = _mm_add_ps(fjy0,ty);
2123             fjz0             = _mm_add_ps(fjz0,tz);
2124             
2125             /**************************
2126              * CALCULATE INTERACTIONS *
2127              **************************/
2128
2129             r21              = _mm_mul_ps(rsq21,rinv21);
2130             r21              = _mm_andnot_ps(dummy_mask,r21);
2131
2132             /* EWALD ELECTROSTATICS */
2133
2134             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2135             ewrt             = _mm_mul_ps(r21,ewtabscale);
2136             ewitab           = _mm_cvttps_epi32(ewrt);
2137             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2138             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2139                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2140                                          &ewtabF,&ewtabFn);
2141             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2142             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2143
2144             fscal            = felec;
2145
2146             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2147
2148             /* Calculate temporary vectorial force */
2149             tx               = _mm_mul_ps(fscal,dx21);
2150             ty               = _mm_mul_ps(fscal,dy21);
2151             tz               = _mm_mul_ps(fscal,dz21);
2152
2153             /* Update vectorial force */
2154             fix2             = _mm_add_ps(fix2,tx);
2155             fiy2             = _mm_add_ps(fiy2,ty);
2156             fiz2             = _mm_add_ps(fiz2,tz);
2157
2158             fjx1             = _mm_add_ps(fjx1,tx);
2159             fjy1             = _mm_add_ps(fjy1,ty);
2160             fjz1             = _mm_add_ps(fjz1,tz);
2161             
2162             /**************************
2163              * CALCULATE INTERACTIONS *
2164              **************************/
2165
2166             r22              = _mm_mul_ps(rsq22,rinv22);
2167             r22              = _mm_andnot_ps(dummy_mask,r22);
2168
2169             /* EWALD ELECTROSTATICS */
2170
2171             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2172             ewrt             = _mm_mul_ps(r22,ewtabscale);
2173             ewitab           = _mm_cvttps_epi32(ewrt);
2174             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2175             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2176                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2177                                          &ewtabF,&ewtabFn);
2178             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2179             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2180
2181             fscal            = felec;
2182
2183             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2184
2185             /* Calculate temporary vectorial force */
2186             tx               = _mm_mul_ps(fscal,dx22);
2187             ty               = _mm_mul_ps(fscal,dy22);
2188             tz               = _mm_mul_ps(fscal,dz22);
2189
2190             /* Update vectorial force */
2191             fix2             = _mm_add_ps(fix2,tx);
2192             fiy2             = _mm_add_ps(fiy2,ty);
2193             fiz2             = _mm_add_ps(fiz2,tz);
2194
2195             fjx2             = _mm_add_ps(fjx2,tx);
2196             fjy2             = _mm_add_ps(fjy2,ty);
2197             fjz2             = _mm_add_ps(fjz2,tz);
2198             
2199             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2200             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2201             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2202             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2203
2204             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2205                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2206
2207             /* Inner loop uses 333 flops */
2208         }
2209
2210         /* End of innermost loop */
2211
2212         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2213                                               f+i_coord_offset,fshift+i_shift_offset);
2214
2215         /* Increment number of inner iterations */
2216         inneriter                  += j_index_end - j_index_start;
2217
2218         /* Outer loop uses 18 flops */
2219     }
2220
2221     /* Increment number of outer iterations */
2222     outeriter        += nri;
2223
2224     /* Update outer/inner flops */
2225
2226     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*333);
2227 }