81bfc88ff3d0e1788cd090fc4e44f3b0ad58a65c
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            None
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     __m128i          ewitab;
93     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
94     real             *ewtab;
95     __m128           dummy_mask,cutoff_mask;
96     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
97     __m128           one     = _mm_set1_ps(1.0);
98     __m128           two     = _mm_set1_ps(2.0);
99     x                = xx[0];
100     f                = ff[0];
101
102     nri              = nlist->nri;
103     iinr             = nlist->iinr;
104     jindex           = nlist->jindex;
105     jjnr             = nlist->jjnr;
106     shiftidx         = nlist->shift;
107     gid              = nlist->gid;
108     shiftvec         = fr->shift_vec[0];
109     fshift           = fr->fshift[0];
110     facel            = _mm_set1_ps(fr->epsfac);
111     charge           = mdatoms->chargeA;
112
113     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
114     ewtab            = fr->ic->tabq_coul_FDV0;
115     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
116     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
117
118     /* Setup water-specific parameters */
119     inr              = nlist->iinr[0];
120     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
121     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
122     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
123
124     jq0              = _mm_set1_ps(charge[inr+0]);
125     jq1              = _mm_set1_ps(charge[inr+1]);
126     jq2              = _mm_set1_ps(charge[inr+2]);
127     qq00             = _mm_mul_ps(iq0,jq0);
128     qq01             = _mm_mul_ps(iq0,jq1);
129     qq02             = _mm_mul_ps(iq0,jq2);
130     qq10             = _mm_mul_ps(iq1,jq0);
131     qq11             = _mm_mul_ps(iq1,jq1);
132     qq12             = _mm_mul_ps(iq1,jq2);
133     qq20             = _mm_mul_ps(iq2,jq0);
134     qq21             = _mm_mul_ps(iq2,jq1);
135     qq22             = _mm_mul_ps(iq2,jq2);
136
137     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
138     rcutoff_scalar   = fr->rcoulomb;
139     rcutoff          = _mm_set1_ps(rcutoff_scalar);
140     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
141
142     /* Avoid stupid compiler warnings */
143     jnrA = jnrB = jnrC = jnrD = 0;
144     j_coord_offsetA = 0;
145     j_coord_offsetB = 0;
146     j_coord_offsetC = 0;
147     j_coord_offsetD = 0;
148
149     outeriter        = 0;
150     inneriter        = 0;
151
152     for(iidx=0;iidx<4*DIM;iidx++)
153     {
154         scratch[iidx] = 0.0;
155     }  
156
157     /* Start outer loop over neighborlists */
158     for(iidx=0; iidx<nri; iidx++)
159     {
160         /* Load shift vector for this list */
161         i_shift_offset   = DIM*shiftidx[iidx];
162
163         /* Load limits for loop over neighbors */
164         j_index_start    = jindex[iidx];
165         j_index_end      = jindex[iidx+1];
166
167         /* Get outer coordinate index */
168         inr              = iinr[iidx];
169         i_coord_offset   = DIM*inr;
170
171         /* Load i particle coords and add shift vector */
172         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
173                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
174         
175         fix0             = _mm_setzero_ps();
176         fiy0             = _mm_setzero_ps();
177         fiz0             = _mm_setzero_ps();
178         fix1             = _mm_setzero_ps();
179         fiy1             = _mm_setzero_ps();
180         fiz1             = _mm_setzero_ps();
181         fix2             = _mm_setzero_ps();
182         fiy2             = _mm_setzero_ps();
183         fiz2             = _mm_setzero_ps();
184
185         /* Reset potential sums */
186         velecsum         = _mm_setzero_ps();
187
188         /* Start inner kernel loop */
189         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
190         {
191
192             /* Get j neighbor index, and coordinate index */
193             jnrA             = jjnr[jidx];
194             jnrB             = jjnr[jidx+1];
195             jnrC             = jjnr[jidx+2];
196             jnrD             = jjnr[jidx+3];
197             j_coord_offsetA  = DIM*jnrA;
198             j_coord_offsetB  = DIM*jnrB;
199             j_coord_offsetC  = DIM*jnrC;
200             j_coord_offsetD  = DIM*jnrD;
201
202             /* load j atom coordinates */
203             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
204                                               x+j_coord_offsetC,x+j_coord_offsetD,
205                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
206
207             /* Calculate displacement vector */
208             dx00             = _mm_sub_ps(ix0,jx0);
209             dy00             = _mm_sub_ps(iy0,jy0);
210             dz00             = _mm_sub_ps(iz0,jz0);
211             dx01             = _mm_sub_ps(ix0,jx1);
212             dy01             = _mm_sub_ps(iy0,jy1);
213             dz01             = _mm_sub_ps(iz0,jz1);
214             dx02             = _mm_sub_ps(ix0,jx2);
215             dy02             = _mm_sub_ps(iy0,jy2);
216             dz02             = _mm_sub_ps(iz0,jz2);
217             dx10             = _mm_sub_ps(ix1,jx0);
218             dy10             = _mm_sub_ps(iy1,jy0);
219             dz10             = _mm_sub_ps(iz1,jz0);
220             dx11             = _mm_sub_ps(ix1,jx1);
221             dy11             = _mm_sub_ps(iy1,jy1);
222             dz11             = _mm_sub_ps(iz1,jz1);
223             dx12             = _mm_sub_ps(ix1,jx2);
224             dy12             = _mm_sub_ps(iy1,jy2);
225             dz12             = _mm_sub_ps(iz1,jz2);
226             dx20             = _mm_sub_ps(ix2,jx0);
227             dy20             = _mm_sub_ps(iy2,jy0);
228             dz20             = _mm_sub_ps(iz2,jz0);
229             dx21             = _mm_sub_ps(ix2,jx1);
230             dy21             = _mm_sub_ps(iy2,jy1);
231             dz21             = _mm_sub_ps(iz2,jz1);
232             dx22             = _mm_sub_ps(ix2,jx2);
233             dy22             = _mm_sub_ps(iy2,jy2);
234             dz22             = _mm_sub_ps(iz2,jz2);
235
236             /* Calculate squared distance and things based on it */
237             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
238             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
239             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
240             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
241             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
242             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
243             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
244             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
245             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
246
247             rinv00           = gmx_mm_invsqrt_ps(rsq00);
248             rinv01           = gmx_mm_invsqrt_ps(rsq01);
249             rinv02           = gmx_mm_invsqrt_ps(rsq02);
250             rinv10           = gmx_mm_invsqrt_ps(rsq10);
251             rinv11           = gmx_mm_invsqrt_ps(rsq11);
252             rinv12           = gmx_mm_invsqrt_ps(rsq12);
253             rinv20           = gmx_mm_invsqrt_ps(rsq20);
254             rinv21           = gmx_mm_invsqrt_ps(rsq21);
255             rinv22           = gmx_mm_invsqrt_ps(rsq22);
256
257             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
258             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
259             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
260             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
261             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
262             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
263             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
264             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
265             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
266
267             fjx0             = _mm_setzero_ps();
268             fjy0             = _mm_setzero_ps();
269             fjz0             = _mm_setzero_ps();
270             fjx1             = _mm_setzero_ps();
271             fjy1             = _mm_setzero_ps();
272             fjz1             = _mm_setzero_ps();
273             fjx2             = _mm_setzero_ps();
274             fjy2             = _mm_setzero_ps();
275             fjz2             = _mm_setzero_ps();
276
277             /**************************
278              * CALCULATE INTERACTIONS *
279              **************************/
280
281             if (gmx_mm_any_lt(rsq00,rcutoff2))
282             {
283
284             r00              = _mm_mul_ps(rsq00,rinv00);
285
286             /* EWALD ELECTROSTATICS */
287
288             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
289             ewrt             = _mm_mul_ps(r00,ewtabscale);
290             ewitab           = _mm_cvttps_epi32(ewrt);
291             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
292             ewitab           = _mm_slli_epi32(ewitab,2);
293             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
294             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
295             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
296             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
297             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
298             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
299             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
300             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
301             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
302
303             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
304
305             /* Update potential sum for this i atom from the interaction with this j atom. */
306             velec            = _mm_and_ps(velec,cutoff_mask);
307             velecsum         = _mm_add_ps(velecsum,velec);
308
309             fscal            = felec;
310
311             fscal            = _mm_and_ps(fscal,cutoff_mask);
312
313             /* Calculate temporary vectorial force */
314             tx               = _mm_mul_ps(fscal,dx00);
315             ty               = _mm_mul_ps(fscal,dy00);
316             tz               = _mm_mul_ps(fscal,dz00);
317
318             /* Update vectorial force */
319             fix0             = _mm_add_ps(fix0,tx);
320             fiy0             = _mm_add_ps(fiy0,ty);
321             fiz0             = _mm_add_ps(fiz0,tz);
322
323             fjx0             = _mm_add_ps(fjx0,tx);
324             fjy0             = _mm_add_ps(fjy0,ty);
325             fjz0             = _mm_add_ps(fjz0,tz);
326             
327             }
328
329             /**************************
330              * CALCULATE INTERACTIONS *
331              **************************/
332
333             if (gmx_mm_any_lt(rsq01,rcutoff2))
334             {
335
336             r01              = _mm_mul_ps(rsq01,rinv01);
337
338             /* EWALD ELECTROSTATICS */
339
340             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
341             ewrt             = _mm_mul_ps(r01,ewtabscale);
342             ewitab           = _mm_cvttps_epi32(ewrt);
343             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
344             ewitab           = _mm_slli_epi32(ewitab,2);
345             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
346             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
347             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
348             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
349             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
350             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
351             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
352             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
353             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
354
355             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
356
357             /* Update potential sum for this i atom from the interaction with this j atom. */
358             velec            = _mm_and_ps(velec,cutoff_mask);
359             velecsum         = _mm_add_ps(velecsum,velec);
360
361             fscal            = felec;
362
363             fscal            = _mm_and_ps(fscal,cutoff_mask);
364
365             /* Calculate temporary vectorial force */
366             tx               = _mm_mul_ps(fscal,dx01);
367             ty               = _mm_mul_ps(fscal,dy01);
368             tz               = _mm_mul_ps(fscal,dz01);
369
370             /* Update vectorial force */
371             fix0             = _mm_add_ps(fix0,tx);
372             fiy0             = _mm_add_ps(fiy0,ty);
373             fiz0             = _mm_add_ps(fiz0,tz);
374
375             fjx1             = _mm_add_ps(fjx1,tx);
376             fjy1             = _mm_add_ps(fjy1,ty);
377             fjz1             = _mm_add_ps(fjz1,tz);
378             
379             }
380
381             /**************************
382              * CALCULATE INTERACTIONS *
383              **************************/
384
385             if (gmx_mm_any_lt(rsq02,rcutoff2))
386             {
387
388             r02              = _mm_mul_ps(rsq02,rinv02);
389
390             /* EWALD ELECTROSTATICS */
391
392             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
393             ewrt             = _mm_mul_ps(r02,ewtabscale);
394             ewitab           = _mm_cvttps_epi32(ewrt);
395             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
396             ewitab           = _mm_slli_epi32(ewitab,2);
397             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
398             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
399             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
400             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
401             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
402             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
403             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
404             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
405             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
406
407             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velec            = _mm_and_ps(velec,cutoff_mask);
411             velecsum         = _mm_add_ps(velecsum,velec);
412
413             fscal            = felec;
414
415             fscal            = _mm_and_ps(fscal,cutoff_mask);
416
417             /* Calculate temporary vectorial force */
418             tx               = _mm_mul_ps(fscal,dx02);
419             ty               = _mm_mul_ps(fscal,dy02);
420             tz               = _mm_mul_ps(fscal,dz02);
421
422             /* Update vectorial force */
423             fix0             = _mm_add_ps(fix0,tx);
424             fiy0             = _mm_add_ps(fiy0,ty);
425             fiz0             = _mm_add_ps(fiz0,tz);
426
427             fjx2             = _mm_add_ps(fjx2,tx);
428             fjy2             = _mm_add_ps(fjy2,ty);
429             fjz2             = _mm_add_ps(fjz2,tz);
430             
431             }
432
433             /**************************
434              * CALCULATE INTERACTIONS *
435              **************************/
436
437             if (gmx_mm_any_lt(rsq10,rcutoff2))
438             {
439
440             r10              = _mm_mul_ps(rsq10,rinv10);
441
442             /* EWALD ELECTROSTATICS */
443
444             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
445             ewrt             = _mm_mul_ps(r10,ewtabscale);
446             ewitab           = _mm_cvttps_epi32(ewrt);
447             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
448             ewitab           = _mm_slli_epi32(ewitab,2);
449             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
450             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
451             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
452             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
453             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
454             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
455             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
456             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
457             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
458
459             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velec            = _mm_and_ps(velec,cutoff_mask);
463             velecsum         = _mm_add_ps(velecsum,velec);
464
465             fscal            = felec;
466
467             fscal            = _mm_and_ps(fscal,cutoff_mask);
468
469             /* Calculate temporary vectorial force */
470             tx               = _mm_mul_ps(fscal,dx10);
471             ty               = _mm_mul_ps(fscal,dy10);
472             tz               = _mm_mul_ps(fscal,dz10);
473
474             /* Update vectorial force */
475             fix1             = _mm_add_ps(fix1,tx);
476             fiy1             = _mm_add_ps(fiy1,ty);
477             fiz1             = _mm_add_ps(fiz1,tz);
478
479             fjx0             = _mm_add_ps(fjx0,tx);
480             fjy0             = _mm_add_ps(fjy0,ty);
481             fjz0             = _mm_add_ps(fjz0,tz);
482             
483             }
484
485             /**************************
486              * CALCULATE INTERACTIONS *
487              **************************/
488
489             if (gmx_mm_any_lt(rsq11,rcutoff2))
490             {
491
492             r11              = _mm_mul_ps(rsq11,rinv11);
493
494             /* EWALD ELECTROSTATICS */
495
496             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
497             ewrt             = _mm_mul_ps(r11,ewtabscale);
498             ewitab           = _mm_cvttps_epi32(ewrt);
499             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
500             ewitab           = _mm_slli_epi32(ewitab,2);
501             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
502             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
503             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
504             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
505             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
506             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
507             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
508             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
509             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
510
511             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velec            = _mm_and_ps(velec,cutoff_mask);
515             velecsum         = _mm_add_ps(velecsum,velec);
516
517             fscal            = felec;
518
519             fscal            = _mm_and_ps(fscal,cutoff_mask);
520
521             /* Calculate temporary vectorial force */
522             tx               = _mm_mul_ps(fscal,dx11);
523             ty               = _mm_mul_ps(fscal,dy11);
524             tz               = _mm_mul_ps(fscal,dz11);
525
526             /* Update vectorial force */
527             fix1             = _mm_add_ps(fix1,tx);
528             fiy1             = _mm_add_ps(fiy1,ty);
529             fiz1             = _mm_add_ps(fiz1,tz);
530
531             fjx1             = _mm_add_ps(fjx1,tx);
532             fjy1             = _mm_add_ps(fjy1,ty);
533             fjz1             = _mm_add_ps(fjz1,tz);
534             
535             }
536
537             /**************************
538              * CALCULATE INTERACTIONS *
539              **************************/
540
541             if (gmx_mm_any_lt(rsq12,rcutoff2))
542             {
543
544             r12              = _mm_mul_ps(rsq12,rinv12);
545
546             /* EWALD ELECTROSTATICS */
547
548             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
549             ewrt             = _mm_mul_ps(r12,ewtabscale);
550             ewitab           = _mm_cvttps_epi32(ewrt);
551             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
552             ewitab           = _mm_slli_epi32(ewitab,2);
553             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
554             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
555             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
556             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
557             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
558             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
559             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
560             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
561             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
562
563             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
564
565             /* Update potential sum for this i atom from the interaction with this j atom. */
566             velec            = _mm_and_ps(velec,cutoff_mask);
567             velecsum         = _mm_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571             fscal            = _mm_and_ps(fscal,cutoff_mask);
572
573             /* Calculate temporary vectorial force */
574             tx               = _mm_mul_ps(fscal,dx12);
575             ty               = _mm_mul_ps(fscal,dy12);
576             tz               = _mm_mul_ps(fscal,dz12);
577
578             /* Update vectorial force */
579             fix1             = _mm_add_ps(fix1,tx);
580             fiy1             = _mm_add_ps(fiy1,ty);
581             fiz1             = _mm_add_ps(fiz1,tz);
582
583             fjx2             = _mm_add_ps(fjx2,tx);
584             fjy2             = _mm_add_ps(fjy2,ty);
585             fjz2             = _mm_add_ps(fjz2,tz);
586             
587             }
588
589             /**************************
590              * CALCULATE INTERACTIONS *
591              **************************/
592
593             if (gmx_mm_any_lt(rsq20,rcutoff2))
594             {
595
596             r20              = _mm_mul_ps(rsq20,rinv20);
597
598             /* EWALD ELECTROSTATICS */
599
600             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601             ewrt             = _mm_mul_ps(r20,ewtabscale);
602             ewitab           = _mm_cvttps_epi32(ewrt);
603             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
604             ewitab           = _mm_slli_epi32(ewitab,2);
605             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
606             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
607             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
608             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
609             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
610             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
611             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
612             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
613             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
614
615             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
616
617             /* Update potential sum for this i atom from the interaction with this j atom. */
618             velec            = _mm_and_ps(velec,cutoff_mask);
619             velecsum         = _mm_add_ps(velecsum,velec);
620
621             fscal            = felec;
622
623             fscal            = _mm_and_ps(fscal,cutoff_mask);
624
625             /* Calculate temporary vectorial force */
626             tx               = _mm_mul_ps(fscal,dx20);
627             ty               = _mm_mul_ps(fscal,dy20);
628             tz               = _mm_mul_ps(fscal,dz20);
629
630             /* Update vectorial force */
631             fix2             = _mm_add_ps(fix2,tx);
632             fiy2             = _mm_add_ps(fiy2,ty);
633             fiz2             = _mm_add_ps(fiz2,tz);
634
635             fjx0             = _mm_add_ps(fjx0,tx);
636             fjy0             = _mm_add_ps(fjy0,ty);
637             fjz0             = _mm_add_ps(fjz0,tz);
638             
639             }
640
641             /**************************
642              * CALCULATE INTERACTIONS *
643              **************************/
644
645             if (gmx_mm_any_lt(rsq21,rcutoff2))
646             {
647
648             r21              = _mm_mul_ps(rsq21,rinv21);
649
650             /* EWALD ELECTROSTATICS */
651
652             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
653             ewrt             = _mm_mul_ps(r21,ewtabscale);
654             ewitab           = _mm_cvttps_epi32(ewrt);
655             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
656             ewitab           = _mm_slli_epi32(ewitab,2);
657             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
658             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
659             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
660             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
661             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
662             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
663             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
664             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
665             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
666
667             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
668
669             /* Update potential sum for this i atom from the interaction with this j atom. */
670             velec            = _mm_and_ps(velec,cutoff_mask);
671             velecsum         = _mm_add_ps(velecsum,velec);
672
673             fscal            = felec;
674
675             fscal            = _mm_and_ps(fscal,cutoff_mask);
676
677             /* Calculate temporary vectorial force */
678             tx               = _mm_mul_ps(fscal,dx21);
679             ty               = _mm_mul_ps(fscal,dy21);
680             tz               = _mm_mul_ps(fscal,dz21);
681
682             /* Update vectorial force */
683             fix2             = _mm_add_ps(fix2,tx);
684             fiy2             = _mm_add_ps(fiy2,ty);
685             fiz2             = _mm_add_ps(fiz2,tz);
686
687             fjx1             = _mm_add_ps(fjx1,tx);
688             fjy1             = _mm_add_ps(fjy1,ty);
689             fjz1             = _mm_add_ps(fjz1,tz);
690             
691             }
692
693             /**************************
694              * CALCULATE INTERACTIONS *
695              **************************/
696
697             if (gmx_mm_any_lt(rsq22,rcutoff2))
698             {
699
700             r22              = _mm_mul_ps(rsq22,rinv22);
701
702             /* EWALD ELECTROSTATICS */
703
704             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
705             ewrt             = _mm_mul_ps(r22,ewtabscale);
706             ewitab           = _mm_cvttps_epi32(ewrt);
707             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
708             ewitab           = _mm_slli_epi32(ewitab,2);
709             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
710             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
711             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
712             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
713             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
714             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
715             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
716             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
717             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
718
719             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
720
721             /* Update potential sum for this i atom from the interaction with this j atom. */
722             velec            = _mm_and_ps(velec,cutoff_mask);
723             velecsum         = _mm_add_ps(velecsum,velec);
724
725             fscal            = felec;
726
727             fscal            = _mm_and_ps(fscal,cutoff_mask);
728
729             /* Calculate temporary vectorial force */
730             tx               = _mm_mul_ps(fscal,dx22);
731             ty               = _mm_mul_ps(fscal,dy22);
732             tz               = _mm_mul_ps(fscal,dz22);
733
734             /* Update vectorial force */
735             fix2             = _mm_add_ps(fix2,tx);
736             fiy2             = _mm_add_ps(fiy2,ty);
737             fiz2             = _mm_add_ps(fiz2,tz);
738
739             fjx2             = _mm_add_ps(fjx2,tx);
740             fjy2             = _mm_add_ps(fjy2,ty);
741             fjz2             = _mm_add_ps(fjz2,tz);
742             
743             }
744
745             fjptrA             = f+j_coord_offsetA;
746             fjptrB             = f+j_coord_offsetB;
747             fjptrC             = f+j_coord_offsetC;
748             fjptrD             = f+j_coord_offsetD;
749
750             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
751                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
752
753             /* Inner loop uses 414 flops */
754         }
755
756         if(jidx<j_index_end)
757         {
758
759             /* Get j neighbor index, and coordinate index */
760             jnrlistA         = jjnr[jidx];
761             jnrlistB         = jjnr[jidx+1];
762             jnrlistC         = jjnr[jidx+2];
763             jnrlistD         = jjnr[jidx+3];
764             /* Sign of each element will be negative for non-real atoms.
765              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
766              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
767              */
768             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
769             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
770             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
771             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
772             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
773             j_coord_offsetA  = DIM*jnrA;
774             j_coord_offsetB  = DIM*jnrB;
775             j_coord_offsetC  = DIM*jnrC;
776             j_coord_offsetD  = DIM*jnrD;
777
778             /* load j atom coordinates */
779             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
780                                               x+j_coord_offsetC,x+j_coord_offsetD,
781                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
782
783             /* Calculate displacement vector */
784             dx00             = _mm_sub_ps(ix0,jx0);
785             dy00             = _mm_sub_ps(iy0,jy0);
786             dz00             = _mm_sub_ps(iz0,jz0);
787             dx01             = _mm_sub_ps(ix0,jx1);
788             dy01             = _mm_sub_ps(iy0,jy1);
789             dz01             = _mm_sub_ps(iz0,jz1);
790             dx02             = _mm_sub_ps(ix0,jx2);
791             dy02             = _mm_sub_ps(iy0,jy2);
792             dz02             = _mm_sub_ps(iz0,jz2);
793             dx10             = _mm_sub_ps(ix1,jx0);
794             dy10             = _mm_sub_ps(iy1,jy0);
795             dz10             = _mm_sub_ps(iz1,jz0);
796             dx11             = _mm_sub_ps(ix1,jx1);
797             dy11             = _mm_sub_ps(iy1,jy1);
798             dz11             = _mm_sub_ps(iz1,jz1);
799             dx12             = _mm_sub_ps(ix1,jx2);
800             dy12             = _mm_sub_ps(iy1,jy2);
801             dz12             = _mm_sub_ps(iz1,jz2);
802             dx20             = _mm_sub_ps(ix2,jx0);
803             dy20             = _mm_sub_ps(iy2,jy0);
804             dz20             = _mm_sub_ps(iz2,jz0);
805             dx21             = _mm_sub_ps(ix2,jx1);
806             dy21             = _mm_sub_ps(iy2,jy1);
807             dz21             = _mm_sub_ps(iz2,jz1);
808             dx22             = _mm_sub_ps(ix2,jx2);
809             dy22             = _mm_sub_ps(iy2,jy2);
810             dz22             = _mm_sub_ps(iz2,jz2);
811
812             /* Calculate squared distance and things based on it */
813             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
814             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
815             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
816             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
817             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
818             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
819             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
820             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
821             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
822
823             rinv00           = gmx_mm_invsqrt_ps(rsq00);
824             rinv01           = gmx_mm_invsqrt_ps(rsq01);
825             rinv02           = gmx_mm_invsqrt_ps(rsq02);
826             rinv10           = gmx_mm_invsqrt_ps(rsq10);
827             rinv11           = gmx_mm_invsqrt_ps(rsq11);
828             rinv12           = gmx_mm_invsqrt_ps(rsq12);
829             rinv20           = gmx_mm_invsqrt_ps(rsq20);
830             rinv21           = gmx_mm_invsqrt_ps(rsq21);
831             rinv22           = gmx_mm_invsqrt_ps(rsq22);
832
833             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
834             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
835             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
836             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
837             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
838             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
839             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
840             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
841             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
842
843             fjx0             = _mm_setzero_ps();
844             fjy0             = _mm_setzero_ps();
845             fjz0             = _mm_setzero_ps();
846             fjx1             = _mm_setzero_ps();
847             fjy1             = _mm_setzero_ps();
848             fjz1             = _mm_setzero_ps();
849             fjx2             = _mm_setzero_ps();
850             fjy2             = _mm_setzero_ps();
851             fjz2             = _mm_setzero_ps();
852
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             if (gmx_mm_any_lt(rsq00,rcutoff2))
858             {
859
860             r00              = _mm_mul_ps(rsq00,rinv00);
861             r00              = _mm_andnot_ps(dummy_mask,r00);
862
863             /* EWALD ELECTROSTATICS */
864
865             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
866             ewrt             = _mm_mul_ps(r00,ewtabscale);
867             ewitab           = _mm_cvttps_epi32(ewrt);
868             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
869             ewitab           = _mm_slli_epi32(ewitab,2);
870             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
871             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
872             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
873             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
874             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
875             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
876             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
877             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
878             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
879
880             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
881
882             /* Update potential sum for this i atom from the interaction with this j atom. */
883             velec            = _mm_and_ps(velec,cutoff_mask);
884             velec            = _mm_andnot_ps(dummy_mask,velec);
885             velecsum         = _mm_add_ps(velecsum,velec);
886
887             fscal            = felec;
888
889             fscal            = _mm_and_ps(fscal,cutoff_mask);
890
891             fscal            = _mm_andnot_ps(dummy_mask,fscal);
892
893             /* Calculate temporary vectorial force */
894             tx               = _mm_mul_ps(fscal,dx00);
895             ty               = _mm_mul_ps(fscal,dy00);
896             tz               = _mm_mul_ps(fscal,dz00);
897
898             /* Update vectorial force */
899             fix0             = _mm_add_ps(fix0,tx);
900             fiy0             = _mm_add_ps(fiy0,ty);
901             fiz0             = _mm_add_ps(fiz0,tz);
902
903             fjx0             = _mm_add_ps(fjx0,tx);
904             fjy0             = _mm_add_ps(fjy0,ty);
905             fjz0             = _mm_add_ps(fjz0,tz);
906             
907             }
908
909             /**************************
910              * CALCULATE INTERACTIONS *
911              **************************/
912
913             if (gmx_mm_any_lt(rsq01,rcutoff2))
914             {
915
916             r01              = _mm_mul_ps(rsq01,rinv01);
917             r01              = _mm_andnot_ps(dummy_mask,r01);
918
919             /* EWALD ELECTROSTATICS */
920
921             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
922             ewrt             = _mm_mul_ps(r01,ewtabscale);
923             ewitab           = _mm_cvttps_epi32(ewrt);
924             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
925             ewitab           = _mm_slli_epi32(ewitab,2);
926             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
927             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
928             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
929             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
930             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
931             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
932             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
933             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
934             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
935
936             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
937
938             /* Update potential sum for this i atom from the interaction with this j atom. */
939             velec            = _mm_and_ps(velec,cutoff_mask);
940             velec            = _mm_andnot_ps(dummy_mask,velec);
941             velecsum         = _mm_add_ps(velecsum,velec);
942
943             fscal            = felec;
944
945             fscal            = _mm_and_ps(fscal,cutoff_mask);
946
947             fscal            = _mm_andnot_ps(dummy_mask,fscal);
948
949             /* Calculate temporary vectorial force */
950             tx               = _mm_mul_ps(fscal,dx01);
951             ty               = _mm_mul_ps(fscal,dy01);
952             tz               = _mm_mul_ps(fscal,dz01);
953
954             /* Update vectorial force */
955             fix0             = _mm_add_ps(fix0,tx);
956             fiy0             = _mm_add_ps(fiy0,ty);
957             fiz0             = _mm_add_ps(fiz0,tz);
958
959             fjx1             = _mm_add_ps(fjx1,tx);
960             fjy1             = _mm_add_ps(fjy1,ty);
961             fjz1             = _mm_add_ps(fjz1,tz);
962             
963             }
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             if (gmx_mm_any_lt(rsq02,rcutoff2))
970             {
971
972             r02              = _mm_mul_ps(rsq02,rinv02);
973             r02              = _mm_andnot_ps(dummy_mask,r02);
974
975             /* EWALD ELECTROSTATICS */
976
977             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
978             ewrt             = _mm_mul_ps(r02,ewtabscale);
979             ewitab           = _mm_cvttps_epi32(ewrt);
980             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
981             ewitab           = _mm_slli_epi32(ewitab,2);
982             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
983             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
984             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
985             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
986             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
987             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
988             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
989             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
990             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
991
992             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
993
994             /* Update potential sum for this i atom from the interaction with this j atom. */
995             velec            = _mm_and_ps(velec,cutoff_mask);
996             velec            = _mm_andnot_ps(dummy_mask,velec);
997             velecsum         = _mm_add_ps(velecsum,velec);
998
999             fscal            = felec;
1000
1001             fscal            = _mm_and_ps(fscal,cutoff_mask);
1002
1003             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1004
1005             /* Calculate temporary vectorial force */
1006             tx               = _mm_mul_ps(fscal,dx02);
1007             ty               = _mm_mul_ps(fscal,dy02);
1008             tz               = _mm_mul_ps(fscal,dz02);
1009
1010             /* Update vectorial force */
1011             fix0             = _mm_add_ps(fix0,tx);
1012             fiy0             = _mm_add_ps(fiy0,ty);
1013             fiz0             = _mm_add_ps(fiz0,tz);
1014
1015             fjx2             = _mm_add_ps(fjx2,tx);
1016             fjy2             = _mm_add_ps(fjy2,ty);
1017             fjz2             = _mm_add_ps(fjz2,tz);
1018             
1019             }
1020
1021             /**************************
1022              * CALCULATE INTERACTIONS *
1023              **************************/
1024
1025             if (gmx_mm_any_lt(rsq10,rcutoff2))
1026             {
1027
1028             r10              = _mm_mul_ps(rsq10,rinv10);
1029             r10              = _mm_andnot_ps(dummy_mask,r10);
1030
1031             /* EWALD ELECTROSTATICS */
1032
1033             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1034             ewrt             = _mm_mul_ps(r10,ewtabscale);
1035             ewitab           = _mm_cvttps_epi32(ewrt);
1036             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1037             ewitab           = _mm_slli_epi32(ewitab,2);
1038             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1039             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1040             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1041             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1042             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1043             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1044             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1045             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1046             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1047
1048             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1049
1050             /* Update potential sum for this i atom from the interaction with this j atom. */
1051             velec            = _mm_and_ps(velec,cutoff_mask);
1052             velec            = _mm_andnot_ps(dummy_mask,velec);
1053             velecsum         = _mm_add_ps(velecsum,velec);
1054
1055             fscal            = felec;
1056
1057             fscal            = _mm_and_ps(fscal,cutoff_mask);
1058
1059             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1060
1061             /* Calculate temporary vectorial force */
1062             tx               = _mm_mul_ps(fscal,dx10);
1063             ty               = _mm_mul_ps(fscal,dy10);
1064             tz               = _mm_mul_ps(fscal,dz10);
1065
1066             /* Update vectorial force */
1067             fix1             = _mm_add_ps(fix1,tx);
1068             fiy1             = _mm_add_ps(fiy1,ty);
1069             fiz1             = _mm_add_ps(fiz1,tz);
1070
1071             fjx0             = _mm_add_ps(fjx0,tx);
1072             fjy0             = _mm_add_ps(fjy0,ty);
1073             fjz0             = _mm_add_ps(fjz0,tz);
1074             
1075             }
1076
1077             /**************************
1078              * CALCULATE INTERACTIONS *
1079              **************************/
1080
1081             if (gmx_mm_any_lt(rsq11,rcutoff2))
1082             {
1083
1084             r11              = _mm_mul_ps(rsq11,rinv11);
1085             r11              = _mm_andnot_ps(dummy_mask,r11);
1086
1087             /* EWALD ELECTROSTATICS */
1088
1089             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1090             ewrt             = _mm_mul_ps(r11,ewtabscale);
1091             ewitab           = _mm_cvttps_epi32(ewrt);
1092             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1093             ewitab           = _mm_slli_epi32(ewitab,2);
1094             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1095             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1096             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1097             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1098             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1099             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1100             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1101             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1102             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1103
1104             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1105
1106             /* Update potential sum for this i atom from the interaction with this j atom. */
1107             velec            = _mm_and_ps(velec,cutoff_mask);
1108             velec            = _mm_andnot_ps(dummy_mask,velec);
1109             velecsum         = _mm_add_ps(velecsum,velec);
1110
1111             fscal            = felec;
1112
1113             fscal            = _mm_and_ps(fscal,cutoff_mask);
1114
1115             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1116
1117             /* Calculate temporary vectorial force */
1118             tx               = _mm_mul_ps(fscal,dx11);
1119             ty               = _mm_mul_ps(fscal,dy11);
1120             tz               = _mm_mul_ps(fscal,dz11);
1121
1122             /* Update vectorial force */
1123             fix1             = _mm_add_ps(fix1,tx);
1124             fiy1             = _mm_add_ps(fiy1,ty);
1125             fiz1             = _mm_add_ps(fiz1,tz);
1126
1127             fjx1             = _mm_add_ps(fjx1,tx);
1128             fjy1             = _mm_add_ps(fjy1,ty);
1129             fjz1             = _mm_add_ps(fjz1,tz);
1130             
1131             }
1132
1133             /**************************
1134              * CALCULATE INTERACTIONS *
1135              **************************/
1136
1137             if (gmx_mm_any_lt(rsq12,rcutoff2))
1138             {
1139
1140             r12              = _mm_mul_ps(rsq12,rinv12);
1141             r12              = _mm_andnot_ps(dummy_mask,r12);
1142
1143             /* EWALD ELECTROSTATICS */
1144
1145             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1146             ewrt             = _mm_mul_ps(r12,ewtabscale);
1147             ewitab           = _mm_cvttps_epi32(ewrt);
1148             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1149             ewitab           = _mm_slli_epi32(ewitab,2);
1150             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1151             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1152             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1153             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1154             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1155             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1156             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1157             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1158             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1159
1160             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1161
1162             /* Update potential sum for this i atom from the interaction with this j atom. */
1163             velec            = _mm_and_ps(velec,cutoff_mask);
1164             velec            = _mm_andnot_ps(dummy_mask,velec);
1165             velecsum         = _mm_add_ps(velecsum,velec);
1166
1167             fscal            = felec;
1168
1169             fscal            = _mm_and_ps(fscal,cutoff_mask);
1170
1171             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1172
1173             /* Calculate temporary vectorial force */
1174             tx               = _mm_mul_ps(fscal,dx12);
1175             ty               = _mm_mul_ps(fscal,dy12);
1176             tz               = _mm_mul_ps(fscal,dz12);
1177
1178             /* Update vectorial force */
1179             fix1             = _mm_add_ps(fix1,tx);
1180             fiy1             = _mm_add_ps(fiy1,ty);
1181             fiz1             = _mm_add_ps(fiz1,tz);
1182
1183             fjx2             = _mm_add_ps(fjx2,tx);
1184             fjy2             = _mm_add_ps(fjy2,ty);
1185             fjz2             = _mm_add_ps(fjz2,tz);
1186             
1187             }
1188
1189             /**************************
1190              * CALCULATE INTERACTIONS *
1191              **************************/
1192
1193             if (gmx_mm_any_lt(rsq20,rcutoff2))
1194             {
1195
1196             r20              = _mm_mul_ps(rsq20,rinv20);
1197             r20              = _mm_andnot_ps(dummy_mask,r20);
1198
1199             /* EWALD ELECTROSTATICS */
1200
1201             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1202             ewrt             = _mm_mul_ps(r20,ewtabscale);
1203             ewitab           = _mm_cvttps_epi32(ewrt);
1204             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1205             ewitab           = _mm_slli_epi32(ewitab,2);
1206             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1207             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1208             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1209             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1210             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1211             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1212             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1213             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1214             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1215
1216             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1217
1218             /* Update potential sum for this i atom from the interaction with this j atom. */
1219             velec            = _mm_and_ps(velec,cutoff_mask);
1220             velec            = _mm_andnot_ps(dummy_mask,velec);
1221             velecsum         = _mm_add_ps(velecsum,velec);
1222
1223             fscal            = felec;
1224
1225             fscal            = _mm_and_ps(fscal,cutoff_mask);
1226
1227             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1228
1229             /* Calculate temporary vectorial force */
1230             tx               = _mm_mul_ps(fscal,dx20);
1231             ty               = _mm_mul_ps(fscal,dy20);
1232             tz               = _mm_mul_ps(fscal,dz20);
1233
1234             /* Update vectorial force */
1235             fix2             = _mm_add_ps(fix2,tx);
1236             fiy2             = _mm_add_ps(fiy2,ty);
1237             fiz2             = _mm_add_ps(fiz2,tz);
1238
1239             fjx0             = _mm_add_ps(fjx0,tx);
1240             fjy0             = _mm_add_ps(fjy0,ty);
1241             fjz0             = _mm_add_ps(fjz0,tz);
1242             
1243             }
1244
1245             /**************************
1246              * CALCULATE INTERACTIONS *
1247              **************************/
1248
1249             if (gmx_mm_any_lt(rsq21,rcutoff2))
1250             {
1251
1252             r21              = _mm_mul_ps(rsq21,rinv21);
1253             r21              = _mm_andnot_ps(dummy_mask,r21);
1254
1255             /* EWALD ELECTROSTATICS */
1256
1257             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1258             ewrt             = _mm_mul_ps(r21,ewtabscale);
1259             ewitab           = _mm_cvttps_epi32(ewrt);
1260             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1261             ewitab           = _mm_slli_epi32(ewitab,2);
1262             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1263             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1264             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1265             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1266             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1267             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1268             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1269             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1270             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1271
1272             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1273
1274             /* Update potential sum for this i atom from the interaction with this j atom. */
1275             velec            = _mm_and_ps(velec,cutoff_mask);
1276             velec            = _mm_andnot_ps(dummy_mask,velec);
1277             velecsum         = _mm_add_ps(velecsum,velec);
1278
1279             fscal            = felec;
1280
1281             fscal            = _mm_and_ps(fscal,cutoff_mask);
1282
1283             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1284
1285             /* Calculate temporary vectorial force */
1286             tx               = _mm_mul_ps(fscal,dx21);
1287             ty               = _mm_mul_ps(fscal,dy21);
1288             tz               = _mm_mul_ps(fscal,dz21);
1289
1290             /* Update vectorial force */
1291             fix2             = _mm_add_ps(fix2,tx);
1292             fiy2             = _mm_add_ps(fiy2,ty);
1293             fiz2             = _mm_add_ps(fiz2,tz);
1294
1295             fjx1             = _mm_add_ps(fjx1,tx);
1296             fjy1             = _mm_add_ps(fjy1,ty);
1297             fjz1             = _mm_add_ps(fjz1,tz);
1298             
1299             }
1300
1301             /**************************
1302              * CALCULATE INTERACTIONS *
1303              **************************/
1304
1305             if (gmx_mm_any_lt(rsq22,rcutoff2))
1306             {
1307
1308             r22              = _mm_mul_ps(rsq22,rinv22);
1309             r22              = _mm_andnot_ps(dummy_mask,r22);
1310
1311             /* EWALD ELECTROSTATICS */
1312
1313             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1314             ewrt             = _mm_mul_ps(r22,ewtabscale);
1315             ewitab           = _mm_cvttps_epi32(ewrt);
1316             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1317             ewitab           = _mm_slli_epi32(ewitab,2);
1318             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1319             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1320             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1321             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1322             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1323             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1324             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1325             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1326             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1327
1328             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1329
1330             /* Update potential sum for this i atom from the interaction with this j atom. */
1331             velec            = _mm_and_ps(velec,cutoff_mask);
1332             velec            = _mm_andnot_ps(dummy_mask,velec);
1333             velecsum         = _mm_add_ps(velecsum,velec);
1334
1335             fscal            = felec;
1336
1337             fscal            = _mm_and_ps(fscal,cutoff_mask);
1338
1339             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1340
1341             /* Calculate temporary vectorial force */
1342             tx               = _mm_mul_ps(fscal,dx22);
1343             ty               = _mm_mul_ps(fscal,dy22);
1344             tz               = _mm_mul_ps(fscal,dz22);
1345
1346             /* Update vectorial force */
1347             fix2             = _mm_add_ps(fix2,tx);
1348             fiy2             = _mm_add_ps(fiy2,ty);
1349             fiz2             = _mm_add_ps(fiz2,tz);
1350
1351             fjx2             = _mm_add_ps(fjx2,tx);
1352             fjy2             = _mm_add_ps(fjy2,ty);
1353             fjz2             = _mm_add_ps(fjz2,tz);
1354             
1355             }
1356
1357             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1358             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1359             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1360             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1361
1362             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1363                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1364
1365             /* Inner loop uses 423 flops */
1366         }
1367
1368         /* End of innermost loop */
1369
1370         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1371                                               f+i_coord_offset,fshift+i_shift_offset);
1372
1373         ggid                        = gid[iidx];
1374         /* Update potential energies */
1375         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1376
1377         /* Increment number of inner iterations */
1378         inneriter                  += j_index_end - j_index_start;
1379
1380         /* Outer loop uses 19 flops */
1381     }
1382
1383     /* Increment number of outer iterations */
1384     outeriter        += nri;
1385
1386     /* Update outer/inner flops */
1387
1388     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*423);
1389 }
1390 /*
1391  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
1392  * Electrostatics interaction: Ewald
1393  * VdW interaction:            None
1394  * Geometry:                   Water3-Water3
1395  * Calculate force/pot:        Force
1396  */
1397 void
1398 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
1399                     (t_nblist * gmx_restrict                nlist,
1400                      rvec * gmx_restrict                    xx,
1401                      rvec * gmx_restrict                    ff,
1402                      t_forcerec * gmx_restrict              fr,
1403                      t_mdatoms * gmx_restrict               mdatoms,
1404                      nb_kernel_data_t * gmx_restrict        kernel_data,
1405                      t_nrnb * gmx_restrict                  nrnb)
1406 {
1407     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1408      * just 0 for non-waters.
1409      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1410      * jnr indices corresponding to data put in the four positions in the SIMD register.
1411      */
1412     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1413     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1414     int              jnrA,jnrB,jnrC,jnrD;
1415     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1416     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1417     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1418     real             rcutoff_scalar;
1419     real             *shiftvec,*fshift,*x,*f;
1420     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1421     real             scratch[4*DIM];
1422     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1423     int              vdwioffset0;
1424     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1425     int              vdwioffset1;
1426     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1427     int              vdwioffset2;
1428     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1429     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1430     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1431     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1432     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1433     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1434     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1435     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1436     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1437     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1438     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1439     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1440     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1441     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1442     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1443     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1444     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1445     real             *charge;
1446     __m128i          ewitab;
1447     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1448     real             *ewtab;
1449     __m128           dummy_mask,cutoff_mask;
1450     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1451     __m128           one     = _mm_set1_ps(1.0);
1452     __m128           two     = _mm_set1_ps(2.0);
1453     x                = xx[0];
1454     f                = ff[0];
1455
1456     nri              = nlist->nri;
1457     iinr             = nlist->iinr;
1458     jindex           = nlist->jindex;
1459     jjnr             = nlist->jjnr;
1460     shiftidx         = nlist->shift;
1461     gid              = nlist->gid;
1462     shiftvec         = fr->shift_vec[0];
1463     fshift           = fr->fshift[0];
1464     facel            = _mm_set1_ps(fr->epsfac);
1465     charge           = mdatoms->chargeA;
1466
1467     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1468     ewtab            = fr->ic->tabq_coul_F;
1469     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1470     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1471
1472     /* Setup water-specific parameters */
1473     inr              = nlist->iinr[0];
1474     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1475     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1476     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1477
1478     jq0              = _mm_set1_ps(charge[inr+0]);
1479     jq1              = _mm_set1_ps(charge[inr+1]);
1480     jq2              = _mm_set1_ps(charge[inr+2]);
1481     qq00             = _mm_mul_ps(iq0,jq0);
1482     qq01             = _mm_mul_ps(iq0,jq1);
1483     qq02             = _mm_mul_ps(iq0,jq2);
1484     qq10             = _mm_mul_ps(iq1,jq0);
1485     qq11             = _mm_mul_ps(iq1,jq1);
1486     qq12             = _mm_mul_ps(iq1,jq2);
1487     qq20             = _mm_mul_ps(iq2,jq0);
1488     qq21             = _mm_mul_ps(iq2,jq1);
1489     qq22             = _mm_mul_ps(iq2,jq2);
1490
1491     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1492     rcutoff_scalar   = fr->rcoulomb;
1493     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1494     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1495
1496     /* Avoid stupid compiler warnings */
1497     jnrA = jnrB = jnrC = jnrD = 0;
1498     j_coord_offsetA = 0;
1499     j_coord_offsetB = 0;
1500     j_coord_offsetC = 0;
1501     j_coord_offsetD = 0;
1502
1503     outeriter        = 0;
1504     inneriter        = 0;
1505
1506     for(iidx=0;iidx<4*DIM;iidx++)
1507     {
1508         scratch[iidx] = 0.0;
1509     }  
1510
1511     /* Start outer loop over neighborlists */
1512     for(iidx=0; iidx<nri; iidx++)
1513     {
1514         /* Load shift vector for this list */
1515         i_shift_offset   = DIM*shiftidx[iidx];
1516
1517         /* Load limits for loop over neighbors */
1518         j_index_start    = jindex[iidx];
1519         j_index_end      = jindex[iidx+1];
1520
1521         /* Get outer coordinate index */
1522         inr              = iinr[iidx];
1523         i_coord_offset   = DIM*inr;
1524
1525         /* Load i particle coords and add shift vector */
1526         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1527                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1528         
1529         fix0             = _mm_setzero_ps();
1530         fiy0             = _mm_setzero_ps();
1531         fiz0             = _mm_setzero_ps();
1532         fix1             = _mm_setzero_ps();
1533         fiy1             = _mm_setzero_ps();
1534         fiz1             = _mm_setzero_ps();
1535         fix2             = _mm_setzero_ps();
1536         fiy2             = _mm_setzero_ps();
1537         fiz2             = _mm_setzero_ps();
1538
1539         /* Start inner kernel loop */
1540         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1541         {
1542
1543             /* Get j neighbor index, and coordinate index */
1544             jnrA             = jjnr[jidx];
1545             jnrB             = jjnr[jidx+1];
1546             jnrC             = jjnr[jidx+2];
1547             jnrD             = jjnr[jidx+3];
1548             j_coord_offsetA  = DIM*jnrA;
1549             j_coord_offsetB  = DIM*jnrB;
1550             j_coord_offsetC  = DIM*jnrC;
1551             j_coord_offsetD  = DIM*jnrD;
1552
1553             /* load j atom coordinates */
1554             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1555                                               x+j_coord_offsetC,x+j_coord_offsetD,
1556                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1557
1558             /* Calculate displacement vector */
1559             dx00             = _mm_sub_ps(ix0,jx0);
1560             dy00             = _mm_sub_ps(iy0,jy0);
1561             dz00             = _mm_sub_ps(iz0,jz0);
1562             dx01             = _mm_sub_ps(ix0,jx1);
1563             dy01             = _mm_sub_ps(iy0,jy1);
1564             dz01             = _mm_sub_ps(iz0,jz1);
1565             dx02             = _mm_sub_ps(ix0,jx2);
1566             dy02             = _mm_sub_ps(iy0,jy2);
1567             dz02             = _mm_sub_ps(iz0,jz2);
1568             dx10             = _mm_sub_ps(ix1,jx0);
1569             dy10             = _mm_sub_ps(iy1,jy0);
1570             dz10             = _mm_sub_ps(iz1,jz0);
1571             dx11             = _mm_sub_ps(ix1,jx1);
1572             dy11             = _mm_sub_ps(iy1,jy1);
1573             dz11             = _mm_sub_ps(iz1,jz1);
1574             dx12             = _mm_sub_ps(ix1,jx2);
1575             dy12             = _mm_sub_ps(iy1,jy2);
1576             dz12             = _mm_sub_ps(iz1,jz2);
1577             dx20             = _mm_sub_ps(ix2,jx0);
1578             dy20             = _mm_sub_ps(iy2,jy0);
1579             dz20             = _mm_sub_ps(iz2,jz0);
1580             dx21             = _mm_sub_ps(ix2,jx1);
1581             dy21             = _mm_sub_ps(iy2,jy1);
1582             dz21             = _mm_sub_ps(iz2,jz1);
1583             dx22             = _mm_sub_ps(ix2,jx2);
1584             dy22             = _mm_sub_ps(iy2,jy2);
1585             dz22             = _mm_sub_ps(iz2,jz2);
1586
1587             /* Calculate squared distance and things based on it */
1588             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1589             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1590             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1591             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1592             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1593             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1594             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1595             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1596             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1597
1598             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1599             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1600             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1601             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1602             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1603             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1604             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1605             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1606             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1607
1608             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1609             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1610             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1611             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1612             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1613             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1614             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1615             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1616             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1617
1618             fjx0             = _mm_setzero_ps();
1619             fjy0             = _mm_setzero_ps();
1620             fjz0             = _mm_setzero_ps();
1621             fjx1             = _mm_setzero_ps();
1622             fjy1             = _mm_setzero_ps();
1623             fjz1             = _mm_setzero_ps();
1624             fjx2             = _mm_setzero_ps();
1625             fjy2             = _mm_setzero_ps();
1626             fjz2             = _mm_setzero_ps();
1627
1628             /**************************
1629              * CALCULATE INTERACTIONS *
1630              **************************/
1631
1632             if (gmx_mm_any_lt(rsq00,rcutoff2))
1633             {
1634
1635             r00              = _mm_mul_ps(rsq00,rinv00);
1636
1637             /* EWALD ELECTROSTATICS */
1638
1639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640             ewrt             = _mm_mul_ps(r00,ewtabscale);
1641             ewitab           = _mm_cvttps_epi32(ewrt);
1642             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1643             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1644                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1645                                          &ewtabF,&ewtabFn);
1646             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1647             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1648
1649             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1650
1651             fscal            = felec;
1652
1653             fscal            = _mm_and_ps(fscal,cutoff_mask);
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm_mul_ps(fscal,dx00);
1657             ty               = _mm_mul_ps(fscal,dy00);
1658             tz               = _mm_mul_ps(fscal,dz00);
1659
1660             /* Update vectorial force */
1661             fix0             = _mm_add_ps(fix0,tx);
1662             fiy0             = _mm_add_ps(fiy0,ty);
1663             fiz0             = _mm_add_ps(fiz0,tz);
1664
1665             fjx0             = _mm_add_ps(fjx0,tx);
1666             fjy0             = _mm_add_ps(fjy0,ty);
1667             fjz0             = _mm_add_ps(fjz0,tz);
1668             
1669             }
1670
1671             /**************************
1672              * CALCULATE INTERACTIONS *
1673              **************************/
1674
1675             if (gmx_mm_any_lt(rsq01,rcutoff2))
1676             {
1677
1678             r01              = _mm_mul_ps(rsq01,rinv01);
1679
1680             /* EWALD ELECTROSTATICS */
1681
1682             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683             ewrt             = _mm_mul_ps(r01,ewtabscale);
1684             ewitab           = _mm_cvttps_epi32(ewrt);
1685             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1686             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1687                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1688                                          &ewtabF,&ewtabFn);
1689             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1690             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1691
1692             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1693
1694             fscal            = felec;
1695
1696             fscal            = _mm_and_ps(fscal,cutoff_mask);
1697
1698             /* Calculate temporary vectorial force */
1699             tx               = _mm_mul_ps(fscal,dx01);
1700             ty               = _mm_mul_ps(fscal,dy01);
1701             tz               = _mm_mul_ps(fscal,dz01);
1702
1703             /* Update vectorial force */
1704             fix0             = _mm_add_ps(fix0,tx);
1705             fiy0             = _mm_add_ps(fiy0,ty);
1706             fiz0             = _mm_add_ps(fiz0,tz);
1707
1708             fjx1             = _mm_add_ps(fjx1,tx);
1709             fjy1             = _mm_add_ps(fjy1,ty);
1710             fjz1             = _mm_add_ps(fjz1,tz);
1711             
1712             }
1713
1714             /**************************
1715              * CALCULATE INTERACTIONS *
1716              **************************/
1717
1718             if (gmx_mm_any_lt(rsq02,rcutoff2))
1719             {
1720
1721             r02              = _mm_mul_ps(rsq02,rinv02);
1722
1723             /* EWALD ELECTROSTATICS */
1724
1725             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726             ewrt             = _mm_mul_ps(r02,ewtabscale);
1727             ewitab           = _mm_cvttps_epi32(ewrt);
1728             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1729             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1730                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1731                                          &ewtabF,&ewtabFn);
1732             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1733             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1734
1735             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1736
1737             fscal            = felec;
1738
1739             fscal            = _mm_and_ps(fscal,cutoff_mask);
1740
1741             /* Calculate temporary vectorial force */
1742             tx               = _mm_mul_ps(fscal,dx02);
1743             ty               = _mm_mul_ps(fscal,dy02);
1744             tz               = _mm_mul_ps(fscal,dz02);
1745
1746             /* Update vectorial force */
1747             fix0             = _mm_add_ps(fix0,tx);
1748             fiy0             = _mm_add_ps(fiy0,ty);
1749             fiz0             = _mm_add_ps(fiz0,tz);
1750
1751             fjx2             = _mm_add_ps(fjx2,tx);
1752             fjy2             = _mm_add_ps(fjy2,ty);
1753             fjz2             = _mm_add_ps(fjz2,tz);
1754             
1755             }
1756
1757             /**************************
1758              * CALCULATE INTERACTIONS *
1759              **************************/
1760
1761             if (gmx_mm_any_lt(rsq10,rcutoff2))
1762             {
1763
1764             r10              = _mm_mul_ps(rsq10,rinv10);
1765
1766             /* EWALD ELECTROSTATICS */
1767
1768             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769             ewrt             = _mm_mul_ps(r10,ewtabscale);
1770             ewitab           = _mm_cvttps_epi32(ewrt);
1771             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1772             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1773                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1774                                          &ewtabF,&ewtabFn);
1775             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1776             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1777
1778             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1779
1780             fscal            = felec;
1781
1782             fscal            = _mm_and_ps(fscal,cutoff_mask);
1783
1784             /* Calculate temporary vectorial force */
1785             tx               = _mm_mul_ps(fscal,dx10);
1786             ty               = _mm_mul_ps(fscal,dy10);
1787             tz               = _mm_mul_ps(fscal,dz10);
1788
1789             /* Update vectorial force */
1790             fix1             = _mm_add_ps(fix1,tx);
1791             fiy1             = _mm_add_ps(fiy1,ty);
1792             fiz1             = _mm_add_ps(fiz1,tz);
1793
1794             fjx0             = _mm_add_ps(fjx0,tx);
1795             fjy0             = _mm_add_ps(fjy0,ty);
1796             fjz0             = _mm_add_ps(fjz0,tz);
1797             
1798             }
1799
1800             /**************************
1801              * CALCULATE INTERACTIONS *
1802              **************************/
1803
1804             if (gmx_mm_any_lt(rsq11,rcutoff2))
1805             {
1806
1807             r11              = _mm_mul_ps(rsq11,rinv11);
1808
1809             /* EWALD ELECTROSTATICS */
1810
1811             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1812             ewrt             = _mm_mul_ps(r11,ewtabscale);
1813             ewitab           = _mm_cvttps_epi32(ewrt);
1814             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1815             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1816                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1817                                          &ewtabF,&ewtabFn);
1818             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1819             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1820
1821             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1822
1823             fscal            = felec;
1824
1825             fscal            = _mm_and_ps(fscal,cutoff_mask);
1826
1827             /* Calculate temporary vectorial force */
1828             tx               = _mm_mul_ps(fscal,dx11);
1829             ty               = _mm_mul_ps(fscal,dy11);
1830             tz               = _mm_mul_ps(fscal,dz11);
1831
1832             /* Update vectorial force */
1833             fix1             = _mm_add_ps(fix1,tx);
1834             fiy1             = _mm_add_ps(fiy1,ty);
1835             fiz1             = _mm_add_ps(fiz1,tz);
1836
1837             fjx1             = _mm_add_ps(fjx1,tx);
1838             fjy1             = _mm_add_ps(fjy1,ty);
1839             fjz1             = _mm_add_ps(fjz1,tz);
1840             
1841             }
1842
1843             /**************************
1844              * CALCULATE INTERACTIONS *
1845              **************************/
1846
1847             if (gmx_mm_any_lt(rsq12,rcutoff2))
1848             {
1849
1850             r12              = _mm_mul_ps(rsq12,rinv12);
1851
1852             /* EWALD ELECTROSTATICS */
1853
1854             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1855             ewrt             = _mm_mul_ps(r12,ewtabscale);
1856             ewitab           = _mm_cvttps_epi32(ewrt);
1857             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1858             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1859                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1860                                          &ewtabF,&ewtabFn);
1861             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1862             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1863
1864             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1865
1866             fscal            = felec;
1867
1868             fscal            = _mm_and_ps(fscal,cutoff_mask);
1869
1870             /* Calculate temporary vectorial force */
1871             tx               = _mm_mul_ps(fscal,dx12);
1872             ty               = _mm_mul_ps(fscal,dy12);
1873             tz               = _mm_mul_ps(fscal,dz12);
1874
1875             /* Update vectorial force */
1876             fix1             = _mm_add_ps(fix1,tx);
1877             fiy1             = _mm_add_ps(fiy1,ty);
1878             fiz1             = _mm_add_ps(fiz1,tz);
1879
1880             fjx2             = _mm_add_ps(fjx2,tx);
1881             fjy2             = _mm_add_ps(fjy2,ty);
1882             fjz2             = _mm_add_ps(fjz2,tz);
1883             
1884             }
1885
1886             /**************************
1887              * CALCULATE INTERACTIONS *
1888              **************************/
1889
1890             if (gmx_mm_any_lt(rsq20,rcutoff2))
1891             {
1892
1893             r20              = _mm_mul_ps(rsq20,rinv20);
1894
1895             /* EWALD ELECTROSTATICS */
1896
1897             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1898             ewrt             = _mm_mul_ps(r20,ewtabscale);
1899             ewitab           = _mm_cvttps_epi32(ewrt);
1900             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1901             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1902                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1903                                          &ewtabF,&ewtabFn);
1904             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1905             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1906
1907             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1908
1909             fscal            = felec;
1910
1911             fscal            = _mm_and_ps(fscal,cutoff_mask);
1912
1913             /* Calculate temporary vectorial force */
1914             tx               = _mm_mul_ps(fscal,dx20);
1915             ty               = _mm_mul_ps(fscal,dy20);
1916             tz               = _mm_mul_ps(fscal,dz20);
1917
1918             /* Update vectorial force */
1919             fix2             = _mm_add_ps(fix2,tx);
1920             fiy2             = _mm_add_ps(fiy2,ty);
1921             fiz2             = _mm_add_ps(fiz2,tz);
1922
1923             fjx0             = _mm_add_ps(fjx0,tx);
1924             fjy0             = _mm_add_ps(fjy0,ty);
1925             fjz0             = _mm_add_ps(fjz0,tz);
1926             
1927             }
1928
1929             /**************************
1930              * CALCULATE INTERACTIONS *
1931              **************************/
1932
1933             if (gmx_mm_any_lt(rsq21,rcutoff2))
1934             {
1935
1936             r21              = _mm_mul_ps(rsq21,rinv21);
1937
1938             /* EWALD ELECTROSTATICS */
1939
1940             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1941             ewrt             = _mm_mul_ps(r21,ewtabscale);
1942             ewitab           = _mm_cvttps_epi32(ewrt);
1943             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1944             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1945                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1946                                          &ewtabF,&ewtabFn);
1947             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1948             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1949
1950             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1951
1952             fscal            = felec;
1953
1954             fscal            = _mm_and_ps(fscal,cutoff_mask);
1955
1956             /* Calculate temporary vectorial force */
1957             tx               = _mm_mul_ps(fscal,dx21);
1958             ty               = _mm_mul_ps(fscal,dy21);
1959             tz               = _mm_mul_ps(fscal,dz21);
1960
1961             /* Update vectorial force */
1962             fix2             = _mm_add_ps(fix2,tx);
1963             fiy2             = _mm_add_ps(fiy2,ty);
1964             fiz2             = _mm_add_ps(fiz2,tz);
1965
1966             fjx1             = _mm_add_ps(fjx1,tx);
1967             fjy1             = _mm_add_ps(fjy1,ty);
1968             fjz1             = _mm_add_ps(fjz1,tz);
1969             
1970             }
1971
1972             /**************************
1973              * CALCULATE INTERACTIONS *
1974              **************************/
1975
1976             if (gmx_mm_any_lt(rsq22,rcutoff2))
1977             {
1978
1979             r22              = _mm_mul_ps(rsq22,rinv22);
1980
1981             /* EWALD ELECTROSTATICS */
1982
1983             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1984             ewrt             = _mm_mul_ps(r22,ewtabscale);
1985             ewitab           = _mm_cvttps_epi32(ewrt);
1986             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1987             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1988                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1989                                          &ewtabF,&ewtabFn);
1990             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1991             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1992
1993             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1994
1995             fscal            = felec;
1996
1997             fscal            = _mm_and_ps(fscal,cutoff_mask);
1998
1999             /* Calculate temporary vectorial force */
2000             tx               = _mm_mul_ps(fscal,dx22);
2001             ty               = _mm_mul_ps(fscal,dy22);
2002             tz               = _mm_mul_ps(fscal,dz22);
2003
2004             /* Update vectorial force */
2005             fix2             = _mm_add_ps(fix2,tx);
2006             fiy2             = _mm_add_ps(fiy2,ty);
2007             fiz2             = _mm_add_ps(fiz2,tz);
2008
2009             fjx2             = _mm_add_ps(fjx2,tx);
2010             fjy2             = _mm_add_ps(fjy2,ty);
2011             fjz2             = _mm_add_ps(fjz2,tz);
2012             
2013             }
2014
2015             fjptrA             = f+j_coord_offsetA;
2016             fjptrB             = f+j_coord_offsetB;
2017             fjptrC             = f+j_coord_offsetC;
2018             fjptrD             = f+j_coord_offsetD;
2019
2020             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2021                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2022
2023             /* Inner loop uses 351 flops */
2024         }
2025
2026         if(jidx<j_index_end)
2027         {
2028
2029             /* Get j neighbor index, and coordinate index */
2030             jnrlistA         = jjnr[jidx];
2031             jnrlistB         = jjnr[jidx+1];
2032             jnrlistC         = jjnr[jidx+2];
2033             jnrlistD         = jjnr[jidx+3];
2034             /* Sign of each element will be negative for non-real atoms.
2035              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2036              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2037              */
2038             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2039             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2040             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2041             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2042             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2043             j_coord_offsetA  = DIM*jnrA;
2044             j_coord_offsetB  = DIM*jnrB;
2045             j_coord_offsetC  = DIM*jnrC;
2046             j_coord_offsetD  = DIM*jnrD;
2047
2048             /* load j atom coordinates */
2049             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2050                                               x+j_coord_offsetC,x+j_coord_offsetD,
2051                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2052
2053             /* Calculate displacement vector */
2054             dx00             = _mm_sub_ps(ix0,jx0);
2055             dy00             = _mm_sub_ps(iy0,jy0);
2056             dz00             = _mm_sub_ps(iz0,jz0);
2057             dx01             = _mm_sub_ps(ix0,jx1);
2058             dy01             = _mm_sub_ps(iy0,jy1);
2059             dz01             = _mm_sub_ps(iz0,jz1);
2060             dx02             = _mm_sub_ps(ix0,jx2);
2061             dy02             = _mm_sub_ps(iy0,jy2);
2062             dz02             = _mm_sub_ps(iz0,jz2);
2063             dx10             = _mm_sub_ps(ix1,jx0);
2064             dy10             = _mm_sub_ps(iy1,jy0);
2065             dz10             = _mm_sub_ps(iz1,jz0);
2066             dx11             = _mm_sub_ps(ix1,jx1);
2067             dy11             = _mm_sub_ps(iy1,jy1);
2068             dz11             = _mm_sub_ps(iz1,jz1);
2069             dx12             = _mm_sub_ps(ix1,jx2);
2070             dy12             = _mm_sub_ps(iy1,jy2);
2071             dz12             = _mm_sub_ps(iz1,jz2);
2072             dx20             = _mm_sub_ps(ix2,jx0);
2073             dy20             = _mm_sub_ps(iy2,jy0);
2074             dz20             = _mm_sub_ps(iz2,jz0);
2075             dx21             = _mm_sub_ps(ix2,jx1);
2076             dy21             = _mm_sub_ps(iy2,jy1);
2077             dz21             = _mm_sub_ps(iz2,jz1);
2078             dx22             = _mm_sub_ps(ix2,jx2);
2079             dy22             = _mm_sub_ps(iy2,jy2);
2080             dz22             = _mm_sub_ps(iz2,jz2);
2081
2082             /* Calculate squared distance and things based on it */
2083             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2084             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2085             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2086             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2087             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2088             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2089             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2090             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2091             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2092
2093             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2094             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2095             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2096             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2097             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2098             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2099             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2100             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2101             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2102
2103             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2104             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2105             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2106             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2107             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2108             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2109             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2110             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2111             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2112
2113             fjx0             = _mm_setzero_ps();
2114             fjy0             = _mm_setzero_ps();
2115             fjz0             = _mm_setzero_ps();
2116             fjx1             = _mm_setzero_ps();
2117             fjy1             = _mm_setzero_ps();
2118             fjz1             = _mm_setzero_ps();
2119             fjx2             = _mm_setzero_ps();
2120             fjy2             = _mm_setzero_ps();
2121             fjz2             = _mm_setzero_ps();
2122
2123             /**************************
2124              * CALCULATE INTERACTIONS *
2125              **************************/
2126
2127             if (gmx_mm_any_lt(rsq00,rcutoff2))
2128             {
2129
2130             r00              = _mm_mul_ps(rsq00,rinv00);
2131             r00              = _mm_andnot_ps(dummy_mask,r00);
2132
2133             /* EWALD ELECTROSTATICS */
2134
2135             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2136             ewrt             = _mm_mul_ps(r00,ewtabscale);
2137             ewitab           = _mm_cvttps_epi32(ewrt);
2138             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2139             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2140                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2141                                          &ewtabF,&ewtabFn);
2142             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2143             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2144
2145             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2146
2147             fscal            = felec;
2148
2149             fscal            = _mm_and_ps(fscal,cutoff_mask);
2150
2151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2152
2153             /* Calculate temporary vectorial force */
2154             tx               = _mm_mul_ps(fscal,dx00);
2155             ty               = _mm_mul_ps(fscal,dy00);
2156             tz               = _mm_mul_ps(fscal,dz00);
2157
2158             /* Update vectorial force */
2159             fix0             = _mm_add_ps(fix0,tx);
2160             fiy0             = _mm_add_ps(fiy0,ty);
2161             fiz0             = _mm_add_ps(fiz0,tz);
2162
2163             fjx0             = _mm_add_ps(fjx0,tx);
2164             fjy0             = _mm_add_ps(fjy0,ty);
2165             fjz0             = _mm_add_ps(fjz0,tz);
2166             
2167             }
2168
2169             /**************************
2170              * CALCULATE INTERACTIONS *
2171              **************************/
2172
2173             if (gmx_mm_any_lt(rsq01,rcutoff2))
2174             {
2175
2176             r01              = _mm_mul_ps(rsq01,rinv01);
2177             r01              = _mm_andnot_ps(dummy_mask,r01);
2178
2179             /* EWALD ELECTROSTATICS */
2180
2181             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2182             ewrt             = _mm_mul_ps(r01,ewtabscale);
2183             ewitab           = _mm_cvttps_epi32(ewrt);
2184             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2185             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2186                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2187                                          &ewtabF,&ewtabFn);
2188             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2189             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2190
2191             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2192
2193             fscal            = felec;
2194
2195             fscal            = _mm_and_ps(fscal,cutoff_mask);
2196
2197             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2198
2199             /* Calculate temporary vectorial force */
2200             tx               = _mm_mul_ps(fscal,dx01);
2201             ty               = _mm_mul_ps(fscal,dy01);
2202             tz               = _mm_mul_ps(fscal,dz01);
2203
2204             /* Update vectorial force */
2205             fix0             = _mm_add_ps(fix0,tx);
2206             fiy0             = _mm_add_ps(fiy0,ty);
2207             fiz0             = _mm_add_ps(fiz0,tz);
2208
2209             fjx1             = _mm_add_ps(fjx1,tx);
2210             fjy1             = _mm_add_ps(fjy1,ty);
2211             fjz1             = _mm_add_ps(fjz1,tz);
2212             
2213             }
2214
2215             /**************************
2216              * CALCULATE INTERACTIONS *
2217              **************************/
2218
2219             if (gmx_mm_any_lt(rsq02,rcutoff2))
2220             {
2221
2222             r02              = _mm_mul_ps(rsq02,rinv02);
2223             r02              = _mm_andnot_ps(dummy_mask,r02);
2224
2225             /* EWALD ELECTROSTATICS */
2226
2227             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228             ewrt             = _mm_mul_ps(r02,ewtabscale);
2229             ewitab           = _mm_cvttps_epi32(ewrt);
2230             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2231             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2232                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2233                                          &ewtabF,&ewtabFn);
2234             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2235             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2236
2237             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2238
2239             fscal            = felec;
2240
2241             fscal            = _mm_and_ps(fscal,cutoff_mask);
2242
2243             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2244
2245             /* Calculate temporary vectorial force */
2246             tx               = _mm_mul_ps(fscal,dx02);
2247             ty               = _mm_mul_ps(fscal,dy02);
2248             tz               = _mm_mul_ps(fscal,dz02);
2249
2250             /* Update vectorial force */
2251             fix0             = _mm_add_ps(fix0,tx);
2252             fiy0             = _mm_add_ps(fiy0,ty);
2253             fiz0             = _mm_add_ps(fiz0,tz);
2254
2255             fjx2             = _mm_add_ps(fjx2,tx);
2256             fjy2             = _mm_add_ps(fjy2,ty);
2257             fjz2             = _mm_add_ps(fjz2,tz);
2258             
2259             }
2260
2261             /**************************
2262              * CALCULATE INTERACTIONS *
2263              **************************/
2264
2265             if (gmx_mm_any_lt(rsq10,rcutoff2))
2266             {
2267
2268             r10              = _mm_mul_ps(rsq10,rinv10);
2269             r10              = _mm_andnot_ps(dummy_mask,r10);
2270
2271             /* EWALD ELECTROSTATICS */
2272
2273             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2274             ewrt             = _mm_mul_ps(r10,ewtabscale);
2275             ewitab           = _mm_cvttps_epi32(ewrt);
2276             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2277             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2278                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2279                                          &ewtabF,&ewtabFn);
2280             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2281             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2282
2283             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2284
2285             fscal            = felec;
2286
2287             fscal            = _mm_and_ps(fscal,cutoff_mask);
2288
2289             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2290
2291             /* Calculate temporary vectorial force */
2292             tx               = _mm_mul_ps(fscal,dx10);
2293             ty               = _mm_mul_ps(fscal,dy10);
2294             tz               = _mm_mul_ps(fscal,dz10);
2295
2296             /* Update vectorial force */
2297             fix1             = _mm_add_ps(fix1,tx);
2298             fiy1             = _mm_add_ps(fiy1,ty);
2299             fiz1             = _mm_add_ps(fiz1,tz);
2300
2301             fjx0             = _mm_add_ps(fjx0,tx);
2302             fjy0             = _mm_add_ps(fjy0,ty);
2303             fjz0             = _mm_add_ps(fjz0,tz);
2304             
2305             }
2306
2307             /**************************
2308              * CALCULATE INTERACTIONS *
2309              **************************/
2310
2311             if (gmx_mm_any_lt(rsq11,rcutoff2))
2312             {
2313
2314             r11              = _mm_mul_ps(rsq11,rinv11);
2315             r11              = _mm_andnot_ps(dummy_mask,r11);
2316
2317             /* EWALD ELECTROSTATICS */
2318
2319             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2320             ewrt             = _mm_mul_ps(r11,ewtabscale);
2321             ewitab           = _mm_cvttps_epi32(ewrt);
2322             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2323             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2324                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2325                                          &ewtabF,&ewtabFn);
2326             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2327             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2328
2329             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2330
2331             fscal            = felec;
2332
2333             fscal            = _mm_and_ps(fscal,cutoff_mask);
2334
2335             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2336
2337             /* Calculate temporary vectorial force */
2338             tx               = _mm_mul_ps(fscal,dx11);
2339             ty               = _mm_mul_ps(fscal,dy11);
2340             tz               = _mm_mul_ps(fscal,dz11);
2341
2342             /* Update vectorial force */
2343             fix1             = _mm_add_ps(fix1,tx);
2344             fiy1             = _mm_add_ps(fiy1,ty);
2345             fiz1             = _mm_add_ps(fiz1,tz);
2346
2347             fjx1             = _mm_add_ps(fjx1,tx);
2348             fjy1             = _mm_add_ps(fjy1,ty);
2349             fjz1             = _mm_add_ps(fjz1,tz);
2350             
2351             }
2352
2353             /**************************
2354              * CALCULATE INTERACTIONS *
2355              **************************/
2356
2357             if (gmx_mm_any_lt(rsq12,rcutoff2))
2358             {
2359
2360             r12              = _mm_mul_ps(rsq12,rinv12);
2361             r12              = _mm_andnot_ps(dummy_mask,r12);
2362
2363             /* EWALD ELECTROSTATICS */
2364
2365             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2366             ewrt             = _mm_mul_ps(r12,ewtabscale);
2367             ewitab           = _mm_cvttps_epi32(ewrt);
2368             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2369             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2370                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2371                                          &ewtabF,&ewtabFn);
2372             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2373             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2374
2375             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2376
2377             fscal            = felec;
2378
2379             fscal            = _mm_and_ps(fscal,cutoff_mask);
2380
2381             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2382
2383             /* Calculate temporary vectorial force */
2384             tx               = _mm_mul_ps(fscal,dx12);
2385             ty               = _mm_mul_ps(fscal,dy12);
2386             tz               = _mm_mul_ps(fscal,dz12);
2387
2388             /* Update vectorial force */
2389             fix1             = _mm_add_ps(fix1,tx);
2390             fiy1             = _mm_add_ps(fiy1,ty);
2391             fiz1             = _mm_add_ps(fiz1,tz);
2392
2393             fjx2             = _mm_add_ps(fjx2,tx);
2394             fjy2             = _mm_add_ps(fjy2,ty);
2395             fjz2             = _mm_add_ps(fjz2,tz);
2396             
2397             }
2398
2399             /**************************
2400              * CALCULATE INTERACTIONS *
2401              **************************/
2402
2403             if (gmx_mm_any_lt(rsq20,rcutoff2))
2404             {
2405
2406             r20              = _mm_mul_ps(rsq20,rinv20);
2407             r20              = _mm_andnot_ps(dummy_mask,r20);
2408
2409             /* EWALD ELECTROSTATICS */
2410
2411             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2412             ewrt             = _mm_mul_ps(r20,ewtabscale);
2413             ewitab           = _mm_cvttps_epi32(ewrt);
2414             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2415             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2416                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2417                                          &ewtabF,&ewtabFn);
2418             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2419             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2420
2421             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2422
2423             fscal            = felec;
2424
2425             fscal            = _mm_and_ps(fscal,cutoff_mask);
2426
2427             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2428
2429             /* Calculate temporary vectorial force */
2430             tx               = _mm_mul_ps(fscal,dx20);
2431             ty               = _mm_mul_ps(fscal,dy20);
2432             tz               = _mm_mul_ps(fscal,dz20);
2433
2434             /* Update vectorial force */
2435             fix2             = _mm_add_ps(fix2,tx);
2436             fiy2             = _mm_add_ps(fiy2,ty);
2437             fiz2             = _mm_add_ps(fiz2,tz);
2438
2439             fjx0             = _mm_add_ps(fjx0,tx);
2440             fjy0             = _mm_add_ps(fjy0,ty);
2441             fjz0             = _mm_add_ps(fjz0,tz);
2442             
2443             }
2444
2445             /**************************
2446              * CALCULATE INTERACTIONS *
2447              **************************/
2448
2449             if (gmx_mm_any_lt(rsq21,rcutoff2))
2450             {
2451
2452             r21              = _mm_mul_ps(rsq21,rinv21);
2453             r21              = _mm_andnot_ps(dummy_mask,r21);
2454
2455             /* EWALD ELECTROSTATICS */
2456
2457             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2458             ewrt             = _mm_mul_ps(r21,ewtabscale);
2459             ewitab           = _mm_cvttps_epi32(ewrt);
2460             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2461             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2462                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2463                                          &ewtabF,&ewtabFn);
2464             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2465             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2466
2467             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2468
2469             fscal            = felec;
2470
2471             fscal            = _mm_and_ps(fscal,cutoff_mask);
2472
2473             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2474
2475             /* Calculate temporary vectorial force */
2476             tx               = _mm_mul_ps(fscal,dx21);
2477             ty               = _mm_mul_ps(fscal,dy21);
2478             tz               = _mm_mul_ps(fscal,dz21);
2479
2480             /* Update vectorial force */
2481             fix2             = _mm_add_ps(fix2,tx);
2482             fiy2             = _mm_add_ps(fiy2,ty);
2483             fiz2             = _mm_add_ps(fiz2,tz);
2484
2485             fjx1             = _mm_add_ps(fjx1,tx);
2486             fjy1             = _mm_add_ps(fjy1,ty);
2487             fjz1             = _mm_add_ps(fjz1,tz);
2488             
2489             }
2490
2491             /**************************
2492              * CALCULATE INTERACTIONS *
2493              **************************/
2494
2495             if (gmx_mm_any_lt(rsq22,rcutoff2))
2496             {
2497
2498             r22              = _mm_mul_ps(rsq22,rinv22);
2499             r22              = _mm_andnot_ps(dummy_mask,r22);
2500
2501             /* EWALD ELECTROSTATICS */
2502
2503             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2504             ewrt             = _mm_mul_ps(r22,ewtabscale);
2505             ewitab           = _mm_cvttps_epi32(ewrt);
2506             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2507             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2508                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2509                                          &ewtabF,&ewtabFn);
2510             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2511             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2512
2513             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2514
2515             fscal            = felec;
2516
2517             fscal            = _mm_and_ps(fscal,cutoff_mask);
2518
2519             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2520
2521             /* Calculate temporary vectorial force */
2522             tx               = _mm_mul_ps(fscal,dx22);
2523             ty               = _mm_mul_ps(fscal,dy22);
2524             tz               = _mm_mul_ps(fscal,dz22);
2525
2526             /* Update vectorial force */
2527             fix2             = _mm_add_ps(fix2,tx);
2528             fiy2             = _mm_add_ps(fiy2,ty);
2529             fiz2             = _mm_add_ps(fiz2,tz);
2530
2531             fjx2             = _mm_add_ps(fjx2,tx);
2532             fjy2             = _mm_add_ps(fjy2,ty);
2533             fjz2             = _mm_add_ps(fjz2,tz);
2534             
2535             }
2536
2537             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2538             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2539             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2540             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2541
2542             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2543                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2544
2545             /* Inner loop uses 360 flops */
2546         }
2547
2548         /* End of innermost loop */
2549
2550         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2551                                               f+i_coord_offset,fshift+i_shift_offset);
2552
2553         /* Increment number of inner iterations */
2554         inneriter                  += j_index_end - j_index_start;
2555
2556         /* Outer loop uses 18 flops */
2557     }
2558
2559     /* Increment number of outer iterations */
2560     outeriter        += nri;
2561
2562     /* Update outer/inner flops */
2563
2564     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
2565 }