Merge 'release-4-6' into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
93     real             *charge;
94     int              nvdwtype;
95     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96     int              *vdwtype;
97     real             *vdwparam;
98     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
99     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
100     __m128i          vfitab;
101     __m128i          ifour       = _mm_set1_epi32(4);
102     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
103     real             *vftab;
104     __m128           dummy_mask,cutoff_mask;
105     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106     __m128           one     = _mm_set1_ps(1.0);
107     __m128           two     = _mm_set1_ps(2.0);
108     x                = xx[0];
109     f                = ff[0];
110
111     nri              = nlist->nri;
112     iinr             = nlist->iinr;
113     jindex           = nlist->jindex;
114     jjnr             = nlist->jjnr;
115     shiftidx         = nlist->shift;
116     gid              = nlist->gid;
117     shiftvec         = fr->shift_vec[0];
118     fshift           = fr->fshift[0];
119     facel            = _mm_set1_ps(fr->epsfac);
120     charge           = mdatoms->chargeA;
121     krf              = _mm_set1_ps(fr->ic->k_rf);
122     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
123     crf              = _mm_set1_ps(fr->ic->c_rf);
124     nvdwtype         = fr->ntype;
125     vdwparam         = fr->nbfp;
126     vdwtype          = mdatoms->typeA;
127
128     vftab            = kernel_data->table_vdw->data;
129     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
136     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
137
138     jq1              = _mm_set1_ps(charge[inr+1]);
139     jq2              = _mm_set1_ps(charge[inr+2]);
140     jq3              = _mm_set1_ps(charge[inr+3]);
141     vdwjidx0A        = 2*vdwtype[inr+0];
142     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
143     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
144     qq11             = _mm_mul_ps(iq1,jq1);
145     qq12             = _mm_mul_ps(iq1,jq2);
146     qq13             = _mm_mul_ps(iq1,jq3);
147     qq21             = _mm_mul_ps(iq2,jq1);
148     qq22             = _mm_mul_ps(iq2,jq2);
149     qq23             = _mm_mul_ps(iq2,jq3);
150     qq31             = _mm_mul_ps(iq3,jq1);
151     qq32             = _mm_mul_ps(iq3,jq2);
152     qq33             = _mm_mul_ps(iq3,jq3);
153
154     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155     rcutoff_scalar   = fr->rcoulomb;
156     rcutoff          = _mm_set1_ps(rcutoff_scalar);
157     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
158
159     /* Avoid stupid compiler warnings */
160     jnrA = jnrB = jnrC = jnrD = 0;
161     j_coord_offsetA = 0;
162     j_coord_offsetB = 0;
163     j_coord_offsetC = 0;
164     j_coord_offsetD = 0;
165
166     outeriter        = 0;
167     inneriter        = 0;
168
169     /* Start outer loop over neighborlists */
170     for(iidx=0; iidx<nri; iidx++)
171     {
172         /* Load shift vector for this list */
173         i_shift_offset   = DIM*shiftidx[iidx];
174         shX              = shiftvec[i_shift_offset+XX];
175         shY              = shiftvec[i_shift_offset+YY];
176         shZ              = shiftvec[i_shift_offset+ZZ];
177
178         /* Load limits for loop over neighbors */
179         j_index_start    = jindex[iidx];
180         j_index_end      = jindex[iidx+1];
181
182         /* Get outer coordinate index */
183         inr              = iinr[iidx];
184         i_coord_offset   = DIM*inr;
185
186         /* Load i particle coords and add shift vector */
187         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
188         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
189         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
190         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
191         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
192         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
193         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
194         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
195         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
196         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
197         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
198         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
199
200         fix0             = _mm_setzero_ps();
201         fiy0             = _mm_setzero_ps();
202         fiz0             = _mm_setzero_ps();
203         fix1             = _mm_setzero_ps();
204         fiy1             = _mm_setzero_ps();
205         fiz1             = _mm_setzero_ps();
206         fix2             = _mm_setzero_ps();
207         fiy2             = _mm_setzero_ps();
208         fiz2             = _mm_setzero_ps();
209         fix3             = _mm_setzero_ps();
210         fiy3             = _mm_setzero_ps();
211         fiz3             = _mm_setzero_ps();
212
213         /* Reset potential sums */
214         velecsum         = _mm_setzero_ps();
215         vvdwsum          = _mm_setzero_ps();
216
217         /* Start inner kernel loop */
218         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
219         {
220
221             /* Get j neighbor index, and coordinate index */
222             jnrA             = jjnr[jidx];
223             jnrB             = jjnr[jidx+1];
224             jnrC             = jjnr[jidx+2];
225             jnrD             = jjnr[jidx+3];
226
227             j_coord_offsetA  = DIM*jnrA;
228             j_coord_offsetB  = DIM*jnrB;
229             j_coord_offsetC  = DIM*jnrC;
230             j_coord_offsetD  = DIM*jnrD;
231
232             /* load j atom coordinates */
233             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234                                               x+j_coord_offsetC,x+j_coord_offsetD,
235                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
236                                               &jy2,&jz2,&jx3,&jy3,&jz3);
237
238             /* Calculate displacement vector */
239             dx00             = _mm_sub_ps(ix0,jx0);
240             dy00             = _mm_sub_ps(iy0,jy0);
241             dz00             = _mm_sub_ps(iz0,jz0);
242             dx11             = _mm_sub_ps(ix1,jx1);
243             dy11             = _mm_sub_ps(iy1,jy1);
244             dz11             = _mm_sub_ps(iz1,jz1);
245             dx12             = _mm_sub_ps(ix1,jx2);
246             dy12             = _mm_sub_ps(iy1,jy2);
247             dz12             = _mm_sub_ps(iz1,jz2);
248             dx13             = _mm_sub_ps(ix1,jx3);
249             dy13             = _mm_sub_ps(iy1,jy3);
250             dz13             = _mm_sub_ps(iz1,jz3);
251             dx21             = _mm_sub_ps(ix2,jx1);
252             dy21             = _mm_sub_ps(iy2,jy1);
253             dz21             = _mm_sub_ps(iz2,jz1);
254             dx22             = _mm_sub_ps(ix2,jx2);
255             dy22             = _mm_sub_ps(iy2,jy2);
256             dz22             = _mm_sub_ps(iz2,jz2);
257             dx23             = _mm_sub_ps(ix2,jx3);
258             dy23             = _mm_sub_ps(iy2,jy3);
259             dz23             = _mm_sub_ps(iz2,jz3);
260             dx31             = _mm_sub_ps(ix3,jx1);
261             dy31             = _mm_sub_ps(iy3,jy1);
262             dz31             = _mm_sub_ps(iz3,jz1);
263             dx32             = _mm_sub_ps(ix3,jx2);
264             dy32             = _mm_sub_ps(iy3,jy2);
265             dz32             = _mm_sub_ps(iz3,jz2);
266             dx33             = _mm_sub_ps(ix3,jx3);
267             dy33             = _mm_sub_ps(iy3,jy3);
268             dz33             = _mm_sub_ps(iz3,jz3);
269
270             /* Calculate squared distance and things based on it */
271             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
272             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
275             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
277             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
278             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
279             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
280             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
281
282             rinv00           = gmx_mm_invsqrt_ps(rsq00);
283             rinv11           = gmx_mm_invsqrt_ps(rsq11);
284             rinv12           = gmx_mm_invsqrt_ps(rsq12);
285             rinv13           = gmx_mm_invsqrt_ps(rsq13);
286             rinv21           = gmx_mm_invsqrt_ps(rsq21);
287             rinv22           = gmx_mm_invsqrt_ps(rsq22);
288             rinv23           = gmx_mm_invsqrt_ps(rsq23);
289             rinv31           = gmx_mm_invsqrt_ps(rsq31);
290             rinv32           = gmx_mm_invsqrt_ps(rsq32);
291             rinv33           = gmx_mm_invsqrt_ps(rsq33);
292
293             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
294             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
295             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
296             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
297             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
298             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
299             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
300             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
301             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
302
303             fjx0             = _mm_setzero_ps();
304             fjy0             = _mm_setzero_ps();
305             fjz0             = _mm_setzero_ps();
306             fjx1             = _mm_setzero_ps();
307             fjy1             = _mm_setzero_ps();
308             fjz1             = _mm_setzero_ps();
309             fjx2             = _mm_setzero_ps();
310             fjy2             = _mm_setzero_ps();
311             fjz2             = _mm_setzero_ps();
312             fjx3             = _mm_setzero_ps();
313             fjy3             = _mm_setzero_ps();
314             fjz3             = _mm_setzero_ps();
315
316             /**************************
317              * CALCULATE INTERACTIONS *
318              **************************/
319
320             r00              = _mm_mul_ps(rsq00,rinv00);
321
322             /* Calculate table index by multiplying r with table scale and truncate to integer */
323             rt               = _mm_mul_ps(r00,vftabscale);
324             vfitab           = _mm_cvttps_epi32(rt);
325             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
326             vfitab           = _mm_slli_epi32(vfitab,3);
327
328             /* CUBIC SPLINE TABLE DISPERSION */
329             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
330             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
331             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
332             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
333             _MM_TRANSPOSE4_PS(Y,F,G,H);
334             Heps             = _mm_mul_ps(vfeps,H);
335             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
336             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
337             vvdw6            = _mm_mul_ps(c6_00,VV);
338             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
339             fvdw6            = _mm_mul_ps(c6_00,FF);
340
341             /* CUBIC SPLINE TABLE REPULSION */
342             vfitab           = _mm_add_epi32(vfitab,ifour);
343             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
344             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
345             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
346             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
347             _MM_TRANSPOSE4_PS(Y,F,G,H);
348             Heps             = _mm_mul_ps(vfeps,H);
349             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
350             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
351             vvdw12           = _mm_mul_ps(c12_00,VV);
352             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
353             fvdw12           = _mm_mul_ps(c12_00,FF);
354             vvdw             = _mm_add_ps(vvdw12,vvdw6);
355             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
356
357             /* Update potential sum for this i atom from the interaction with this j atom. */
358             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
359
360             fscal            = fvdw;
361
362             /* Calculate temporary vectorial force */
363             tx               = _mm_mul_ps(fscal,dx00);
364             ty               = _mm_mul_ps(fscal,dy00);
365             tz               = _mm_mul_ps(fscal,dz00);
366
367             /* Update vectorial force */
368             fix0             = _mm_add_ps(fix0,tx);
369             fiy0             = _mm_add_ps(fiy0,ty);
370             fiz0             = _mm_add_ps(fiz0,tz);
371
372             fjx0             = _mm_add_ps(fjx0,tx);
373             fjy0             = _mm_add_ps(fjy0,ty);
374             fjz0             = _mm_add_ps(fjz0,tz);
375
376             /**************************
377              * CALCULATE INTERACTIONS *
378              **************************/
379
380             if (gmx_mm_any_lt(rsq11,rcutoff2))
381             {
382
383             /* REACTION-FIELD ELECTROSTATICS */
384             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
385             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
386
387             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velec            = _mm_and_ps(velec,cutoff_mask);
391             velecsum         = _mm_add_ps(velecsum,velec);
392
393             fscal            = felec;
394
395             fscal            = _mm_and_ps(fscal,cutoff_mask);
396
397             /* Calculate temporary vectorial force */
398             tx               = _mm_mul_ps(fscal,dx11);
399             ty               = _mm_mul_ps(fscal,dy11);
400             tz               = _mm_mul_ps(fscal,dz11);
401
402             /* Update vectorial force */
403             fix1             = _mm_add_ps(fix1,tx);
404             fiy1             = _mm_add_ps(fiy1,ty);
405             fiz1             = _mm_add_ps(fiz1,tz);
406
407             fjx1             = _mm_add_ps(fjx1,tx);
408             fjy1             = _mm_add_ps(fjy1,ty);
409             fjz1             = _mm_add_ps(fjz1,tz);
410
411             }
412
413             /**************************
414              * CALCULATE INTERACTIONS *
415              **************************/
416
417             if (gmx_mm_any_lt(rsq12,rcutoff2))
418             {
419
420             /* REACTION-FIELD ELECTROSTATICS */
421             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
422             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
423
424             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
425
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velec            = _mm_and_ps(velec,cutoff_mask);
428             velecsum         = _mm_add_ps(velecsum,velec);
429
430             fscal            = felec;
431
432             fscal            = _mm_and_ps(fscal,cutoff_mask);
433
434             /* Calculate temporary vectorial force */
435             tx               = _mm_mul_ps(fscal,dx12);
436             ty               = _mm_mul_ps(fscal,dy12);
437             tz               = _mm_mul_ps(fscal,dz12);
438
439             /* Update vectorial force */
440             fix1             = _mm_add_ps(fix1,tx);
441             fiy1             = _mm_add_ps(fiy1,ty);
442             fiz1             = _mm_add_ps(fiz1,tz);
443
444             fjx2             = _mm_add_ps(fjx2,tx);
445             fjy2             = _mm_add_ps(fjy2,ty);
446             fjz2             = _mm_add_ps(fjz2,tz);
447
448             }
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             if (gmx_mm_any_lt(rsq13,rcutoff2))
455             {
456
457             /* REACTION-FIELD ELECTROSTATICS */
458             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
459             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
460
461             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
462
463             /* Update potential sum for this i atom from the interaction with this j atom. */
464             velec            = _mm_and_ps(velec,cutoff_mask);
465             velecsum         = _mm_add_ps(velecsum,velec);
466
467             fscal            = felec;
468
469             fscal            = _mm_and_ps(fscal,cutoff_mask);
470
471             /* Calculate temporary vectorial force */
472             tx               = _mm_mul_ps(fscal,dx13);
473             ty               = _mm_mul_ps(fscal,dy13);
474             tz               = _mm_mul_ps(fscal,dz13);
475
476             /* Update vectorial force */
477             fix1             = _mm_add_ps(fix1,tx);
478             fiy1             = _mm_add_ps(fiy1,ty);
479             fiz1             = _mm_add_ps(fiz1,tz);
480
481             fjx3             = _mm_add_ps(fjx3,tx);
482             fjy3             = _mm_add_ps(fjy3,ty);
483             fjz3             = _mm_add_ps(fjz3,tz);
484
485             }
486
487             /**************************
488              * CALCULATE INTERACTIONS *
489              **************************/
490
491             if (gmx_mm_any_lt(rsq21,rcutoff2))
492             {
493
494             /* REACTION-FIELD ELECTROSTATICS */
495             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
496             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
497
498             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
499
500             /* Update potential sum for this i atom from the interaction with this j atom. */
501             velec            = _mm_and_ps(velec,cutoff_mask);
502             velecsum         = _mm_add_ps(velecsum,velec);
503
504             fscal            = felec;
505
506             fscal            = _mm_and_ps(fscal,cutoff_mask);
507
508             /* Calculate temporary vectorial force */
509             tx               = _mm_mul_ps(fscal,dx21);
510             ty               = _mm_mul_ps(fscal,dy21);
511             tz               = _mm_mul_ps(fscal,dz21);
512
513             /* Update vectorial force */
514             fix2             = _mm_add_ps(fix2,tx);
515             fiy2             = _mm_add_ps(fiy2,ty);
516             fiz2             = _mm_add_ps(fiz2,tz);
517
518             fjx1             = _mm_add_ps(fjx1,tx);
519             fjy1             = _mm_add_ps(fjy1,ty);
520             fjz1             = _mm_add_ps(fjz1,tz);
521
522             }
523
524             /**************************
525              * CALCULATE INTERACTIONS *
526              **************************/
527
528             if (gmx_mm_any_lt(rsq22,rcutoff2))
529             {
530
531             /* REACTION-FIELD ELECTROSTATICS */
532             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
533             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
534
535             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
536
537             /* Update potential sum for this i atom from the interaction with this j atom. */
538             velec            = _mm_and_ps(velec,cutoff_mask);
539             velecsum         = _mm_add_ps(velecsum,velec);
540
541             fscal            = felec;
542
543             fscal            = _mm_and_ps(fscal,cutoff_mask);
544
545             /* Calculate temporary vectorial force */
546             tx               = _mm_mul_ps(fscal,dx22);
547             ty               = _mm_mul_ps(fscal,dy22);
548             tz               = _mm_mul_ps(fscal,dz22);
549
550             /* Update vectorial force */
551             fix2             = _mm_add_ps(fix2,tx);
552             fiy2             = _mm_add_ps(fiy2,ty);
553             fiz2             = _mm_add_ps(fiz2,tz);
554
555             fjx2             = _mm_add_ps(fjx2,tx);
556             fjy2             = _mm_add_ps(fjy2,ty);
557             fjz2             = _mm_add_ps(fjz2,tz);
558
559             }
560
561             /**************************
562              * CALCULATE INTERACTIONS *
563              **************************/
564
565             if (gmx_mm_any_lt(rsq23,rcutoff2))
566             {
567
568             /* REACTION-FIELD ELECTROSTATICS */
569             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
570             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
571
572             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
573
574             /* Update potential sum for this i atom from the interaction with this j atom. */
575             velec            = _mm_and_ps(velec,cutoff_mask);
576             velecsum         = _mm_add_ps(velecsum,velec);
577
578             fscal            = felec;
579
580             fscal            = _mm_and_ps(fscal,cutoff_mask);
581
582             /* Calculate temporary vectorial force */
583             tx               = _mm_mul_ps(fscal,dx23);
584             ty               = _mm_mul_ps(fscal,dy23);
585             tz               = _mm_mul_ps(fscal,dz23);
586
587             /* Update vectorial force */
588             fix2             = _mm_add_ps(fix2,tx);
589             fiy2             = _mm_add_ps(fiy2,ty);
590             fiz2             = _mm_add_ps(fiz2,tz);
591
592             fjx3             = _mm_add_ps(fjx3,tx);
593             fjy3             = _mm_add_ps(fjy3,ty);
594             fjz3             = _mm_add_ps(fjz3,tz);
595
596             }
597
598             /**************************
599              * CALCULATE INTERACTIONS *
600              **************************/
601
602             if (gmx_mm_any_lt(rsq31,rcutoff2))
603             {
604
605             /* REACTION-FIELD ELECTROSTATICS */
606             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
607             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
608
609             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
610
611             /* Update potential sum for this i atom from the interaction with this j atom. */
612             velec            = _mm_and_ps(velec,cutoff_mask);
613             velecsum         = _mm_add_ps(velecsum,velec);
614
615             fscal            = felec;
616
617             fscal            = _mm_and_ps(fscal,cutoff_mask);
618
619             /* Calculate temporary vectorial force */
620             tx               = _mm_mul_ps(fscal,dx31);
621             ty               = _mm_mul_ps(fscal,dy31);
622             tz               = _mm_mul_ps(fscal,dz31);
623
624             /* Update vectorial force */
625             fix3             = _mm_add_ps(fix3,tx);
626             fiy3             = _mm_add_ps(fiy3,ty);
627             fiz3             = _mm_add_ps(fiz3,tz);
628
629             fjx1             = _mm_add_ps(fjx1,tx);
630             fjy1             = _mm_add_ps(fjy1,ty);
631             fjz1             = _mm_add_ps(fjz1,tz);
632
633             }
634
635             /**************************
636              * CALCULATE INTERACTIONS *
637              **************************/
638
639             if (gmx_mm_any_lt(rsq32,rcutoff2))
640             {
641
642             /* REACTION-FIELD ELECTROSTATICS */
643             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
644             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
645
646             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
647
648             /* Update potential sum for this i atom from the interaction with this j atom. */
649             velec            = _mm_and_ps(velec,cutoff_mask);
650             velecsum         = _mm_add_ps(velecsum,velec);
651
652             fscal            = felec;
653
654             fscal            = _mm_and_ps(fscal,cutoff_mask);
655
656             /* Calculate temporary vectorial force */
657             tx               = _mm_mul_ps(fscal,dx32);
658             ty               = _mm_mul_ps(fscal,dy32);
659             tz               = _mm_mul_ps(fscal,dz32);
660
661             /* Update vectorial force */
662             fix3             = _mm_add_ps(fix3,tx);
663             fiy3             = _mm_add_ps(fiy3,ty);
664             fiz3             = _mm_add_ps(fiz3,tz);
665
666             fjx2             = _mm_add_ps(fjx2,tx);
667             fjy2             = _mm_add_ps(fjy2,ty);
668             fjz2             = _mm_add_ps(fjz2,tz);
669
670             }
671
672             /**************************
673              * CALCULATE INTERACTIONS *
674              **************************/
675
676             if (gmx_mm_any_lt(rsq33,rcutoff2))
677             {
678
679             /* REACTION-FIELD ELECTROSTATICS */
680             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
681             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
682
683             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
684
685             /* Update potential sum for this i atom from the interaction with this j atom. */
686             velec            = _mm_and_ps(velec,cutoff_mask);
687             velecsum         = _mm_add_ps(velecsum,velec);
688
689             fscal            = felec;
690
691             fscal            = _mm_and_ps(fscal,cutoff_mask);
692
693             /* Calculate temporary vectorial force */
694             tx               = _mm_mul_ps(fscal,dx33);
695             ty               = _mm_mul_ps(fscal,dy33);
696             tz               = _mm_mul_ps(fscal,dz33);
697
698             /* Update vectorial force */
699             fix3             = _mm_add_ps(fix3,tx);
700             fiy3             = _mm_add_ps(fiy3,ty);
701             fiz3             = _mm_add_ps(fiz3,tz);
702
703             fjx3             = _mm_add_ps(fjx3,tx);
704             fjy3             = _mm_add_ps(fjy3,ty);
705             fjz3             = _mm_add_ps(fjz3,tz);
706
707             }
708
709             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
710                                                    f+j_coord_offsetC,f+j_coord_offsetD,
711                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
712                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
713
714             /* Inner loop uses 383 flops */
715         }
716
717         if(jidx<j_index_end)
718         {
719
720             /* Get j neighbor index, and coordinate index */
721             jnrA             = jjnr[jidx];
722             jnrB             = jjnr[jidx+1];
723             jnrC             = jjnr[jidx+2];
724             jnrD             = jjnr[jidx+3];
725
726             /* Sign of each element will be negative for non-real atoms.
727              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
728              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
729              */
730             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
731             jnrA       = (jnrA>=0) ? jnrA : 0;
732             jnrB       = (jnrB>=0) ? jnrB : 0;
733             jnrC       = (jnrC>=0) ? jnrC : 0;
734             jnrD       = (jnrD>=0) ? jnrD : 0;
735
736             j_coord_offsetA  = DIM*jnrA;
737             j_coord_offsetB  = DIM*jnrB;
738             j_coord_offsetC  = DIM*jnrC;
739             j_coord_offsetD  = DIM*jnrD;
740
741             /* load j atom coordinates */
742             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
743                                               x+j_coord_offsetC,x+j_coord_offsetD,
744                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
745                                               &jy2,&jz2,&jx3,&jy3,&jz3);
746
747             /* Calculate displacement vector */
748             dx00             = _mm_sub_ps(ix0,jx0);
749             dy00             = _mm_sub_ps(iy0,jy0);
750             dz00             = _mm_sub_ps(iz0,jz0);
751             dx11             = _mm_sub_ps(ix1,jx1);
752             dy11             = _mm_sub_ps(iy1,jy1);
753             dz11             = _mm_sub_ps(iz1,jz1);
754             dx12             = _mm_sub_ps(ix1,jx2);
755             dy12             = _mm_sub_ps(iy1,jy2);
756             dz12             = _mm_sub_ps(iz1,jz2);
757             dx13             = _mm_sub_ps(ix1,jx3);
758             dy13             = _mm_sub_ps(iy1,jy3);
759             dz13             = _mm_sub_ps(iz1,jz3);
760             dx21             = _mm_sub_ps(ix2,jx1);
761             dy21             = _mm_sub_ps(iy2,jy1);
762             dz21             = _mm_sub_ps(iz2,jz1);
763             dx22             = _mm_sub_ps(ix2,jx2);
764             dy22             = _mm_sub_ps(iy2,jy2);
765             dz22             = _mm_sub_ps(iz2,jz2);
766             dx23             = _mm_sub_ps(ix2,jx3);
767             dy23             = _mm_sub_ps(iy2,jy3);
768             dz23             = _mm_sub_ps(iz2,jz3);
769             dx31             = _mm_sub_ps(ix3,jx1);
770             dy31             = _mm_sub_ps(iy3,jy1);
771             dz31             = _mm_sub_ps(iz3,jz1);
772             dx32             = _mm_sub_ps(ix3,jx2);
773             dy32             = _mm_sub_ps(iy3,jy2);
774             dz32             = _mm_sub_ps(iz3,jz2);
775             dx33             = _mm_sub_ps(ix3,jx3);
776             dy33             = _mm_sub_ps(iy3,jy3);
777             dz33             = _mm_sub_ps(iz3,jz3);
778
779             /* Calculate squared distance and things based on it */
780             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
781             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
782             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
783             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
784             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
785             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
786             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
787             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
788             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
789             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
790
791             rinv00           = gmx_mm_invsqrt_ps(rsq00);
792             rinv11           = gmx_mm_invsqrt_ps(rsq11);
793             rinv12           = gmx_mm_invsqrt_ps(rsq12);
794             rinv13           = gmx_mm_invsqrt_ps(rsq13);
795             rinv21           = gmx_mm_invsqrt_ps(rsq21);
796             rinv22           = gmx_mm_invsqrt_ps(rsq22);
797             rinv23           = gmx_mm_invsqrt_ps(rsq23);
798             rinv31           = gmx_mm_invsqrt_ps(rsq31);
799             rinv32           = gmx_mm_invsqrt_ps(rsq32);
800             rinv33           = gmx_mm_invsqrt_ps(rsq33);
801
802             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
803             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
804             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
805             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
806             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
807             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
808             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
809             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
810             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
811
812             fjx0             = _mm_setzero_ps();
813             fjy0             = _mm_setzero_ps();
814             fjz0             = _mm_setzero_ps();
815             fjx1             = _mm_setzero_ps();
816             fjy1             = _mm_setzero_ps();
817             fjz1             = _mm_setzero_ps();
818             fjx2             = _mm_setzero_ps();
819             fjy2             = _mm_setzero_ps();
820             fjz2             = _mm_setzero_ps();
821             fjx3             = _mm_setzero_ps();
822             fjy3             = _mm_setzero_ps();
823             fjz3             = _mm_setzero_ps();
824
825             /**************************
826              * CALCULATE INTERACTIONS *
827              **************************/
828
829             r00              = _mm_mul_ps(rsq00,rinv00);
830             r00              = _mm_andnot_ps(dummy_mask,r00);
831
832             /* Calculate table index by multiplying r with table scale and truncate to integer */
833             rt               = _mm_mul_ps(r00,vftabscale);
834             vfitab           = _mm_cvttps_epi32(rt);
835             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
836             vfitab           = _mm_slli_epi32(vfitab,3);
837
838             /* CUBIC SPLINE TABLE DISPERSION */
839             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
840             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
841             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
842             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
843             _MM_TRANSPOSE4_PS(Y,F,G,H);
844             Heps             = _mm_mul_ps(vfeps,H);
845             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
846             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
847             vvdw6            = _mm_mul_ps(c6_00,VV);
848             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
849             fvdw6            = _mm_mul_ps(c6_00,FF);
850
851             /* CUBIC SPLINE TABLE REPULSION */
852             vfitab           = _mm_add_epi32(vfitab,ifour);
853             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
854             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
855             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
856             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
857             _MM_TRANSPOSE4_PS(Y,F,G,H);
858             Heps             = _mm_mul_ps(vfeps,H);
859             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
860             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
861             vvdw12           = _mm_mul_ps(c12_00,VV);
862             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
863             fvdw12           = _mm_mul_ps(c12_00,FF);
864             vvdw             = _mm_add_ps(vvdw12,vvdw6);
865             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
866
867             /* Update potential sum for this i atom from the interaction with this j atom. */
868             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
869             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
870
871             fscal            = fvdw;
872
873             fscal            = _mm_andnot_ps(dummy_mask,fscal);
874
875             /* Calculate temporary vectorial force */
876             tx               = _mm_mul_ps(fscal,dx00);
877             ty               = _mm_mul_ps(fscal,dy00);
878             tz               = _mm_mul_ps(fscal,dz00);
879
880             /* Update vectorial force */
881             fix0             = _mm_add_ps(fix0,tx);
882             fiy0             = _mm_add_ps(fiy0,ty);
883             fiz0             = _mm_add_ps(fiz0,tz);
884
885             fjx0             = _mm_add_ps(fjx0,tx);
886             fjy0             = _mm_add_ps(fjy0,ty);
887             fjz0             = _mm_add_ps(fjz0,tz);
888
889             /**************************
890              * CALCULATE INTERACTIONS *
891              **************************/
892
893             if (gmx_mm_any_lt(rsq11,rcutoff2))
894             {
895
896             /* REACTION-FIELD ELECTROSTATICS */
897             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
898             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
899
900             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
901
902             /* Update potential sum for this i atom from the interaction with this j atom. */
903             velec            = _mm_and_ps(velec,cutoff_mask);
904             velec            = _mm_andnot_ps(dummy_mask,velec);
905             velecsum         = _mm_add_ps(velecsum,velec);
906
907             fscal            = felec;
908
909             fscal            = _mm_and_ps(fscal,cutoff_mask);
910
911             fscal            = _mm_andnot_ps(dummy_mask,fscal);
912
913             /* Calculate temporary vectorial force */
914             tx               = _mm_mul_ps(fscal,dx11);
915             ty               = _mm_mul_ps(fscal,dy11);
916             tz               = _mm_mul_ps(fscal,dz11);
917
918             /* Update vectorial force */
919             fix1             = _mm_add_ps(fix1,tx);
920             fiy1             = _mm_add_ps(fiy1,ty);
921             fiz1             = _mm_add_ps(fiz1,tz);
922
923             fjx1             = _mm_add_ps(fjx1,tx);
924             fjy1             = _mm_add_ps(fjy1,ty);
925             fjz1             = _mm_add_ps(fjz1,tz);
926
927             }
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             if (gmx_mm_any_lt(rsq12,rcutoff2))
934             {
935
936             /* REACTION-FIELD ELECTROSTATICS */
937             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
938             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
939
940             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
941
942             /* Update potential sum for this i atom from the interaction with this j atom. */
943             velec            = _mm_and_ps(velec,cutoff_mask);
944             velec            = _mm_andnot_ps(dummy_mask,velec);
945             velecsum         = _mm_add_ps(velecsum,velec);
946
947             fscal            = felec;
948
949             fscal            = _mm_and_ps(fscal,cutoff_mask);
950
951             fscal            = _mm_andnot_ps(dummy_mask,fscal);
952
953             /* Calculate temporary vectorial force */
954             tx               = _mm_mul_ps(fscal,dx12);
955             ty               = _mm_mul_ps(fscal,dy12);
956             tz               = _mm_mul_ps(fscal,dz12);
957
958             /* Update vectorial force */
959             fix1             = _mm_add_ps(fix1,tx);
960             fiy1             = _mm_add_ps(fiy1,ty);
961             fiz1             = _mm_add_ps(fiz1,tz);
962
963             fjx2             = _mm_add_ps(fjx2,tx);
964             fjy2             = _mm_add_ps(fjy2,ty);
965             fjz2             = _mm_add_ps(fjz2,tz);
966
967             }
968
969             /**************************
970              * CALCULATE INTERACTIONS *
971              **************************/
972
973             if (gmx_mm_any_lt(rsq13,rcutoff2))
974             {
975
976             /* REACTION-FIELD ELECTROSTATICS */
977             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
978             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
979
980             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
981
982             /* Update potential sum for this i atom from the interaction with this j atom. */
983             velec            = _mm_and_ps(velec,cutoff_mask);
984             velec            = _mm_andnot_ps(dummy_mask,velec);
985             velecsum         = _mm_add_ps(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_and_ps(fscal,cutoff_mask);
990
991             fscal            = _mm_andnot_ps(dummy_mask,fscal);
992
993             /* Calculate temporary vectorial force */
994             tx               = _mm_mul_ps(fscal,dx13);
995             ty               = _mm_mul_ps(fscal,dy13);
996             tz               = _mm_mul_ps(fscal,dz13);
997
998             /* Update vectorial force */
999             fix1             = _mm_add_ps(fix1,tx);
1000             fiy1             = _mm_add_ps(fiy1,ty);
1001             fiz1             = _mm_add_ps(fiz1,tz);
1002
1003             fjx3             = _mm_add_ps(fjx3,tx);
1004             fjy3             = _mm_add_ps(fjy3,ty);
1005             fjz3             = _mm_add_ps(fjz3,tz);
1006
1007             }
1008
1009             /**************************
1010              * CALCULATE INTERACTIONS *
1011              **************************/
1012
1013             if (gmx_mm_any_lt(rsq21,rcutoff2))
1014             {
1015
1016             /* REACTION-FIELD ELECTROSTATICS */
1017             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1018             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1019
1020             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1021
1022             /* Update potential sum for this i atom from the interaction with this j atom. */
1023             velec            = _mm_and_ps(velec,cutoff_mask);
1024             velec            = _mm_andnot_ps(dummy_mask,velec);
1025             velecsum         = _mm_add_ps(velecsum,velec);
1026
1027             fscal            = felec;
1028
1029             fscal            = _mm_and_ps(fscal,cutoff_mask);
1030
1031             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1032
1033             /* Calculate temporary vectorial force */
1034             tx               = _mm_mul_ps(fscal,dx21);
1035             ty               = _mm_mul_ps(fscal,dy21);
1036             tz               = _mm_mul_ps(fscal,dz21);
1037
1038             /* Update vectorial force */
1039             fix2             = _mm_add_ps(fix2,tx);
1040             fiy2             = _mm_add_ps(fiy2,ty);
1041             fiz2             = _mm_add_ps(fiz2,tz);
1042
1043             fjx1             = _mm_add_ps(fjx1,tx);
1044             fjy1             = _mm_add_ps(fjy1,ty);
1045             fjz1             = _mm_add_ps(fjz1,tz);
1046
1047             }
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             if (gmx_mm_any_lt(rsq22,rcutoff2))
1054             {
1055
1056             /* REACTION-FIELD ELECTROSTATICS */
1057             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1058             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1059
1060             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1061
1062             /* Update potential sum for this i atom from the interaction with this j atom. */
1063             velec            = _mm_and_ps(velec,cutoff_mask);
1064             velec            = _mm_andnot_ps(dummy_mask,velec);
1065             velecsum         = _mm_add_ps(velecsum,velec);
1066
1067             fscal            = felec;
1068
1069             fscal            = _mm_and_ps(fscal,cutoff_mask);
1070
1071             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1072
1073             /* Calculate temporary vectorial force */
1074             tx               = _mm_mul_ps(fscal,dx22);
1075             ty               = _mm_mul_ps(fscal,dy22);
1076             tz               = _mm_mul_ps(fscal,dz22);
1077
1078             /* Update vectorial force */
1079             fix2             = _mm_add_ps(fix2,tx);
1080             fiy2             = _mm_add_ps(fiy2,ty);
1081             fiz2             = _mm_add_ps(fiz2,tz);
1082
1083             fjx2             = _mm_add_ps(fjx2,tx);
1084             fjy2             = _mm_add_ps(fjy2,ty);
1085             fjz2             = _mm_add_ps(fjz2,tz);
1086
1087             }
1088
1089             /**************************
1090              * CALCULATE INTERACTIONS *
1091              **************************/
1092
1093             if (gmx_mm_any_lt(rsq23,rcutoff2))
1094             {
1095
1096             /* REACTION-FIELD ELECTROSTATICS */
1097             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1098             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1099
1100             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1101
1102             /* Update potential sum for this i atom from the interaction with this j atom. */
1103             velec            = _mm_and_ps(velec,cutoff_mask);
1104             velec            = _mm_andnot_ps(dummy_mask,velec);
1105             velecsum         = _mm_add_ps(velecsum,velec);
1106
1107             fscal            = felec;
1108
1109             fscal            = _mm_and_ps(fscal,cutoff_mask);
1110
1111             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1112
1113             /* Calculate temporary vectorial force */
1114             tx               = _mm_mul_ps(fscal,dx23);
1115             ty               = _mm_mul_ps(fscal,dy23);
1116             tz               = _mm_mul_ps(fscal,dz23);
1117
1118             /* Update vectorial force */
1119             fix2             = _mm_add_ps(fix2,tx);
1120             fiy2             = _mm_add_ps(fiy2,ty);
1121             fiz2             = _mm_add_ps(fiz2,tz);
1122
1123             fjx3             = _mm_add_ps(fjx3,tx);
1124             fjy3             = _mm_add_ps(fjy3,ty);
1125             fjz3             = _mm_add_ps(fjz3,tz);
1126
1127             }
1128
1129             /**************************
1130              * CALCULATE INTERACTIONS *
1131              **************************/
1132
1133             if (gmx_mm_any_lt(rsq31,rcutoff2))
1134             {
1135
1136             /* REACTION-FIELD ELECTROSTATICS */
1137             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1138             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1139
1140             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1141
1142             /* Update potential sum for this i atom from the interaction with this j atom. */
1143             velec            = _mm_and_ps(velec,cutoff_mask);
1144             velec            = _mm_andnot_ps(dummy_mask,velec);
1145             velecsum         = _mm_add_ps(velecsum,velec);
1146
1147             fscal            = felec;
1148
1149             fscal            = _mm_and_ps(fscal,cutoff_mask);
1150
1151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1152
1153             /* Calculate temporary vectorial force */
1154             tx               = _mm_mul_ps(fscal,dx31);
1155             ty               = _mm_mul_ps(fscal,dy31);
1156             tz               = _mm_mul_ps(fscal,dz31);
1157
1158             /* Update vectorial force */
1159             fix3             = _mm_add_ps(fix3,tx);
1160             fiy3             = _mm_add_ps(fiy3,ty);
1161             fiz3             = _mm_add_ps(fiz3,tz);
1162
1163             fjx1             = _mm_add_ps(fjx1,tx);
1164             fjy1             = _mm_add_ps(fjy1,ty);
1165             fjz1             = _mm_add_ps(fjz1,tz);
1166
1167             }
1168
1169             /**************************
1170              * CALCULATE INTERACTIONS *
1171              **************************/
1172
1173             if (gmx_mm_any_lt(rsq32,rcutoff2))
1174             {
1175
1176             /* REACTION-FIELD ELECTROSTATICS */
1177             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1178             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1179
1180             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1181
1182             /* Update potential sum for this i atom from the interaction with this j atom. */
1183             velec            = _mm_and_ps(velec,cutoff_mask);
1184             velec            = _mm_andnot_ps(dummy_mask,velec);
1185             velecsum         = _mm_add_ps(velecsum,velec);
1186
1187             fscal            = felec;
1188
1189             fscal            = _mm_and_ps(fscal,cutoff_mask);
1190
1191             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1192
1193             /* Calculate temporary vectorial force */
1194             tx               = _mm_mul_ps(fscal,dx32);
1195             ty               = _mm_mul_ps(fscal,dy32);
1196             tz               = _mm_mul_ps(fscal,dz32);
1197
1198             /* Update vectorial force */
1199             fix3             = _mm_add_ps(fix3,tx);
1200             fiy3             = _mm_add_ps(fiy3,ty);
1201             fiz3             = _mm_add_ps(fiz3,tz);
1202
1203             fjx2             = _mm_add_ps(fjx2,tx);
1204             fjy2             = _mm_add_ps(fjy2,ty);
1205             fjz2             = _mm_add_ps(fjz2,tz);
1206
1207             }
1208
1209             /**************************
1210              * CALCULATE INTERACTIONS *
1211              **************************/
1212
1213             if (gmx_mm_any_lt(rsq33,rcutoff2))
1214             {
1215
1216             /* REACTION-FIELD ELECTROSTATICS */
1217             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1218             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1219
1220             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1221
1222             /* Update potential sum for this i atom from the interaction with this j atom. */
1223             velec            = _mm_and_ps(velec,cutoff_mask);
1224             velec            = _mm_andnot_ps(dummy_mask,velec);
1225             velecsum         = _mm_add_ps(velecsum,velec);
1226
1227             fscal            = felec;
1228
1229             fscal            = _mm_and_ps(fscal,cutoff_mask);
1230
1231             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1232
1233             /* Calculate temporary vectorial force */
1234             tx               = _mm_mul_ps(fscal,dx33);
1235             ty               = _mm_mul_ps(fscal,dy33);
1236             tz               = _mm_mul_ps(fscal,dz33);
1237
1238             /* Update vectorial force */
1239             fix3             = _mm_add_ps(fix3,tx);
1240             fiy3             = _mm_add_ps(fiy3,ty);
1241             fiz3             = _mm_add_ps(fiz3,tz);
1242
1243             fjx3             = _mm_add_ps(fjx3,tx);
1244             fjy3             = _mm_add_ps(fjy3,ty);
1245             fjz3             = _mm_add_ps(fjz3,tz);
1246
1247             }
1248
1249             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1250                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1251                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1252                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1253
1254             /* Inner loop uses 384 flops */
1255         }
1256
1257         /* End of innermost loop */
1258
1259         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1260                                               f+i_coord_offset,fshift+i_shift_offset);
1261
1262         ggid                        = gid[iidx];
1263         /* Update potential energies */
1264         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1265         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1266
1267         /* Increment number of inner iterations */
1268         inneriter                  += j_index_end - j_index_start;
1269
1270         /* Outer loop uses 38 flops */
1271     }
1272
1273     /* Increment number of outer iterations */
1274     outeriter        += nri;
1275
1276     /* Update outer/inner flops */
1277
1278     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*384);
1279 }
1280 /*
1281  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
1282  * Electrostatics interaction: ReactionField
1283  * VdW interaction:            CubicSplineTable
1284  * Geometry:                   Water4-Water4
1285  * Calculate force/pot:        Force
1286  */
1287 void
1288 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
1289                     (t_nblist * gmx_restrict                nlist,
1290                      rvec * gmx_restrict                    xx,
1291                      rvec * gmx_restrict                    ff,
1292                      t_forcerec * gmx_restrict              fr,
1293                      t_mdatoms * gmx_restrict               mdatoms,
1294                      nb_kernel_data_t * gmx_restrict        kernel_data,
1295                      t_nrnb * gmx_restrict                  nrnb)
1296 {
1297     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1298      * just 0 for non-waters.
1299      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1300      * jnr indices corresponding to data put in the four positions in the SIMD register.
1301      */
1302     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1303     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1304     int              jnrA,jnrB,jnrC,jnrD;
1305     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1306     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1307     real             shX,shY,shZ,rcutoff_scalar;
1308     real             *shiftvec,*fshift,*x,*f;
1309     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1310     int              vdwioffset0;
1311     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1312     int              vdwioffset1;
1313     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1314     int              vdwioffset2;
1315     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1316     int              vdwioffset3;
1317     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1318     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1319     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1320     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1321     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1322     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1323     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1324     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1325     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1326     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1327     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1328     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1329     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1330     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1331     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1332     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1333     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1334     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1335     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1336     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1337     real             *charge;
1338     int              nvdwtype;
1339     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1340     int              *vdwtype;
1341     real             *vdwparam;
1342     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1343     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1344     __m128i          vfitab;
1345     __m128i          ifour       = _mm_set1_epi32(4);
1346     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1347     real             *vftab;
1348     __m128           dummy_mask,cutoff_mask;
1349     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1350     __m128           one     = _mm_set1_ps(1.0);
1351     __m128           two     = _mm_set1_ps(2.0);
1352     x                = xx[0];
1353     f                = ff[0];
1354
1355     nri              = nlist->nri;
1356     iinr             = nlist->iinr;
1357     jindex           = nlist->jindex;
1358     jjnr             = nlist->jjnr;
1359     shiftidx         = nlist->shift;
1360     gid              = nlist->gid;
1361     shiftvec         = fr->shift_vec[0];
1362     fshift           = fr->fshift[0];
1363     facel            = _mm_set1_ps(fr->epsfac);
1364     charge           = mdatoms->chargeA;
1365     krf              = _mm_set1_ps(fr->ic->k_rf);
1366     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1367     crf              = _mm_set1_ps(fr->ic->c_rf);
1368     nvdwtype         = fr->ntype;
1369     vdwparam         = fr->nbfp;
1370     vdwtype          = mdatoms->typeA;
1371
1372     vftab            = kernel_data->table_vdw->data;
1373     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1374
1375     /* Setup water-specific parameters */
1376     inr              = nlist->iinr[0];
1377     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1378     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1379     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1380     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1381
1382     jq1              = _mm_set1_ps(charge[inr+1]);
1383     jq2              = _mm_set1_ps(charge[inr+2]);
1384     jq3              = _mm_set1_ps(charge[inr+3]);
1385     vdwjidx0A        = 2*vdwtype[inr+0];
1386     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1387     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1388     qq11             = _mm_mul_ps(iq1,jq1);
1389     qq12             = _mm_mul_ps(iq1,jq2);
1390     qq13             = _mm_mul_ps(iq1,jq3);
1391     qq21             = _mm_mul_ps(iq2,jq1);
1392     qq22             = _mm_mul_ps(iq2,jq2);
1393     qq23             = _mm_mul_ps(iq2,jq3);
1394     qq31             = _mm_mul_ps(iq3,jq1);
1395     qq32             = _mm_mul_ps(iq3,jq2);
1396     qq33             = _mm_mul_ps(iq3,jq3);
1397
1398     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1399     rcutoff_scalar   = fr->rcoulomb;
1400     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1401     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1402
1403     /* Avoid stupid compiler warnings */
1404     jnrA = jnrB = jnrC = jnrD = 0;
1405     j_coord_offsetA = 0;
1406     j_coord_offsetB = 0;
1407     j_coord_offsetC = 0;
1408     j_coord_offsetD = 0;
1409
1410     outeriter        = 0;
1411     inneriter        = 0;
1412
1413     /* Start outer loop over neighborlists */
1414     for(iidx=0; iidx<nri; iidx++)
1415     {
1416         /* Load shift vector for this list */
1417         i_shift_offset   = DIM*shiftidx[iidx];
1418         shX              = shiftvec[i_shift_offset+XX];
1419         shY              = shiftvec[i_shift_offset+YY];
1420         shZ              = shiftvec[i_shift_offset+ZZ];
1421
1422         /* Load limits for loop over neighbors */
1423         j_index_start    = jindex[iidx];
1424         j_index_end      = jindex[iidx+1];
1425
1426         /* Get outer coordinate index */
1427         inr              = iinr[iidx];
1428         i_coord_offset   = DIM*inr;
1429
1430         /* Load i particle coords and add shift vector */
1431         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1432         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1433         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1434         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1435         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1436         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1437         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1438         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1439         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1440         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1441         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1442         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1443
1444         fix0             = _mm_setzero_ps();
1445         fiy0             = _mm_setzero_ps();
1446         fiz0             = _mm_setzero_ps();
1447         fix1             = _mm_setzero_ps();
1448         fiy1             = _mm_setzero_ps();
1449         fiz1             = _mm_setzero_ps();
1450         fix2             = _mm_setzero_ps();
1451         fiy2             = _mm_setzero_ps();
1452         fiz2             = _mm_setzero_ps();
1453         fix3             = _mm_setzero_ps();
1454         fiy3             = _mm_setzero_ps();
1455         fiz3             = _mm_setzero_ps();
1456
1457         /* Start inner kernel loop */
1458         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1459         {
1460
1461             /* Get j neighbor index, and coordinate index */
1462             jnrA             = jjnr[jidx];
1463             jnrB             = jjnr[jidx+1];
1464             jnrC             = jjnr[jidx+2];
1465             jnrD             = jjnr[jidx+3];
1466
1467             j_coord_offsetA  = DIM*jnrA;
1468             j_coord_offsetB  = DIM*jnrB;
1469             j_coord_offsetC  = DIM*jnrC;
1470             j_coord_offsetD  = DIM*jnrD;
1471
1472             /* load j atom coordinates */
1473             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1474                                               x+j_coord_offsetC,x+j_coord_offsetD,
1475                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1476                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1477
1478             /* Calculate displacement vector */
1479             dx00             = _mm_sub_ps(ix0,jx0);
1480             dy00             = _mm_sub_ps(iy0,jy0);
1481             dz00             = _mm_sub_ps(iz0,jz0);
1482             dx11             = _mm_sub_ps(ix1,jx1);
1483             dy11             = _mm_sub_ps(iy1,jy1);
1484             dz11             = _mm_sub_ps(iz1,jz1);
1485             dx12             = _mm_sub_ps(ix1,jx2);
1486             dy12             = _mm_sub_ps(iy1,jy2);
1487             dz12             = _mm_sub_ps(iz1,jz2);
1488             dx13             = _mm_sub_ps(ix1,jx3);
1489             dy13             = _mm_sub_ps(iy1,jy3);
1490             dz13             = _mm_sub_ps(iz1,jz3);
1491             dx21             = _mm_sub_ps(ix2,jx1);
1492             dy21             = _mm_sub_ps(iy2,jy1);
1493             dz21             = _mm_sub_ps(iz2,jz1);
1494             dx22             = _mm_sub_ps(ix2,jx2);
1495             dy22             = _mm_sub_ps(iy2,jy2);
1496             dz22             = _mm_sub_ps(iz2,jz2);
1497             dx23             = _mm_sub_ps(ix2,jx3);
1498             dy23             = _mm_sub_ps(iy2,jy3);
1499             dz23             = _mm_sub_ps(iz2,jz3);
1500             dx31             = _mm_sub_ps(ix3,jx1);
1501             dy31             = _mm_sub_ps(iy3,jy1);
1502             dz31             = _mm_sub_ps(iz3,jz1);
1503             dx32             = _mm_sub_ps(ix3,jx2);
1504             dy32             = _mm_sub_ps(iy3,jy2);
1505             dz32             = _mm_sub_ps(iz3,jz2);
1506             dx33             = _mm_sub_ps(ix3,jx3);
1507             dy33             = _mm_sub_ps(iy3,jy3);
1508             dz33             = _mm_sub_ps(iz3,jz3);
1509
1510             /* Calculate squared distance and things based on it */
1511             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1512             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1513             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1514             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1515             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1516             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1517             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1518             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1519             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1520             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1521
1522             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1523             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1524             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1525             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1526             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1527             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1528             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1529             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1530             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1531             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1532
1533             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1534             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1535             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1536             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1537             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1538             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1539             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1540             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1541             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1542
1543             fjx0             = _mm_setzero_ps();
1544             fjy0             = _mm_setzero_ps();
1545             fjz0             = _mm_setzero_ps();
1546             fjx1             = _mm_setzero_ps();
1547             fjy1             = _mm_setzero_ps();
1548             fjz1             = _mm_setzero_ps();
1549             fjx2             = _mm_setzero_ps();
1550             fjy2             = _mm_setzero_ps();
1551             fjz2             = _mm_setzero_ps();
1552             fjx3             = _mm_setzero_ps();
1553             fjy3             = _mm_setzero_ps();
1554             fjz3             = _mm_setzero_ps();
1555
1556             /**************************
1557              * CALCULATE INTERACTIONS *
1558              **************************/
1559
1560             r00              = _mm_mul_ps(rsq00,rinv00);
1561
1562             /* Calculate table index by multiplying r with table scale and truncate to integer */
1563             rt               = _mm_mul_ps(r00,vftabscale);
1564             vfitab           = _mm_cvttps_epi32(rt);
1565             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1566             vfitab           = _mm_slli_epi32(vfitab,3);
1567
1568             /* CUBIC SPLINE TABLE DISPERSION */
1569             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1570             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1571             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1572             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1573             _MM_TRANSPOSE4_PS(Y,F,G,H);
1574             Heps             = _mm_mul_ps(vfeps,H);
1575             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1576             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1577             fvdw6            = _mm_mul_ps(c6_00,FF);
1578
1579             /* CUBIC SPLINE TABLE REPULSION */
1580             vfitab           = _mm_add_epi32(vfitab,ifour);
1581             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1582             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1583             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1584             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1585             _MM_TRANSPOSE4_PS(Y,F,G,H);
1586             Heps             = _mm_mul_ps(vfeps,H);
1587             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1588             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1589             fvdw12           = _mm_mul_ps(c12_00,FF);
1590             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1591
1592             fscal            = fvdw;
1593
1594             /* Calculate temporary vectorial force */
1595             tx               = _mm_mul_ps(fscal,dx00);
1596             ty               = _mm_mul_ps(fscal,dy00);
1597             tz               = _mm_mul_ps(fscal,dz00);
1598
1599             /* Update vectorial force */
1600             fix0             = _mm_add_ps(fix0,tx);
1601             fiy0             = _mm_add_ps(fiy0,ty);
1602             fiz0             = _mm_add_ps(fiz0,tz);
1603
1604             fjx0             = _mm_add_ps(fjx0,tx);
1605             fjy0             = _mm_add_ps(fjy0,ty);
1606             fjz0             = _mm_add_ps(fjz0,tz);
1607
1608             /**************************
1609              * CALCULATE INTERACTIONS *
1610              **************************/
1611
1612             if (gmx_mm_any_lt(rsq11,rcutoff2))
1613             {
1614
1615             /* REACTION-FIELD ELECTROSTATICS */
1616             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1617
1618             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1619
1620             fscal            = felec;
1621
1622             fscal            = _mm_and_ps(fscal,cutoff_mask);
1623
1624             /* Calculate temporary vectorial force */
1625             tx               = _mm_mul_ps(fscal,dx11);
1626             ty               = _mm_mul_ps(fscal,dy11);
1627             tz               = _mm_mul_ps(fscal,dz11);
1628
1629             /* Update vectorial force */
1630             fix1             = _mm_add_ps(fix1,tx);
1631             fiy1             = _mm_add_ps(fiy1,ty);
1632             fiz1             = _mm_add_ps(fiz1,tz);
1633
1634             fjx1             = _mm_add_ps(fjx1,tx);
1635             fjy1             = _mm_add_ps(fjy1,ty);
1636             fjz1             = _mm_add_ps(fjz1,tz);
1637
1638             }
1639
1640             /**************************
1641              * CALCULATE INTERACTIONS *
1642              **************************/
1643
1644             if (gmx_mm_any_lt(rsq12,rcutoff2))
1645             {
1646
1647             /* REACTION-FIELD ELECTROSTATICS */
1648             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1649
1650             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1651
1652             fscal            = felec;
1653
1654             fscal            = _mm_and_ps(fscal,cutoff_mask);
1655
1656             /* Calculate temporary vectorial force */
1657             tx               = _mm_mul_ps(fscal,dx12);
1658             ty               = _mm_mul_ps(fscal,dy12);
1659             tz               = _mm_mul_ps(fscal,dz12);
1660
1661             /* Update vectorial force */
1662             fix1             = _mm_add_ps(fix1,tx);
1663             fiy1             = _mm_add_ps(fiy1,ty);
1664             fiz1             = _mm_add_ps(fiz1,tz);
1665
1666             fjx2             = _mm_add_ps(fjx2,tx);
1667             fjy2             = _mm_add_ps(fjy2,ty);
1668             fjz2             = _mm_add_ps(fjz2,tz);
1669
1670             }
1671
1672             /**************************
1673              * CALCULATE INTERACTIONS *
1674              **************************/
1675
1676             if (gmx_mm_any_lt(rsq13,rcutoff2))
1677             {
1678
1679             /* REACTION-FIELD ELECTROSTATICS */
1680             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1681
1682             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1683
1684             fscal            = felec;
1685
1686             fscal            = _mm_and_ps(fscal,cutoff_mask);
1687
1688             /* Calculate temporary vectorial force */
1689             tx               = _mm_mul_ps(fscal,dx13);
1690             ty               = _mm_mul_ps(fscal,dy13);
1691             tz               = _mm_mul_ps(fscal,dz13);
1692
1693             /* Update vectorial force */
1694             fix1             = _mm_add_ps(fix1,tx);
1695             fiy1             = _mm_add_ps(fiy1,ty);
1696             fiz1             = _mm_add_ps(fiz1,tz);
1697
1698             fjx3             = _mm_add_ps(fjx3,tx);
1699             fjy3             = _mm_add_ps(fjy3,ty);
1700             fjz3             = _mm_add_ps(fjz3,tz);
1701
1702             }
1703
1704             /**************************
1705              * CALCULATE INTERACTIONS *
1706              **************************/
1707
1708             if (gmx_mm_any_lt(rsq21,rcutoff2))
1709             {
1710
1711             /* REACTION-FIELD ELECTROSTATICS */
1712             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1713
1714             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1715
1716             fscal            = felec;
1717
1718             fscal            = _mm_and_ps(fscal,cutoff_mask);
1719
1720             /* Calculate temporary vectorial force */
1721             tx               = _mm_mul_ps(fscal,dx21);
1722             ty               = _mm_mul_ps(fscal,dy21);
1723             tz               = _mm_mul_ps(fscal,dz21);
1724
1725             /* Update vectorial force */
1726             fix2             = _mm_add_ps(fix2,tx);
1727             fiy2             = _mm_add_ps(fiy2,ty);
1728             fiz2             = _mm_add_ps(fiz2,tz);
1729
1730             fjx1             = _mm_add_ps(fjx1,tx);
1731             fjy1             = _mm_add_ps(fjy1,ty);
1732             fjz1             = _mm_add_ps(fjz1,tz);
1733
1734             }
1735
1736             /**************************
1737              * CALCULATE INTERACTIONS *
1738              **************************/
1739
1740             if (gmx_mm_any_lt(rsq22,rcutoff2))
1741             {
1742
1743             /* REACTION-FIELD ELECTROSTATICS */
1744             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1745
1746             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1747
1748             fscal            = felec;
1749
1750             fscal            = _mm_and_ps(fscal,cutoff_mask);
1751
1752             /* Calculate temporary vectorial force */
1753             tx               = _mm_mul_ps(fscal,dx22);
1754             ty               = _mm_mul_ps(fscal,dy22);
1755             tz               = _mm_mul_ps(fscal,dz22);
1756
1757             /* Update vectorial force */
1758             fix2             = _mm_add_ps(fix2,tx);
1759             fiy2             = _mm_add_ps(fiy2,ty);
1760             fiz2             = _mm_add_ps(fiz2,tz);
1761
1762             fjx2             = _mm_add_ps(fjx2,tx);
1763             fjy2             = _mm_add_ps(fjy2,ty);
1764             fjz2             = _mm_add_ps(fjz2,tz);
1765
1766             }
1767
1768             /**************************
1769              * CALCULATE INTERACTIONS *
1770              **************************/
1771
1772             if (gmx_mm_any_lt(rsq23,rcutoff2))
1773             {
1774
1775             /* REACTION-FIELD ELECTROSTATICS */
1776             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1777
1778             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1779
1780             fscal            = felec;
1781
1782             fscal            = _mm_and_ps(fscal,cutoff_mask);
1783
1784             /* Calculate temporary vectorial force */
1785             tx               = _mm_mul_ps(fscal,dx23);
1786             ty               = _mm_mul_ps(fscal,dy23);
1787             tz               = _mm_mul_ps(fscal,dz23);
1788
1789             /* Update vectorial force */
1790             fix2             = _mm_add_ps(fix2,tx);
1791             fiy2             = _mm_add_ps(fiy2,ty);
1792             fiz2             = _mm_add_ps(fiz2,tz);
1793
1794             fjx3             = _mm_add_ps(fjx3,tx);
1795             fjy3             = _mm_add_ps(fjy3,ty);
1796             fjz3             = _mm_add_ps(fjz3,tz);
1797
1798             }
1799
1800             /**************************
1801              * CALCULATE INTERACTIONS *
1802              **************************/
1803
1804             if (gmx_mm_any_lt(rsq31,rcutoff2))
1805             {
1806
1807             /* REACTION-FIELD ELECTROSTATICS */
1808             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1809
1810             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1811
1812             fscal            = felec;
1813
1814             fscal            = _mm_and_ps(fscal,cutoff_mask);
1815
1816             /* Calculate temporary vectorial force */
1817             tx               = _mm_mul_ps(fscal,dx31);
1818             ty               = _mm_mul_ps(fscal,dy31);
1819             tz               = _mm_mul_ps(fscal,dz31);
1820
1821             /* Update vectorial force */
1822             fix3             = _mm_add_ps(fix3,tx);
1823             fiy3             = _mm_add_ps(fiy3,ty);
1824             fiz3             = _mm_add_ps(fiz3,tz);
1825
1826             fjx1             = _mm_add_ps(fjx1,tx);
1827             fjy1             = _mm_add_ps(fjy1,ty);
1828             fjz1             = _mm_add_ps(fjz1,tz);
1829
1830             }
1831
1832             /**************************
1833              * CALCULATE INTERACTIONS *
1834              **************************/
1835
1836             if (gmx_mm_any_lt(rsq32,rcutoff2))
1837             {
1838
1839             /* REACTION-FIELD ELECTROSTATICS */
1840             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1841
1842             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1843
1844             fscal            = felec;
1845
1846             fscal            = _mm_and_ps(fscal,cutoff_mask);
1847
1848             /* Calculate temporary vectorial force */
1849             tx               = _mm_mul_ps(fscal,dx32);
1850             ty               = _mm_mul_ps(fscal,dy32);
1851             tz               = _mm_mul_ps(fscal,dz32);
1852
1853             /* Update vectorial force */
1854             fix3             = _mm_add_ps(fix3,tx);
1855             fiy3             = _mm_add_ps(fiy3,ty);
1856             fiz3             = _mm_add_ps(fiz3,tz);
1857
1858             fjx2             = _mm_add_ps(fjx2,tx);
1859             fjy2             = _mm_add_ps(fjy2,ty);
1860             fjz2             = _mm_add_ps(fjz2,tz);
1861
1862             }
1863
1864             /**************************
1865              * CALCULATE INTERACTIONS *
1866              **************************/
1867
1868             if (gmx_mm_any_lt(rsq33,rcutoff2))
1869             {
1870
1871             /* REACTION-FIELD ELECTROSTATICS */
1872             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1873
1874             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1875
1876             fscal            = felec;
1877
1878             fscal            = _mm_and_ps(fscal,cutoff_mask);
1879
1880             /* Calculate temporary vectorial force */
1881             tx               = _mm_mul_ps(fscal,dx33);
1882             ty               = _mm_mul_ps(fscal,dy33);
1883             tz               = _mm_mul_ps(fscal,dz33);
1884
1885             /* Update vectorial force */
1886             fix3             = _mm_add_ps(fix3,tx);
1887             fiy3             = _mm_add_ps(fiy3,ty);
1888             fiz3             = _mm_add_ps(fiz3,tz);
1889
1890             fjx3             = _mm_add_ps(fjx3,tx);
1891             fjy3             = _mm_add_ps(fjy3,ty);
1892             fjz3             = _mm_add_ps(fjz3,tz);
1893
1894             }
1895
1896             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1897                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1898                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1899                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1900
1901             /* Inner loop uses 321 flops */
1902         }
1903
1904         if(jidx<j_index_end)
1905         {
1906
1907             /* Get j neighbor index, and coordinate index */
1908             jnrA             = jjnr[jidx];
1909             jnrB             = jjnr[jidx+1];
1910             jnrC             = jjnr[jidx+2];
1911             jnrD             = jjnr[jidx+3];
1912
1913             /* Sign of each element will be negative for non-real atoms.
1914              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1915              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1916              */
1917             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1918             jnrA       = (jnrA>=0) ? jnrA : 0;
1919             jnrB       = (jnrB>=0) ? jnrB : 0;
1920             jnrC       = (jnrC>=0) ? jnrC : 0;
1921             jnrD       = (jnrD>=0) ? jnrD : 0;
1922
1923             j_coord_offsetA  = DIM*jnrA;
1924             j_coord_offsetB  = DIM*jnrB;
1925             j_coord_offsetC  = DIM*jnrC;
1926             j_coord_offsetD  = DIM*jnrD;
1927
1928             /* load j atom coordinates */
1929             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1930                                               x+j_coord_offsetC,x+j_coord_offsetD,
1931                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1932                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1933
1934             /* Calculate displacement vector */
1935             dx00             = _mm_sub_ps(ix0,jx0);
1936             dy00             = _mm_sub_ps(iy0,jy0);
1937             dz00             = _mm_sub_ps(iz0,jz0);
1938             dx11             = _mm_sub_ps(ix1,jx1);
1939             dy11             = _mm_sub_ps(iy1,jy1);
1940             dz11             = _mm_sub_ps(iz1,jz1);
1941             dx12             = _mm_sub_ps(ix1,jx2);
1942             dy12             = _mm_sub_ps(iy1,jy2);
1943             dz12             = _mm_sub_ps(iz1,jz2);
1944             dx13             = _mm_sub_ps(ix1,jx3);
1945             dy13             = _mm_sub_ps(iy1,jy3);
1946             dz13             = _mm_sub_ps(iz1,jz3);
1947             dx21             = _mm_sub_ps(ix2,jx1);
1948             dy21             = _mm_sub_ps(iy2,jy1);
1949             dz21             = _mm_sub_ps(iz2,jz1);
1950             dx22             = _mm_sub_ps(ix2,jx2);
1951             dy22             = _mm_sub_ps(iy2,jy2);
1952             dz22             = _mm_sub_ps(iz2,jz2);
1953             dx23             = _mm_sub_ps(ix2,jx3);
1954             dy23             = _mm_sub_ps(iy2,jy3);
1955             dz23             = _mm_sub_ps(iz2,jz3);
1956             dx31             = _mm_sub_ps(ix3,jx1);
1957             dy31             = _mm_sub_ps(iy3,jy1);
1958             dz31             = _mm_sub_ps(iz3,jz1);
1959             dx32             = _mm_sub_ps(ix3,jx2);
1960             dy32             = _mm_sub_ps(iy3,jy2);
1961             dz32             = _mm_sub_ps(iz3,jz2);
1962             dx33             = _mm_sub_ps(ix3,jx3);
1963             dy33             = _mm_sub_ps(iy3,jy3);
1964             dz33             = _mm_sub_ps(iz3,jz3);
1965
1966             /* Calculate squared distance and things based on it */
1967             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1968             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1969             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1970             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1971             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1972             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1973             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1974             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1975             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1976             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1977
1978             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1979             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1980             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1981             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1982             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1983             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1984             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1985             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1986             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1987             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1988
1989             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1990             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1991             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1992             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1993             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1994             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1995             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1996             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1997             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1998
1999             fjx0             = _mm_setzero_ps();
2000             fjy0             = _mm_setzero_ps();
2001             fjz0             = _mm_setzero_ps();
2002             fjx1             = _mm_setzero_ps();
2003             fjy1             = _mm_setzero_ps();
2004             fjz1             = _mm_setzero_ps();
2005             fjx2             = _mm_setzero_ps();
2006             fjy2             = _mm_setzero_ps();
2007             fjz2             = _mm_setzero_ps();
2008             fjx3             = _mm_setzero_ps();
2009             fjy3             = _mm_setzero_ps();
2010             fjz3             = _mm_setzero_ps();
2011
2012             /**************************
2013              * CALCULATE INTERACTIONS *
2014              **************************/
2015
2016             r00              = _mm_mul_ps(rsq00,rinv00);
2017             r00              = _mm_andnot_ps(dummy_mask,r00);
2018
2019             /* Calculate table index by multiplying r with table scale and truncate to integer */
2020             rt               = _mm_mul_ps(r00,vftabscale);
2021             vfitab           = _mm_cvttps_epi32(rt);
2022             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2023             vfitab           = _mm_slli_epi32(vfitab,3);
2024
2025             /* CUBIC SPLINE TABLE DISPERSION */
2026             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2027             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2028             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2029             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2030             _MM_TRANSPOSE4_PS(Y,F,G,H);
2031             Heps             = _mm_mul_ps(vfeps,H);
2032             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2033             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2034             fvdw6            = _mm_mul_ps(c6_00,FF);
2035
2036             /* CUBIC SPLINE TABLE REPULSION */
2037             vfitab           = _mm_add_epi32(vfitab,ifour);
2038             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2039             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2040             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2041             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2042             _MM_TRANSPOSE4_PS(Y,F,G,H);
2043             Heps             = _mm_mul_ps(vfeps,H);
2044             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2045             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2046             fvdw12           = _mm_mul_ps(c12_00,FF);
2047             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2048
2049             fscal            = fvdw;
2050
2051             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2052
2053             /* Calculate temporary vectorial force */
2054             tx               = _mm_mul_ps(fscal,dx00);
2055             ty               = _mm_mul_ps(fscal,dy00);
2056             tz               = _mm_mul_ps(fscal,dz00);
2057
2058             /* Update vectorial force */
2059             fix0             = _mm_add_ps(fix0,tx);
2060             fiy0             = _mm_add_ps(fiy0,ty);
2061             fiz0             = _mm_add_ps(fiz0,tz);
2062
2063             fjx0             = _mm_add_ps(fjx0,tx);
2064             fjy0             = _mm_add_ps(fjy0,ty);
2065             fjz0             = _mm_add_ps(fjz0,tz);
2066
2067             /**************************
2068              * CALCULATE INTERACTIONS *
2069              **************************/
2070
2071             if (gmx_mm_any_lt(rsq11,rcutoff2))
2072             {
2073
2074             /* REACTION-FIELD ELECTROSTATICS */
2075             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2076
2077             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2078
2079             fscal            = felec;
2080
2081             fscal            = _mm_and_ps(fscal,cutoff_mask);
2082
2083             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2084
2085             /* Calculate temporary vectorial force */
2086             tx               = _mm_mul_ps(fscal,dx11);
2087             ty               = _mm_mul_ps(fscal,dy11);
2088             tz               = _mm_mul_ps(fscal,dz11);
2089
2090             /* Update vectorial force */
2091             fix1             = _mm_add_ps(fix1,tx);
2092             fiy1             = _mm_add_ps(fiy1,ty);
2093             fiz1             = _mm_add_ps(fiz1,tz);
2094
2095             fjx1             = _mm_add_ps(fjx1,tx);
2096             fjy1             = _mm_add_ps(fjy1,ty);
2097             fjz1             = _mm_add_ps(fjz1,tz);
2098
2099             }
2100
2101             /**************************
2102              * CALCULATE INTERACTIONS *
2103              **************************/
2104
2105             if (gmx_mm_any_lt(rsq12,rcutoff2))
2106             {
2107
2108             /* REACTION-FIELD ELECTROSTATICS */
2109             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2110
2111             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2112
2113             fscal            = felec;
2114
2115             fscal            = _mm_and_ps(fscal,cutoff_mask);
2116
2117             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2118
2119             /* Calculate temporary vectorial force */
2120             tx               = _mm_mul_ps(fscal,dx12);
2121             ty               = _mm_mul_ps(fscal,dy12);
2122             tz               = _mm_mul_ps(fscal,dz12);
2123
2124             /* Update vectorial force */
2125             fix1             = _mm_add_ps(fix1,tx);
2126             fiy1             = _mm_add_ps(fiy1,ty);
2127             fiz1             = _mm_add_ps(fiz1,tz);
2128
2129             fjx2             = _mm_add_ps(fjx2,tx);
2130             fjy2             = _mm_add_ps(fjy2,ty);
2131             fjz2             = _mm_add_ps(fjz2,tz);
2132
2133             }
2134
2135             /**************************
2136              * CALCULATE INTERACTIONS *
2137              **************************/
2138
2139             if (gmx_mm_any_lt(rsq13,rcutoff2))
2140             {
2141
2142             /* REACTION-FIELD ELECTROSTATICS */
2143             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2144
2145             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2146
2147             fscal            = felec;
2148
2149             fscal            = _mm_and_ps(fscal,cutoff_mask);
2150
2151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2152
2153             /* Calculate temporary vectorial force */
2154             tx               = _mm_mul_ps(fscal,dx13);
2155             ty               = _mm_mul_ps(fscal,dy13);
2156             tz               = _mm_mul_ps(fscal,dz13);
2157
2158             /* Update vectorial force */
2159             fix1             = _mm_add_ps(fix1,tx);
2160             fiy1             = _mm_add_ps(fiy1,ty);
2161             fiz1             = _mm_add_ps(fiz1,tz);
2162
2163             fjx3             = _mm_add_ps(fjx3,tx);
2164             fjy3             = _mm_add_ps(fjy3,ty);
2165             fjz3             = _mm_add_ps(fjz3,tz);
2166
2167             }
2168
2169             /**************************
2170              * CALCULATE INTERACTIONS *
2171              **************************/
2172
2173             if (gmx_mm_any_lt(rsq21,rcutoff2))
2174             {
2175
2176             /* REACTION-FIELD ELECTROSTATICS */
2177             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2178
2179             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2180
2181             fscal            = felec;
2182
2183             fscal            = _mm_and_ps(fscal,cutoff_mask);
2184
2185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2186
2187             /* Calculate temporary vectorial force */
2188             tx               = _mm_mul_ps(fscal,dx21);
2189             ty               = _mm_mul_ps(fscal,dy21);
2190             tz               = _mm_mul_ps(fscal,dz21);
2191
2192             /* Update vectorial force */
2193             fix2             = _mm_add_ps(fix2,tx);
2194             fiy2             = _mm_add_ps(fiy2,ty);
2195             fiz2             = _mm_add_ps(fiz2,tz);
2196
2197             fjx1             = _mm_add_ps(fjx1,tx);
2198             fjy1             = _mm_add_ps(fjy1,ty);
2199             fjz1             = _mm_add_ps(fjz1,tz);
2200
2201             }
2202
2203             /**************************
2204              * CALCULATE INTERACTIONS *
2205              **************************/
2206
2207             if (gmx_mm_any_lt(rsq22,rcutoff2))
2208             {
2209
2210             /* REACTION-FIELD ELECTROSTATICS */
2211             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2212
2213             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2214
2215             fscal            = felec;
2216
2217             fscal            = _mm_and_ps(fscal,cutoff_mask);
2218
2219             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2220
2221             /* Calculate temporary vectorial force */
2222             tx               = _mm_mul_ps(fscal,dx22);
2223             ty               = _mm_mul_ps(fscal,dy22);
2224             tz               = _mm_mul_ps(fscal,dz22);
2225
2226             /* Update vectorial force */
2227             fix2             = _mm_add_ps(fix2,tx);
2228             fiy2             = _mm_add_ps(fiy2,ty);
2229             fiz2             = _mm_add_ps(fiz2,tz);
2230
2231             fjx2             = _mm_add_ps(fjx2,tx);
2232             fjy2             = _mm_add_ps(fjy2,ty);
2233             fjz2             = _mm_add_ps(fjz2,tz);
2234
2235             }
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             if (gmx_mm_any_lt(rsq23,rcutoff2))
2242             {
2243
2244             /* REACTION-FIELD ELECTROSTATICS */
2245             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2246
2247             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2248
2249             fscal            = felec;
2250
2251             fscal            = _mm_and_ps(fscal,cutoff_mask);
2252
2253             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2254
2255             /* Calculate temporary vectorial force */
2256             tx               = _mm_mul_ps(fscal,dx23);
2257             ty               = _mm_mul_ps(fscal,dy23);
2258             tz               = _mm_mul_ps(fscal,dz23);
2259
2260             /* Update vectorial force */
2261             fix2             = _mm_add_ps(fix2,tx);
2262             fiy2             = _mm_add_ps(fiy2,ty);
2263             fiz2             = _mm_add_ps(fiz2,tz);
2264
2265             fjx3             = _mm_add_ps(fjx3,tx);
2266             fjy3             = _mm_add_ps(fjy3,ty);
2267             fjz3             = _mm_add_ps(fjz3,tz);
2268
2269             }
2270
2271             /**************************
2272              * CALCULATE INTERACTIONS *
2273              **************************/
2274
2275             if (gmx_mm_any_lt(rsq31,rcutoff2))
2276             {
2277
2278             /* REACTION-FIELD ELECTROSTATICS */
2279             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2280
2281             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2282
2283             fscal            = felec;
2284
2285             fscal            = _mm_and_ps(fscal,cutoff_mask);
2286
2287             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2288
2289             /* Calculate temporary vectorial force */
2290             tx               = _mm_mul_ps(fscal,dx31);
2291             ty               = _mm_mul_ps(fscal,dy31);
2292             tz               = _mm_mul_ps(fscal,dz31);
2293
2294             /* Update vectorial force */
2295             fix3             = _mm_add_ps(fix3,tx);
2296             fiy3             = _mm_add_ps(fiy3,ty);
2297             fiz3             = _mm_add_ps(fiz3,tz);
2298
2299             fjx1             = _mm_add_ps(fjx1,tx);
2300             fjy1             = _mm_add_ps(fjy1,ty);
2301             fjz1             = _mm_add_ps(fjz1,tz);
2302
2303             }
2304
2305             /**************************
2306              * CALCULATE INTERACTIONS *
2307              **************************/
2308
2309             if (gmx_mm_any_lt(rsq32,rcutoff2))
2310             {
2311
2312             /* REACTION-FIELD ELECTROSTATICS */
2313             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2314
2315             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2316
2317             fscal            = felec;
2318
2319             fscal            = _mm_and_ps(fscal,cutoff_mask);
2320
2321             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2322
2323             /* Calculate temporary vectorial force */
2324             tx               = _mm_mul_ps(fscal,dx32);
2325             ty               = _mm_mul_ps(fscal,dy32);
2326             tz               = _mm_mul_ps(fscal,dz32);
2327
2328             /* Update vectorial force */
2329             fix3             = _mm_add_ps(fix3,tx);
2330             fiy3             = _mm_add_ps(fiy3,ty);
2331             fiz3             = _mm_add_ps(fiz3,tz);
2332
2333             fjx2             = _mm_add_ps(fjx2,tx);
2334             fjy2             = _mm_add_ps(fjy2,ty);
2335             fjz2             = _mm_add_ps(fjz2,tz);
2336
2337             }
2338
2339             /**************************
2340              * CALCULATE INTERACTIONS *
2341              **************************/
2342
2343             if (gmx_mm_any_lt(rsq33,rcutoff2))
2344             {
2345
2346             /* REACTION-FIELD ELECTROSTATICS */
2347             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2348
2349             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2350
2351             fscal            = felec;
2352
2353             fscal            = _mm_and_ps(fscal,cutoff_mask);
2354
2355             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2356
2357             /* Calculate temporary vectorial force */
2358             tx               = _mm_mul_ps(fscal,dx33);
2359             ty               = _mm_mul_ps(fscal,dy33);
2360             tz               = _mm_mul_ps(fscal,dz33);
2361
2362             /* Update vectorial force */
2363             fix3             = _mm_add_ps(fix3,tx);
2364             fiy3             = _mm_add_ps(fiy3,ty);
2365             fiz3             = _mm_add_ps(fiz3,tz);
2366
2367             fjx3             = _mm_add_ps(fjx3,tx);
2368             fjy3             = _mm_add_ps(fjy3,ty);
2369             fjz3             = _mm_add_ps(fjz3,tz);
2370
2371             }
2372
2373             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2374                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2375                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2376                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2377
2378             /* Inner loop uses 322 flops */
2379         }
2380
2381         /* End of innermost loop */
2382
2383         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2384                                               f+i_coord_offset,fshift+i_shift_offset);
2385
2386         /* Increment number of inner iterations */
2387         inneriter                  += j_index_end - j_index_start;
2388
2389         /* Outer loop uses 36 flops */
2390     }
2391
2392     /* Increment number of outer iterations */
2393     outeriter        += nri;
2394
2395     /* Update outer/inner flops */
2396
2397     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*322);
2398 }