Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse4_1_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse4_1_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
102     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
103     __m128i          vfitab;
104     __m128i          ifour       = _mm_set1_epi32(4);
105     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106     real             *vftab;
107     __m128           dummy_mask,cutoff_mask;
108     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109     __m128           one     = _mm_set1_ps(1.0);
110     __m128           two     = _mm_set1_ps(2.0);
111     x                = xx[0];
112     f                = ff[0];
113
114     nri              = nlist->nri;
115     iinr             = nlist->iinr;
116     jindex           = nlist->jindex;
117     jjnr             = nlist->jjnr;
118     shiftidx         = nlist->shift;
119     gid              = nlist->gid;
120     shiftvec         = fr->shift_vec[0];
121     fshift           = fr->fshift[0];
122     facel            = _mm_set1_ps(fr->epsfac);
123     charge           = mdatoms->chargeA;
124     krf              = _mm_set1_ps(fr->ic->k_rf);
125     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
126     crf              = _mm_set1_ps(fr->ic->c_rf);
127     nvdwtype         = fr->ntype;
128     vdwparam         = fr->nbfp;
129     vdwtype          = mdatoms->typeA;
130
131     vftab            = kernel_data->table_vdw->data;
132     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
133
134     /* Setup water-specific parameters */
135     inr              = nlist->iinr[0];
136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
139     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
140
141     jq1              = _mm_set1_ps(charge[inr+1]);
142     jq2              = _mm_set1_ps(charge[inr+2]);
143     jq3              = _mm_set1_ps(charge[inr+3]);
144     vdwjidx0A        = 2*vdwtype[inr+0];
145     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
146     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
147     qq11             = _mm_mul_ps(iq1,jq1);
148     qq12             = _mm_mul_ps(iq1,jq2);
149     qq13             = _mm_mul_ps(iq1,jq3);
150     qq21             = _mm_mul_ps(iq2,jq1);
151     qq22             = _mm_mul_ps(iq2,jq2);
152     qq23             = _mm_mul_ps(iq2,jq3);
153     qq31             = _mm_mul_ps(iq3,jq1);
154     qq32             = _mm_mul_ps(iq3,jq2);
155     qq33             = _mm_mul_ps(iq3,jq3);
156
157     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
158     rcutoff_scalar   = fr->rcoulomb;
159     rcutoff          = _mm_set1_ps(rcutoff_scalar);
160     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = jnrC = jnrD = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166     j_coord_offsetC = 0;
167     j_coord_offsetD = 0;
168
169     outeriter        = 0;
170     inneriter        = 0;
171
172     for(iidx=0;iidx<4*DIM;iidx++)
173     {
174         scratch[iidx] = 0.0;
175     }
176
177     /* Start outer loop over neighborlists */
178     for(iidx=0; iidx<nri; iidx++)
179     {
180         /* Load shift vector for this list */
181         i_shift_offset   = DIM*shiftidx[iidx];
182
183         /* Load limits for loop over neighbors */
184         j_index_start    = jindex[iidx];
185         j_index_end      = jindex[iidx+1];
186
187         /* Get outer coordinate index */
188         inr              = iinr[iidx];
189         i_coord_offset   = DIM*inr;
190
191         /* Load i particle coords and add shift vector */
192         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
194
195         fix0             = _mm_setzero_ps();
196         fiy0             = _mm_setzero_ps();
197         fiz0             = _mm_setzero_ps();
198         fix1             = _mm_setzero_ps();
199         fiy1             = _mm_setzero_ps();
200         fiz1             = _mm_setzero_ps();
201         fix2             = _mm_setzero_ps();
202         fiy2             = _mm_setzero_ps();
203         fiz2             = _mm_setzero_ps();
204         fix3             = _mm_setzero_ps();
205         fiy3             = _mm_setzero_ps();
206         fiz3             = _mm_setzero_ps();
207
208         /* Reset potential sums */
209         velecsum         = _mm_setzero_ps();
210         vvdwsum          = _mm_setzero_ps();
211
212         /* Start inner kernel loop */
213         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214         {
215
216             /* Get j neighbor index, and coordinate index */
217             jnrA             = jjnr[jidx];
218             jnrB             = jjnr[jidx+1];
219             jnrC             = jjnr[jidx+2];
220             jnrD             = jjnr[jidx+3];
221             j_coord_offsetA  = DIM*jnrA;
222             j_coord_offsetB  = DIM*jnrB;
223             j_coord_offsetC  = DIM*jnrC;
224             j_coord_offsetD  = DIM*jnrD;
225
226             /* load j atom coordinates */
227             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228                                               x+j_coord_offsetC,x+j_coord_offsetD,
229                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
230                                               &jy2,&jz2,&jx3,&jy3,&jz3);
231
232             /* Calculate displacement vector */
233             dx00             = _mm_sub_ps(ix0,jx0);
234             dy00             = _mm_sub_ps(iy0,jy0);
235             dz00             = _mm_sub_ps(iz0,jz0);
236             dx11             = _mm_sub_ps(ix1,jx1);
237             dy11             = _mm_sub_ps(iy1,jy1);
238             dz11             = _mm_sub_ps(iz1,jz1);
239             dx12             = _mm_sub_ps(ix1,jx2);
240             dy12             = _mm_sub_ps(iy1,jy2);
241             dz12             = _mm_sub_ps(iz1,jz2);
242             dx13             = _mm_sub_ps(ix1,jx3);
243             dy13             = _mm_sub_ps(iy1,jy3);
244             dz13             = _mm_sub_ps(iz1,jz3);
245             dx21             = _mm_sub_ps(ix2,jx1);
246             dy21             = _mm_sub_ps(iy2,jy1);
247             dz21             = _mm_sub_ps(iz2,jz1);
248             dx22             = _mm_sub_ps(ix2,jx2);
249             dy22             = _mm_sub_ps(iy2,jy2);
250             dz22             = _mm_sub_ps(iz2,jz2);
251             dx23             = _mm_sub_ps(ix2,jx3);
252             dy23             = _mm_sub_ps(iy2,jy3);
253             dz23             = _mm_sub_ps(iz2,jz3);
254             dx31             = _mm_sub_ps(ix3,jx1);
255             dy31             = _mm_sub_ps(iy3,jy1);
256             dz31             = _mm_sub_ps(iz3,jz1);
257             dx32             = _mm_sub_ps(ix3,jx2);
258             dy32             = _mm_sub_ps(iy3,jy2);
259             dz32             = _mm_sub_ps(iz3,jz2);
260             dx33             = _mm_sub_ps(ix3,jx3);
261             dy33             = _mm_sub_ps(iy3,jy3);
262             dz33             = _mm_sub_ps(iz3,jz3);
263
264             /* Calculate squared distance and things based on it */
265             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
266             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
267             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
268             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
269             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
270             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
271             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
272             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
273             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
274             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
275
276             rinv00           = gmx_mm_invsqrt_ps(rsq00);
277             rinv11           = gmx_mm_invsqrt_ps(rsq11);
278             rinv12           = gmx_mm_invsqrt_ps(rsq12);
279             rinv13           = gmx_mm_invsqrt_ps(rsq13);
280             rinv21           = gmx_mm_invsqrt_ps(rsq21);
281             rinv22           = gmx_mm_invsqrt_ps(rsq22);
282             rinv23           = gmx_mm_invsqrt_ps(rsq23);
283             rinv31           = gmx_mm_invsqrt_ps(rsq31);
284             rinv32           = gmx_mm_invsqrt_ps(rsq32);
285             rinv33           = gmx_mm_invsqrt_ps(rsq33);
286
287             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
288             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
289             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
290             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
291             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
292             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
293             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
294             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
295             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
296
297             fjx0             = _mm_setzero_ps();
298             fjy0             = _mm_setzero_ps();
299             fjz0             = _mm_setzero_ps();
300             fjx1             = _mm_setzero_ps();
301             fjy1             = _mm_setzero_ps();
302             fjz1             = _mm_setzero_ps();
303             fjx2             = _mm_setzero_ps();
304             fjy2             = _mm_setzero_ps();
305             fjz2             = _mm_setzero_ps();
306             fjx3             = _mm_setzero_ps();
307             fjy3             = _mm_setzero_ps();
308             fjz3             = _mm_setzero_ps();
309
310             /**************************
311              * CALCULATE INTERACTIONS *
312              **************************/
313
314             if (gmx_mm_any_lt(rsq00,rcutoff2))
315             {
316
317             r00              = _mm_mul_ps(rsq00,rinv00);
318
319             /* Calculate table index by multiplying r with table scale and truncate to integer */
320             rt               = _mm_mul_ps(r00,vftabscale);
321             vfitab           = _mm_cvttps_epi32(rt);
322             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
323             vfitab           = _mm_slli_epi32(vfitab,3);
324
325             /* CUBIC SPLINE TABLE DISPERSION */
326             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
327             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
328             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
329             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
330             _MM_TRANSPOSE4_PS(Y,F,G,H);
331             Heps             = _mm_mul_ps(vfeps,H);
332             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
333             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
334             vvdw6            = _mm_mul_ps(c6_00,VV);
335             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
336             fvdw6            = _mm_mul_ps(c6_00,FF);
337
338             /* CUBIC SPLINE TABLE REPULSION */
339             vfitab           = _mm_add_epi32(vfitab,ifour);
340             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
341             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
342             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
343             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
344             _MM_TRANSPOSE4_PS(Y,F,G,H);
345             Heps             = _mm_mul_ps(vfeps,H);
346             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
347             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
348             vvdw12           = _mm_mul_ps(c12_00,VV);
349             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
350             fvdw12           = _mm_mul_ps(c12_00,FF);
351             vvdw             = _mm_add_ps(vvdw12,vvdw6);
352             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
353
354             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
355
356             /* Update potential sum for this i atom from the interaction with this j atom. */
357             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
358             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
359
360             fscal            = fvdw;
361
362             fscal            = _mm_and_ps(fscal,cutoff_mask);
363
364             /* Calculate temporary vectorial force */
365             tx               = _mm_mul_ps(fscal,dx00);
366             ty               = _mm_mul_ps(fscal,dy00);
367             tz               = _mm_mul_ps(fscal,dz00);
368
369             /* Update vectorial force */
370             fix0             = _mm_add_ps(fix0,tx);
371             fiy0             = _mm_add_ps(fiy0,ty);
372             fiz0             = _mm_add_ps(fiz0,tz);
373
374             fjx0             = _mm_add_ps(fjx0,tx);
375             fjy0             = _mm_add_ps(fjy0,ty);
376             fjz0             = _mm_add_ps(fjz0,tz);
377
378             }
379
380             /**************************
381              * CALCULATE INTERACTIONS *
382              **************************/
383
384             if (gmx_mm_any_lt(rsq11,rcutoff2))
385             {
386
387             /* REACTION-FIELD ELECTROSTATICS */
388             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
389             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
390
391             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velec            = _mm_and_ps(velec,cutoff_mask);
395             velecsum         = _mm_add_ps(velecsum,velec);
396
397             fscal            = felec;
398
399             fscal            = _mm_and_ps(fscal,cutoff_mask);
400
401             /* Calculate temporary vectorial force */
402             tx               = _mm_mul_ps(fscal,dx11);
403             ty               = _mm_mul_ps(fscal,dy11);
404             tz               = _mm_mul_ps(fscal,dz11);
405
406             /* Update vectorial force */
407             fix1             = _mm_add_ps(fix1,tx);
408             fiy1             = _mm_add_ps(fiy1,ty);
409             fiz1             = _mm_add_ps(fiz1,tz);
410
411             fjx1             = _mm_add_ps(fjx1,tx);
412             fjy1             = _mm_add_ps(fjy1,ty);
413             fjz1             = _mm_add_ps(fjz1,tz);
414
415             }
416
417             /**************************
418              * CALCULATE INTERACTIONS *
419              **************************/
420
421             if (gmx_mm_any_lt(rsq12,rcutoff2))
422             {
423
424             /* REACTION-FIELD ELECTROSTATICS */
425             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
426             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
427
428             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
429
430             /* Update potential sum for this i atom from the interaction with this j atom. */
431             velec            = _mm_and_ps(velec,cutoff_mask);
432             velecsum         = _mm_add_ps(velecsum,velec);
433
434             fscal            = felec;
435
436             fscal            = _mm_and_ps(fscal,cutoff_mask);
437
438             /* Calculate temporary vectorial force */
439             tx               = _mm_mul_ps(fscal,dx12);
440             ty               = _mm_mul_ps(fscal,dy12);
441             tz               = _mm_mul_ps(fscal,dz12);
442
443             /* Update vectorial force */
444             fix1             = _mm_add_ps(fix1,tx);
445             fiy1             = _mm_add_ps(fiy1,ty);
446             fiz1             = _mm_add_ps(fiz1,tz);
447
448             fjx2             = _mm_add_ps(fjx2,tx);
449             fjy2             = _mm_add_ps(fjy2,ty);
450             fjz2             = _mm_add_ps(fjz2,tz);
451
452             }
453
454             /**************************
455              * CALCULATE INTERACTIONS *
456              **************************/
457
458             if (gmx_mm_any_lt(rsq13,rcutoff2))
459             {
460
461             /* REACTION-FIELD ELECTROSTATICS */
462             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
463             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
464
465             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
466
467             /* Update potential sum for this i atom from the interaction with this j atom. */
468             velec            = _mm_and_ps(velec,cutoff_mask);
469             velecsum         = _mm_add_ps(velecsum,velec);
470
471             fscal            = felec;
472
473             fscal            = _mm_and_ps(fscal,cutoff_mask);
474
475             /* Calculate temporary vectorial force */
476             tx               = _mm_mul_ps(fscal,dx13);
477             ty               = _mm_mul_ps(fscal,dy13);
478             tz               = _mm_mul_ps(fscal,dz13);
479
480             /* Update vectorial force */
481             fix1             = _mm_add_ps(fix1,tx);
482             fiy1             = _mm_add_ps(fiy1,ty);
483             fiz1             = _mm_add_ps(fiz1,tz);
484
485             fjx3             = _mm_add_ps(fjx3,tx);
486             fjy3             = _mm_add_ps(fjy3,ty);
487             fjz3             = _mm_add_ps(fjz3,tz);
488
489             }
490
491             /**************************
492              * CALCULATE INTERACTIONS *
493              **************************/
494
495             if (gmx_mm_any_lt(rsq21,rcutoff2))
496             {
497
498             /* REACTION-FIELD ELECTROSTATICS */
499             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
500             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
501
502             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
503
504             /* Update potential sum for this i atom from the interaction with this j atom. */
505             velec            = _mm_and_ps(velec,cutoff_mask);
506             velecsum         = _mm_add_ps(velecsum,velec);
507
508             fscal            = felec;
509
510             fscal            = _mm_and_ps(fscal,cutoff_mask);
511
512             /* Calculate temporary vectorial force */
513             tx               = _mm_mul_ps(fscal,dx21);
514             ty               = _mm_mul_ps(fscal,dy21);
515             tz               = _mm_mul_ps(fscal,dz21);
516
517             /* Update vectorial force */
518             fix2             = _mm_add_ps(fix2,tx);
519             fiy2             = _mm_add_ps(fiy2,ty);
520             fiz2             = _mm_add_ps(fiz2,tz);
521
522             fjx1             = _mm_add_ps(fjx1,tx);
523             fjy1             = _mm_add_ps(fjy1,ty);
524             fjz1             = _mm_add_ps(fjz1,tz);
525
526             }
527
528             /**************************
529              * CALCULATE INTERACTIONS *
530              **************************/
531
532             if (gmx_mm_any_lt(rsq22,rcutoff2))
533             {
534
535             /* REACTION-FIELD ELECTROSTATICS */
536             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
537             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
538
539             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
540
541             /* Update potential sum for this i atom from the interaction with this j atom. */
542             velec            = _mm_and_ps(velec,cutoff_mask);
543             velecsum         = _mm_add_ps(velecsum,velec);
544
545             fscal            = felec;
546
547             fscal            = _mm_and_ps(fscal,cutoff_mask);
548
549             /* Calculate temporary vectorial force */
550             tx               = _mm_mul_ps(fscal,dx22);
551             ty               = _mm_mul_ps(fscal,dy22);
552             tz               = _mm_mul_ps(fscal,dz22);
553
554             /* Update vectorial force */
555             fix2             = _mm_add_ps(fix2,tx);
556             fiy2             = _mm_add_ps(fiy2,ty);
557             fiz2             = _mm_add_ps(fiz2,tz);
558
559             fjx2             = _mm_add_ps(fjx2,tx);
560             fjy2             = _mm_add_ps(fjy2,ty);
561             fjz2             = _mm_add_ps(fjz2,tz);
562
563             }
564
565             /**************************
566              * CALCULATE INTERACTIONS *
567              **************************/
568
569             if (gmx_mm_any_lt(rsq23,rcutoff2))
570             {
571
572             /* REACTION-FIELD ELECTROSTATICS */
573             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
574             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
575
576             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
577
578             /* Update potential sum for this i atom from the interaction with this j atom. */
579             velec            = _mm_and_ps(velec,cutoff_mask);
580             velecsum         = _mm_add_ps(velecsum,velec);
581
582             fscal            = felec;
583
584             fscal            = _mm_and_ps(fscal,cutoff_mask);
585
586             /* Calculate temporary vectorial force */
587             tx               = _mm_mul_ps(fscal,dx23);
588             ty               = _mm_mul_ps(fscal,dy23);
589             tz               = _mm_mul_ps(fscal,dz23);
590
591             /* Update vectorial force */
592             fix2             = _mm_add_ps(fix2,tx);
593             fiy2             = _mm_add_ps(fiy2,ty);
594             fiz2             = _mm_add_ps(fiz2,tz);
595
596             fjx3             = _mm_add_ps(fjx3,tx);
597             fjy3             = _mm_add_ps(fjy3,ty);
598             fjz3             = _mm_add_ps(fjz3,tz);
599
600             }
601
602             /**************************
603              * CALCULATE INTERACTIONS *
604              **************************/
605
606             if (gmx_mm_any_lt(rsq31,rcutoff2))
607             {
608
609             /* REACTION-FIELD ELECTROSTATICS */
610             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
611             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
612
613             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
614
615             /* Update potential sum for this i atom from the interaction with this j atom. */
616             velec            = _mm_and_ps(velec,cutoff_mask);
617             velecsum         = _mm_add_ps(velecsum,velec);
618
619             fscal            = felec;
620
621             fscal            = _mm_and_ps(fscal,cutoff_mask);
622
623             /* Calculate temporary vectorial force */
624             tx               = _mm_mul_ps(fscal,dx31);
625             ty               = _mm_mul_ps(fscal,dy31);
626             tz               = _mm_mul_ps(fscal,dz31);
627
628             /* Update vectorial force */
629             fix3             = _mm_add_ps(fix3,tx);
630             fiy3             = _mm_add_ps(fiy3,ty);
631             fiz3             = _mm_add_ps(fiz3,tz);
632
633             fjx1             = _mm_add_ps(fjx1,tx);
634             fjy1             = _mm_add_ps(fjy1,ty);
635             fjz1             = _mm_add_ps(fjz1,tz);
636
637             }
638
639             /**************************
640              * CALCULATE INTERACTIONS *
641              **************************/
642
643             if (gmx_mm_any_lt(rsq32,rcutoff2))
644             {
645
646             /* REACTION-FIELD ELECTROSTATICS */
647             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
648             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
649
650             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
651
652             /* Update potential sum for this i atom from the interaction with this j atom. */
653             velec            = _mm_and_ps(velec,cutoff_mask);
654             velecsum         = _mm_add_ps(velecsum,velec);
655
656             fscal            = felec;
657
658             fscal            = _mm_and_ps(fscal,cutoff_mask);
659
660             /* Calculate temporary vectorial force */
661             tx               = _mm_mul_ps(fscal,dx32);
662             ty               = _mm_mul_ps(fscal,dy32);
663             tz               = _mm_mul_ps(fscal,dz32);
664
665             /* Update vectorial force */
666             fix3             = _mm_add_ps(fix3,tx);
667             fiy3             = _mm_add_ps(fiy3,ty);
668             fiz3             = _mm_add_ps(fiz3,tz);
669
670             fjx2             = _mm_add_ps(fjx2,tx);
671             fjy2             = _mm_add_ps(fjy2,ty);
672             fjz2             = _mm_add_ps(fjz2,tz);
673
674             }
675
676             /**************************
677              * CALCULATE INTERACTIONS *
678              **************************/
679
680             if (gmx_mm_any_lt(rsq33,rcutoff2))
681             {
682
683             /* REACTION-FIELD ELECTROSTATICS */
684             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
685             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
686
687             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
688
689             /* Update potential sum for this i atom from the interaction with this j atom. */
690             velec            = _mm_and_ps(velec,cutoff_mask);
691             velecsum         = _mm_add_ps(velecsum,velec);
692
693             fscal            = felec;
694
695             fscal            = _mm_and_ps(fscal,cutoff_mask);
696
697             /* Calculate temporary vectorial force */
698             tx               = _mm_mul_ps(fscal,dx33);
699             ty               = _mm_mul_ps(fscal,dy33);
700             tz               = _mm_mul_ps(fscal,dz33);
701
702             /* Update vectorial force */
703             fix3             = _mm_add_ps(fix3,tx);
704             fiy3             = _mm_add_ps(fiy3,ty);
705             fiz3             = _mm_add_ps(fiz3,tz);
706
707             fjx3             = _mm_add_ps(fjx3,tx);
708             fjy3             = _mm_add_ps(fjy3,ty);
709             fjz3             = _mm_add_ps(fjz3,tz);
710
711             }
712
713             fjptrA             = f+j_coord_offsetA;
714             fjptrB             = f+j_coord_offsetB;
715             fjptrC             = f+j_coord_offsetC;
716             fjptrD             = f+j_coord_offsetD;
717
718             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
719                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
720                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
721
722             /* Inner loop uses 387 flops */
723         }
724
725         if(jidx<j_index_end)
726         {
727
728             /* Get j neighbor index, and coordinate index */
729             jnrlistA         = jjnr[jidx];
730             jnrlistB         = jjnr[jidx+1];
731             jnrlistC         = jjnr[jidx+2];
732             jnrlistD         = jjnr[jidx+3];
733             /* Sign of each element will be negative for non-real atoms.
734              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
735              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
736              */
737             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
738             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
739             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
740             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
741             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
742             j_coord_offsetA  = DIM*jnrA;
743             j_coord_offsetB  = DIM*jnrB;
744             j_coord_offsetC  = DIM*jnrC;
745             j_coord_offsetD  = DIM*jnrD;
746
747             /* load j atom coordinates */
748             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
749                                               x+j_coord_offsetC,x+j_coord_offsetD,
750                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
751                                               &jy2,&jz2,&jx3,&jy3,&jz3);
752
753             /* Calculate displacement vector */
754             dx00             = _mm_sub_ps(ix0,jx0);
755             dy00             = _mm_sub_ps(iy0,jy0);
756             dz00             = _mm_sub_ps(iz0,jz0);
757             dx11             = _mm_sub_ps(ix1,jx1);
758             dy11             = _mm_sub_ps(iy1,jy1);
759             dz11             = _mm_sub_ps(iz1,jz1);
760             dx12             = _mm_sub_ps(ix1,jx2);
761             dy12             = _mm_sub_ps(iy1,jy2);
762             dz12             = _mm_sub_ps(iz1,jz2);
763             dx13             = _mm_sub_ps(ix1,jx3);
764             dy13             = _mm_sub_ps(iy1,jy3);
765             dz13             = _mm_sub_ps(iz1,jz3);
766             dx21             = _mm_sub_ps(ix2,jx1);
767             dy21             = _mm_sub_ps(iy2,jy1);
768             dz21             = _mm_sub_ps(iz2,jz1);
769             dx22             = _mm_sub_ps(ix2,jx2);
770             dy22             = _mm_sub_ps(iy2,jy2);
771             dz22             = _mm_sub_ps(iz2,jz2);
772             dx23             = _mm_sub_ps(ix2,jx3);
773             dy23             = _mm_sub_ps(iy2,jy3);
774             dz23             = _mm_sub_ps(iz2,jz3);
775             dx31             = _mm_sub_ps(ix3,jx1);
776             dy31             = _mm_sub_ps(iy3,jy1);
777             dz31             = _mm_sub_ps(iz3,jz1);
778             dx32             = _mm_sub_ps(ix3,jx2);
779             dy32             = _mm_sub_ps(iy3,jy2);
780             dz32             = _mm_sub_ps(iz3,jz2);
781             dx33             = _mm_sub_ps(ix3,jx3);
782             dy33             = _mm_sub_ps(iy3,jy3);
783             dz33             = _mm_sub_ps(iz3,jz3);
784
785             /* Calculate squared distance and things based on it */
786             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
787             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
788             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
789             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
790             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
791             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
792             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
793             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
794             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
795             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
796
797             rinv00           = gmx_mm_invsqrt_ps(rsq00);
798             rinv11           = gmx_mm_invsqrt_ps(rsq11);
799             rinv12           = gmx_mm_invsqrt_ps(rsq12);
800             rinv13           = gmx_mm_invsqrt_ps(rsq13);
801             rinv21           = gmx_mm_invsqrt_ps(rsq21);
802             rinv22           = gmx_mm_invsqrt_ps(rsq22);
803             rinv23           = gmx_mm_invsqrt_ps(rsq23);
804             rinv31           = gmx_mm_invsqrt_ps(rsq31);
805             rinv32           = gmx_mm_invsqrt_ps(rsq32);
806             rinv33           = gmx_mm_invsqrt_ps(rsq33);
807
808             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
809             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
810             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
811             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
812             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
813             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
814             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
815             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
816             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
817
818             fjx0             = _mm_setzero_ps();
819             fjy0             = _mm_setzero_ps();
820             fjz0             = _mm_setzero_ps();
821             fjx1             = _mm_setzero_ps();
822             fjy1             = _mm_setzero_ps();
823             fjz1             = _mm_setzero_ps();
824             fjx2             = _mm_setzero_ps();
825             fjy2             = _mm_setzero_ps();
826             fjz2             = _mm_setzero_ps();
827             fjx3             = _mm_setzero_ps();
828             fjy3             = _mm_setzero_ps();
829             fjz3             = _mm_setzero_ps();
830
831             /**************************
832              * CALCULATE INTERACTIONS *
833              **************************/
834
835             if (gmx_mm_any_lt(rsq00,rcutoff2))
836             {
837
838             r00              = _mm_mul_ps(rsq00,rinv00);
839             r00              = _mm_andnot_ps(dummy_mask,r00);
840
841             /* Calculate table index by multiplying r with table scale and truncate to integer */
842             rt               = _mm_mul_ps(r00,vftabscale);
843             vfitab           = _mm_cvttps_epi32(rt);
844             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
845             vfitab           = _mm_slli_epi32(vfitab,3);
846
847             /* CUBIC SPLINE TABLE DISPERSION */
848             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
849             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
850             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
851             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
852             _MM_TRANSPOSE4_PS(Y,F,G,H);
853             Heps             = _mm_mul_ps(vfeps,H);
854             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
855             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
856             vvdw6            = _mm_mul_ps(c6_00,VV);
857             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
858             fvdw6            = _mm_mul_ps(c6_00,FF);
859
860             /* CUBIC SPLINE TABLE REPULSION */
861             vfitab           = _mm_add_epi32(vfitab,ifour);
862             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
863             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
864             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
865             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
866             _MM_TRANSPOSE4_PS(Y,F,G,H);
867             Heps             = _mm_mul_ps(vfeps,H);
868             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
869             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
870             vvdw12           = _mm_mul_ps(c12_00,VV);
871             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
872             fvdw12           = _mm_mul_ps(c12_00,FF);
873             vvdw             = _mm_add_ps(vvdw12,vvdw6);
874             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
875
876             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
877
878             /* Update potential sum for this i atom from the interaction with this j atom. */
879             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
880             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
881             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
882
883             fscal            = fvdw;
884
885             fscal            = _mm_and_ps(fscal,cutoff_mask);
886
887             fscal            = _mm_andnot_ps(dummy_mask,fscal);
888
889             /* Calculate temporary vectorial force */
890             tx               = _mm_mul_ps(fscal,dx00);
891             ty               = _mm_mul_ps(fscal,dy00);
892             tz               = _mm_mul_ps(fscal,dz00);
893
894             /* Update vectorial force */
895             fix0             = _mm_add_ps(fix0,tx);
896             fiy0             = _mm_add_ps(fiy0,ty);
897             fiz0             = _mm_add_ps(fiz0,tz);
898
899             fjx0             = _mm_add_ps(fjx0,tx);
900             fjy0             = _mm_add_ps(fjy0,ty);
901             fjz0             = _mm_add_ps(fjz0,tz);
902
903             }
904
905             /**************************
906              * CALCULATE INTERACTIONS *
907              **************************/
908
909             if (gmx_mm_any_lt(rsq11,rcutoff2))
910             {
911
912             /* REACTION-FIELD ELECTROSTATICS */
913             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
914             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
915
916             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
917
918             /* Update potential sum for this i atom from the interaction with this j atom. */
919             velec            = _mm_and_ps(velec,cutoff_mask);
920             velec            = _mm_andnot_ps(dummy_mask,velec);
921             velecsum         = _mm_add_ps(velecsum,velec);
922
923             fscal            = felec;
924
925             fscal            = _mm_and_ps(fscal,cutoff_mask);
926
927             fscal            = _mm_andnot_ps(dummy_mask,fscal);
928
929             /* Calculate temporary vectorial force */
930             tx               = _mm_mul_ps(fscal,dx11);
931             ty               = _mm_mul_ps(fscal,dy11);
932             tz               = _mm_mul_ps(fscal,dz11);
933
934             /* Update vectorial force */
935             fix1             = _mm_add_ps(fix1,tx);
936             fiy1             = _mm_add_ps(fiy1,ty);
937             fiz1             = _mm_add_ps(fiz1,tz);
938
939             fjx1             = _mm_add_ps(fjx1,tx);
940             fjy1             = _mm_add_ps(fjy1,ty);
941             fjz1             = _mm_add_ps(fjz1,tz);
942
943             }
944
945             /**************************
946              * CALCULATE INTERACTIONS *
947              **************************/
948
949             if (gmx_mm_any_lt(rsq12,rcutoff2))
950             {
951
952             /* REACTION-FIELD ELECTROSTATICS */
953             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
954             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
955
956             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
957
958             /* Update potential sum for this i atom from the interaction with this j atom. */
959             velec            = _mm_and_ps(velec,cutoff_mask);
960             velec            = _mm_andnot_ps(dummy_mask,velec);
961             velecsum         = _mm_add_ps(velecsum,velec);
962
963             fscal            = felec;
964
965             fscal            = _mm_and_ps(fscal,cutoff_mask);
966
967             fscal            = _mm_andnot_ps(dummy_mask,fscal);
968
969             /* Calculate temporary vectorial force */
970             tx               = _mm_mul_ps(fscal,dx12);
971             ty               = _mm_mul_ps(fscal,dy12);
972             tz               = _mm_mul_ps(fscal,dz12);
973
974             /* Update vectorial force */
975             fix1             = _mm_add_ps(fix1,tx);
976             fiy1             = _mm_add_ps(fiy1,ty);
977             fiz1             = _mm_add_ps(fiz1,tz);
978
979             fjx2             = _mm_add_ps(fjx2,tx);
980             fjy2             = _mm_add_ps(fjy2,ty);
981             fjz2             = _mm_add_ps(fjz2,tz);
982
983             }
984
985             /**************************
986              * CALCULATE INTERACTIONS *
987              **************************/
988
989             if (gmx_mm_any_lt(rsq13,rcutoff2))
990             {
991
992             /* REACTION-FIELD ELECTROSTATICS */
993             velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
994             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
995
996             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
997
998             /* Update potential sum for this i atom from the interaction with this j atom. */
999             velec            = _mm_and_ps(velec,cutoff_mask);
1000             velec            = _mm_andnot_ps(dummy_mask,velec);
1001             velecsum         = _mm_add_ps(velecsum,velec);
1002
1003             fscal            = felec;
1004
1005             fscal            = _mm_and_ps(fscal,cutoff_mask);
1006
1007             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1008
1009             /* Calculate temporary vectorial force */
1010             tx               = _mm_mul_ps(fscal,dx13);
1011             ty               = _mm_mul_ps(fscal,dy13);
1012             tz               = _mm_mul_ps(fscal,dz13);
1013
1014             /* Update vectorial force */
1015             fix1             = _mm_add_ps(fix1,tx);
1016             fiy1             = _mm_add_ps(fiy1,ty);
1017             fiz1             = _mm_add_ps(fiz1,tz);
1018
1019             fjx3             = _mm_add_ps(fjx3,tx);
1020             fjy3             = _mm_add_ps(fjy3,ty);
1021             fjz3             = _mm_add_ps(fjz3,tz);
1022
1023             }
1024
1025             /**************************
1026              * CALCULATE INTERACTIONS *
1027              **************************/
1028
1029             if (gmx_mm_any_lt(rsq21,rcutoff2))
1030             {
1031
1032             /* REACTION-FIELD ELECTROSTATICS */
1033             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1034             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1035
1036             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1037
1038             /* Update potential sum for this i atom from the interaction with this j atom. */
1039             velec            = _mm_and_ps(velec,cutoff_mask);
1040             velec            = _mm_andnot_ps(dummy_mask,velec);
1041             velecsum         = _mm_add_ps(velecsum,velec);
1042
1043             fscal            = felec;
1044
1045             fscal            = _mm_and_ps(fscal,cutoff_mask);
1046
1047             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1048
1049             /* Calculate temporary vectorial force */
1050             tx               = _mm_mul_ps(fscal,dx21);
1051             ty               = _mm_mul_ps(fscal,dy21);
1052             tz               = _mm_mul_ps(fscal,dz21);
1053
1054             /* Update vectorial force */
1055             fix2             = _mm_add_ps(fix2,tx);
1056             fiy2             = _mm_add_ps(fiy2,ty);
1057             fiz2             = _mm_add_ps(fiz2,tz);
1058
1059             fjx1             = _mm_add_ps(fjx1,tx);
1060             fjy1             = _mm_add_ps(fjy1,ty);
1061             fjz1             = _mm_add_ps(fjz1,tz);
1062
1063             }
1064
1065             /**************************
1066              * CALCULATE INTERACTIONS *
1067              **************************/
1068
1069             if (gmx_mm_any_lt(rsq22,rcutoff2))
1070             {
1071
1072             /* REACTION-FIELD ELECTROSTATICS */
1073             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1074             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1075
1076             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1077
1078             /* Update potential sum for this i atom from the interaction with this j atom. */
1079             velec            = _mm_and_ps(velec,cutoff_mask);
1080             velec            = _mm_andnot_ps(dummy_mask,velec);
1081             velecsum         = _mm_add_ps(velecsum,velec);
1082
1083             fscal            = felec;
1084
1085             fscal            = _mm_and_ps(fscal,cutoff_mask);
1086
1087             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1088
1089             /* Calculate temporary vectorial force */
1090             tx               = _mm_mul_ps(fscal,dx22);
1091             ty               = _mm_mul_ps(fscal,dy22);
1092             tz               = _mm_mul_ps(fscal,dz22);
1093
1094             /* Update vectorial force */
1095             fix2             = _mm_add_ps(fix2,tx);
1096             fiy2             = _mm_add_ps(fiy2,ty);
1097             fiz2             = _mm_add_ps(fiz2,tz);
1098
1099             fjx2             = _mm_add_ps(fjx2,tx);
1100             fjy2             = _mm_add_ps(fjy2,ty);
1101             fjz2             = _mm_add_ps(fjz2,tz);
1102
1103             }
1104
1105             /**************************
1106              * CALCULATE INTERACTIONS *
1107              **************************/
1108
1109             if (gmx_mm_any_lt(rsq23,rcutoff2))
1110             {
1111
1112             /* REACTION-FIELD ELECTROSTATICS */
1113             velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1114             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1115
1116             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1117
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _mm_and_ps(velec,cutoff_mask);
1120             velec            = _mm_andnot_ps(dummy_mask,velec);
1121             velecsum         = _mm_add_ps(velecsum,velec);
1122
1123             fscal            = felec;
1124
1125             fscal            = _mm_and_ps(fscal,cutoff_mask);
1126
1127             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1128
1129             /* Calculate temporary vectorial force */
1130             tx               = _mm_mul_ps(fscal,dx23);
1131             ty               = _mm_mul_ps(fscal,dy23);
1132             tz               = _mm_mul_ps(fscal,dz23);
1133
1134             /* Update vectorial force */
1135             fix2             = _mm_add_ps(fix2,tx);
1136             fiy2             = _mm_add_ps(fiy2,ty);
1137             fiz2             = _mm_add_ps(fiz2,tz);
1138
1139             fjx3             = _mm_add_ps(fjx3,tx);
1140             fjy3             = _mm_add_ps(fjy3,ty);
1141             fjz3             = _mm_add_ps(fjz3,tz);
1142
1143             }
1144
1145             /**************************
1146              * CALCULATE INTERACTIONS *
1147              **************************/
1148
1149             if (gmx_mm_any_lt(rsq31,rcutoff2))
1150             {
1151
1152             /* REACTION-FIELD ELECTROSTATICS */
1153             velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1154             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1155
1156             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1157
1158             /* Update potential sum for this i atom from the interaction with this j atom. */
1159             velec            = _mm_and_ps(velec,cutoff_mask);
1160             velec            = _mm_andnot_ps(dummy_mask,velec);
1161             velecsum         = _mm_add_ps(velecsum,velec);
1162
1163             fscal            = felec;
1164
1165             fscal            = _mm_and_ps(fscal,cutoff_mask);
1166
1167             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1168
1169             /* Calculate temporary vectorial force */
1170             tx               = _mm_mul_ps(fscal,dx31);
1171             ty               = _mm_mul_ps(fscal,dy31);
1172             tz               = _mm_mul_ps(fscal,dz31);
1173
1174             /* Update vectorial force */
1175             fix3             = _mm_add_ps(fix3,tx);
1176             fiy3             = _mm_add_ps(fiy3,ty);
1177             fiz3             = _mm_add_ps(fiz3,tz);
1178
1179             fjx1             = _mm_add_ps(fjx1,tx);
1180             fjy1             = _mm_add_ps(fjy1,ty);
1181             fjz1             = _mm_add_ps(fjz1,tz);
1182
1183             }
1184
1185             /**************************
1186              * CALCULATE INTERACTIONS *
1187              **************************/
1188
1189             if (gmx_mm_any_lt(rsq32,rcutoff2))
1190             {
1191
1192             /* REACTION-FIELD ELECTROSTATICS */
1193             velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1194             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1195
1196             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1197
1198             /* Update potential sum for this i atom from the interaction with this j atom. */
1199             velec            = _mm_and_ps(velec,cutoff_mask);
1200             velec            = _mm_andnot_ps(dummy_mask,velec);
1201             velecsum         = _mm_add_ps(velecsum,velec);
1202
1203             fscal            = felec;
1204
1205             fscal            = _mm_and_ps(fscal,cutoff_mask);
1206
1207             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1208
1209             /* Calculate temporary vectorial force */
1210             tx               = _mm_mul_ps(fscal,dx32);
1211             ty               = _mm_mul_ps(fscal,dy32);
1212             tz               = _mm_mul_ps(fscal,dz32);
1213
1214             /* Update vectorial force */
1215             fix3             = _mm_add_ps(fix3,tx);
1216             fiy3             = _mm_add_ps(fiy3,ty);
1217             fiz3             = _mm_add_ps(fiz3,tz);
1218
1219             fjx2             = _mm_add_ps(fjx2,tx);
1220             fjy2             = _mm_add_ps(fjy2,ty);
1221             fjz2             = _mm_add_ps(fjz2,tz);
1222
1223             }
1224
1225             /**************************
1226              * CALCULATE INTERACTIONS *
1227              **************************/
1228
1229             if (gmx_mm_any_lt(rsq33,rcutoff2))
1230             {
1231
1232             /* REACTION-FIELD ELECTROSTATICS */
1233             velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1234             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1235
1236             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1237
1238             /* Update potential sum for this i atom from the interaction with this j atom. */
1239             velec            = _mm_and_ps(velec,cutoff_mask);
1240             velec            = _mm_andnot_ps(dummy_mask,velec);
1241             velecsum         = _mm_add_ps(velecsum,velec);
1242
1243             fscal            = felec;
1244
1245             fscal            = _mm_and_ps(fscal,cutoff_mask);
1246
1247             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1248
1249             /* Calculate temporary vectorial force */
1250             tx               = _mm_mul_ps(fscal,dx33);
1251             ty               = _mm_mul_ps(fscal,dy33);
1252             tz               = _mm_mul_ps(fscal,dz33);
1253
1254             /* Update vectorial force */
1255             fix3             = _mm_add_ps(fix3,tx);
1256             fiy3             = _mm_add_ps(fiy3,ty);
1257             fiz3             = _mm_add_ps(fiz3,tz);
1258
1259             fjx3             = _mm_add_ps(fjx3,tx);
1260             fjy3             = _mm_add_ps(fjy3,ty);
1261             fjz3             = _mm_add_ps(fjz3,tz);
1262
1263             }
1264
1265             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1266             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1267             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1268             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1269
1270             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1271                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1272                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1273
1274             /* Inner loop uses 388 flops */
1275         }
1276
1277         /* End of innermost loop */
1278
1279         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1280                                               f+i_coord_offset,fshift+i_shift_offset);
1281
1282         ggid                        = gid[iidx];
1283         /* Update potential energies */
1284         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1285         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1286
1287         /* Increment number of inner iterations */
1288         inneriter                  += j_index_end - j_index_start;
1289
1290         /* Outer loop uses 26 flops */
1291     }
1292
1293     /* Increment number of outer iterations */
1294     outeriter        += nri;
1295
1296     /* Update outer/inner flops */
1297
1298     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*388);
1299 }
1300 /*
1301  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_single
1302  * Electrostatics interaction: ReactionField
1303  * VdW interaction:            CubicSplineTable
1304  * Geometry:                   Water4-Water4
1305  * Calculate force/pot:        Force
1306  */
1307 void
1308 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_single
1309                     (t_nblist * gmx_restrict                nlist,
1310                      rvec * gmx_restrict                    xx,
1311                      rvec * gmx_restrict                    ff,
1312                      t_forcerec * gmx_restrict              fr,
1313                      t_mdatoms * gmx_restrict               mdatoms,
1314                      nb_kernel_data_t * gmx_restrict        kernel_data,
1315                      t_nrnb * gmx_restrict                  nrnb)
1316 {
1317     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1318      * just 0 for non-waters.
1319      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1320      * jnr indices corresponding to data put in the four positions in the SIMD register.
1321      */
1322     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1323     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1324     int              jnrA,jnrB,jnrC,jnrD;
1325     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1326     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1327     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1328     real             rcutoff_scalar;
1329     real             *shiftvec,*fshift,*x,*f;
1330     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1331     real             scratch[4*DIM];
1332     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1333     int              vdwioffset0;
1334     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1335     int              vdwioffset1;
1336     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1337     int              vdwioffset2;
1338     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1339     int              vdwioffset3;
1340     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1341     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1342     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1343     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1344     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1345     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1346     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1347     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1348     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1349     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1350     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1351     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1352     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1353     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1354     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1355     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1356     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1357     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1358     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1359     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1360     real             *charge;
1361     int              nvdwtype;
1362     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1363     int              *vdwtype;
1364     real             *vdwparam;
1365     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1366     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1367     __m128i          vfitab;
1368     __m128i          ifour       = _mm_set1_epi32(4);
1369     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1370     real             *vftab;
1371     __m128           dummy_mask,cutoff_mask;
1372     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1373     __m128           one     = _mm_set1_ps(1.0);
1374     __m128           two     = _mm_set1_ps(2.0);
1375     x                = xx[0];
1376     f                = ff[0];
1377
1378     nri              = nlist->nri;
1379     iinr             = nlist->iinr;
1380     jindex           = nlist->jindex;
1381     jjnr             = nlist->jjnr;
1382     shiftidx         = nlist->shift;
1383     gid              = nlist->gid;
1384     shiftvec         = fr->shift_vec[0];
1385     fshift           = fr->fshift[0];
1386     facel            = _mm_set1_ps(fr->epsfac);
1387     charge           = mdatoms->chargeA;
1388     krf              = _mm_set1_ps(fr->ic->k_rf);
1389     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
1390     crf              = _mm_set1_ps(fr->ic->c_rf);
1391     nvdwtype         = fr->ntype;
1392     vdwparam         = fr->nbfp;
1393     vdwtype          = mdatoms->typeA;
1394
1395     vftab            = kernel_data->table_vdw->data;
1396     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1397
1398     /* Setup water-specific parameters */
1399     inr              = nlist->iinr[0];
1400     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1401     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1402     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1403     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1404
1405     jq1              = _mm_set1_ps(charge[inr+1]);
1406     jq2              = _mm_set1_ps(charge[inr+2]);
1407     jq3              = _mm_set1_ps(charge[inr+3]);
1408     vdwjidx0A        = 2*vdwtype[inr+0];
1409     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1410     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1411     qq11             = _mm_mul_ps(iq1,jq1);
1412     qq12             = _mm_mul_ps(iq1,jq2);
1413     qq13             = _mm_mul_ps(iq1,jq3);
1414     qq21             = _mm_mul_ps(iq2,jq1);
1415     qq22             = _mm_mul_ps(iq2,jq2);
1416     qq23             = _mm_mul_ps(iq2,jq3);
1417     qq31             = _mm_mul_ps(iq3,jq1);
1418     qq32             = _mm_mul_ps(iq3,jq2);
1419     qq33             = _mm_mul_ps(iq3,jq3);
1420
1421     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1422     rcutoff_scalar   = fr->rcoulomb;
1423     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1424     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1425
1426     /* Avoid stupid compiler warnings */
1427     jnrA = jnrB = jnrC = jnrD = 0;
1428     j_coord_offsetA = 0;
1429     j_coord_offsetB = 0;
1430     j_coord_offsetC = 0;
1431     j_coord_offsetD = 0;
1432
1433     outeriter        = 0;
1434     inneriter        = 0;
1435
1436     for(iidx=0;iidx<4*DIM;iidx++)
1437     {
1438         scratch[iidx] = 0.0;
1439     }
1440
1441     /* Start outer loop over neighborlists */
1442     for(iidx=0; iidx<nri; iidx++)
1443     {
1444         /* Load shift vector for this list */
1445         i_shift_offset   = DIM*shiftidx[iidx];
1446
1447         /* Load limits for loop over neighbors */
1448         j_index_start    = jindex[iidx];
1449         j_index_end      = jindex[iidx+1];
1450
1451         /* Get outer coordinate index */
1452         inr              = iinr[iidx];
1453         i_coord_offset   = DIM*inr;
1454
1455         /* Load i particle coords and add shift vector */
1456         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1457                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1458
1459         fix0             = _mm_setzero_ps();
1460         fiy0             = _mm_setzero_ps();
1461         fiz0             = _mm_setzero_ps();
1462         fix1             = _mm_setzero_ps();
1463         fiy1             = _mm_setzero_ps();
1464         fiz1             = _mm_setzero_ps();
1465         fix2             = _mm_setzero_ps();
1466         fiy2             = _mm_setzero_ps();
1467         fiz2             = _mm_setzero_ps();
1468         fix3             = _mm_setzero_ps();
1469         fiy3             = _mm_setzero_ps();
1470         fiz3             = _mm_setzero_ps();
1471
1472         /* Start inner kernel loop */
1473         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1474         {
1475
1476             /* Get j neighbor index, and coordinate index */
1477             jnrA             = jjnr[jidx];
1478             jnrB             = jjnr[jidx+1];
1479             jnrC             = jjnr[jidx+2];
1480             jnrD             = jjnr[jidx+3];
1481             j_coord_offsetA  = DIM*jnrA;
1482             j_coord_offsetB  = DIM*jnrB;
1483             j_coord_offsetC  = DIM*jnrC;
1484             j_coord_offsetD  = DIM*jnrD;
1485
1486             /* load j atom coordinates */
1487             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1488                                               x+j_coord_offsetC,x+j_coord_offsetD,
1489                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1490                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1491
1492             /* Calculate displacement vector */
1493             dx00             = _mm_sub_ps(ix0,jx0);
1494             dy00             = _mm_sub_ps(iy0,jy0);
1495             dz00             = _mm_sub_ps(iz0,jz0);
1496             dx11             = _mm_sub_ps(ix1,jx1);
1497             dy11             = _mm_sub_ps(iy1,jy1);
1498             dz11             = _mm_sub_ps(iz1,jz1);
1499             dx12             = _mm_sub_ps(ix1,jx2);
1500             dy12             = _mm_sub_ps(iy1,jy2);
1501             dz12             = _mm_sub_ps(iz1,jz2);
1502             dx13             = _mm_sub_ps(ix1,jx3);
1503             dy13             = _mm_sub_ps(iy1,jy3);
1504             dz13             = _mm_sub_ps(iz1,jz3);
1505             dx21             = _mm_sub_ps(ix2,jx1);
1506             dy21             = _mm_sub_ps(iy2,jy1);
1507             dz21             = _mm_sub_ps(iz2,jz1);
1508             dx22             = _mm_sub_ps(ix2,jx2);
1509             dy22             = _mm_sub_ps(iy2,jy2);
1510             dz22             = _mm_sub_ps(iz2,jz2);
1511             dx23             = _mm_sub_ps(ix2,jx3);
1512             dy23             = _mm_sub_ps(iy2,jy3);
1513             dz23             = _mm_sub_ps(iz2,jz3);
1514             dx31             = _mm_sub_ps(ix3,jx1);
1515             dy31             = _mm_sub_ps(iy3,jy1);
1516             dz31             = _mm_sub_ps(iz3,jz1);
1517             dx32             = _mm_sub_ps(ix3,jx2);
1518             dy32             = _mm_sub_ps(iy3,jy2);
1519             dz32             = _mm_sub_ps(iz3,jz2);
1520             dx33             = _mm_sub_ps(ix3,jx3);
1521             dy33             = _mm_sub_ps(iy3,jy3);
1522             dz33             = _mm_sub_ps(iz3,jz3);
1523
1524             /* Calculate squared distance and things based on it */
1525             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1526             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1527             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1528             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1529             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1530             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1531             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1532             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1533             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1534             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1535
1536             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1537             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1538             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1539             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1540             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1541             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1542             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1543             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1544             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1545             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1546
1547             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1548             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1549             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1550             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1551             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1552             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1553             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1554             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1555             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1556
1557             fjx0             = _mm_setzero_ps();
1558             fjy0             = _mm_setzero_ps();
1559             fjz0             = _mm_setzero_ps();
1560             fjx1             = _mm_setzero_ps();
1561             fjy1             = _mm_setzero_ps();
1562             fjz1             = _mm_setzero_ps();
1563             fjx2             = _mm_setzero_ps();
1564             fjy2             = _mm_setzero_ps();
1565             fjz2             = _mm_setzero_ps();
1566             fjx3             = _mm_setzero_ps();
1567             fjy3             = _mm_setzero_ps();
1568             fjz3             = _mm_setzero_ps();
1569
1570             /**************************
1571              * CALCULATE INTERACTIONS *
1572              **************************/
1573
1574             if (gmx_mm_any_lt(rsq00,rcutoff2))
1575             {
1576
1577             r00              = _mm_mul_ps(rsq00,rinv00);
1578
1579             /* Calculate table index by multiplying r with table scale and truncate to integer */
1580             rt               = _mm_mul_ps(r00,vftabscale);
1581             vfitab           = _mm_cvttps_epi32(rt);
1582             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1583             vfitab           = _mm_slli_epi32(vfitab,3);
1584
1585             /* CUBIC SPLINE TABLE DISPERSION */
1586             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1587             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1588             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1589             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1590             _MM_TRANSPOSE4_PS(Y,F,G,H);
1591             Heps             = _mm_mul_ps(vfeps,H);
1592             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1593             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1594             fvdw6            = _mm_mul_ps(c6_00,FF);
1595
1596             /* CUBIC SPLINE TABLE REPULSION */
1597             vfitab           = _mm_add_epi32(vfitab,ifour);
1598             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1599             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1600             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1601             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1602             _MM_TRANSPOSE4_PS(Y,F,G,H);
1603             Heps             = _mm_mul_ps(vfeps,H);
1604             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1605             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1606             fvdw12           = _mm_mul_ps(c12_00,FF);
1607             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1608
1609             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1610
1611             fscal            = fvdw;
1612
1613             fscal            = _mm_and_ps(fscal,cutoff_mask);
1614
1615             /* Calculate temporary vectorial force */
1616             tx               = _mm_mul_ps(fscal,dx00);
1617             ty               = _mm_mul_ps(fscal,dy00);
1618             tz               = _mm_mul_ps(fscal,dz00);
1619
1620             /* Update vectorial force */
1621             fix0             = _mm_add_ps(fix0,tx);
1622             fiy0             = _mm_add_ps(fiy0,ty);
1623             fiz0             = _mm_add_ps(fiz0,tz);
1624
1625             fjx0             = _mm_add_ps(fjx0,tx);
1626             fjy0             = _mm_add_ps(fjy0,ty);
1627             fjz0             = _mm_add_ps(fjz0,tz);
1628
1629             }
1630
1631             /**************************
1632              * CALCULATE INTERACTIONS *
1633              **************************/
1634
1635             if (gmx_mm_any_lt(rsq11,rcutoff2))
1636             {
1637
1638             /* REACTION-FIELD ELECTROSTATICS */
1639             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1640
1641             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1642
1643             fscal            = felec;
1644
1645             fscal            = _mm_and_ps(fscal,cutoff_mask);
1646
1647             /* Calculate temporary vectorial force */
1648             tx               = _mm_mul_ps(fscal,dx11);
1649             ty               = _mm_mul_ps(fscal,dy11);
1650             tz               = _mm_mul_ps(fscal,dz11);
1651
1652             /* Update vectorial force */
1653             fix1             = _mm_add_ps(fix1,tx);
1654             fiy1             = _mm_add_ps(fiy1,ty);
1655             fiz1             = _mm_add_ps(fiz1,tz);
1656
1657             fjx1             = _mm_add_ps(fjx1,tx);
1658             fjy1             = _mm_add_ps(fjy1,ty);
1659             fjz1             = _mm_add_ps(fjz1,tz);
1660
1661             }
1662
1663             /**************************
1664              * CALCULATE INTERACTIONS *
1665              **************************/
1666
1667             if (gmx_mm_any_lt(rsq12,rcutoff2))
1668             {
1669
1670             /* REACTION-FIELD ELECTROSTATICS */
1671             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1672
1673             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1674
1675             fscal            = felec;
1676
1677             fscal            = _mm_and_ps(fscal,cutoff_mask);
1678
1679             /* Calculate temporary vectorial force */
1680             tx               = _mm_mul_ps(fscal,dx12);
1681             ty               = _mm_mul_ps(fscal,dy12);
1682             tz               = _mm_mul_ps(fscal,dz12);
1683
1684             /* Update vectorial force */
1685             fix1             = _mm_add_ps(fix1,tx);
1686             fiy1             = _mm_add_ps(fiy1,ty);
1687             fiz1             = _mm_add_ps(fiz1,tz);
1688
1689             fjx2             = _mm_add_ps(fjx2,tx);
1690             fjy2             = _mm_add_ps(fjy2,ty);
1691             fjz2             = _mm_add_ps(fjz2,tz);
1692
1693             }
1694
1695             /**************************
1696              * CALCULATE INTERACTIONS *
1697              **************************/
1698
1699             if (gmx_mm_any_lt(rsq13,rcutoff2))
1700             {
1701
1702             /* REACTION-FIELD ELECTROSTATICS */
1703             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1704
1705             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1706
1707             fscal            = felec;
1708
1709             fscal            = _mm_and_ps(fscal,cutoff_mask);
1710
1711             /* Calculate temporary vectorial force */
1712             tx               = _mm_mul_ps(fscal,dx13);
1713             ty               = _mm_mul_ps(fscal,dy13);
1714             tz               = _mm_mul_ps(fscal,dz13);
1715
1716             /* Update vectorial force */
1717             fix1             = _mm_add_ps(fix1,tx);
1718             fiy1             = _mm_add_ps(fiy1,ty);
1719             fiz1             = _mm_add_ps(fiz1,tz);
1720
1721             fjx3             = _mm_add_ps(fjx3,tx);
1722             fjy3             = _mm_add_ps(fjy3,ty);
1723             fjz3             = _mm_add_ps(fjz3,tz);
1724
1725             }
1726
1727             /**************************
1728              * CALCULATE INTERACTIONS *
1729              **************************/
1730
1731             if (gmx_mm_any_lt(rsq21,rcutoff2))
1732             {
1733
1734             /* REACTION-FIELD ELECTROSTATICS */
1735             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1736
1737             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1738
1739             fscal            = felec;
1740
1741             fscal            = _mm_and_ps(fscal,cutoff_mask);
1742
1743             /* Calculate temporary vectorial force */
1744             tx               = _mm_mul_ps(fscal,dx21);
1745             ty               = _mm_mul_ps(fscal,dy21);
1746             tz               = _mm_mul_ps(fscal,dz21);
1747
1748             /* Update vectorial force */
1749             fix2             = _mm_add_ps(fix2,tx);
1750             fiy2             = _mm_add_ps(fiy2,ty);
1751             fiz2             = _mm_add_ps(fiz2,tz);
1752
1753             fjx1             = _mm_add_ps(fjx1,tx);
1754             fjy1             = _mm_add_ps(fjy1,ty);
1755             fjz1             = _mm_add_ps(fjz1,tz);
1756
1757             }
1758
1759             /**************************
1760              * CALCULATE INTERACTIONS *
1761              **************************/
1762
1763             if (gmx_mm_any_lt(rsq22,rcutoff2))
1764             {
1765
1766             /* REACTION-FIELD ELECTROSTATICS */
1767             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1768
1769             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1770
1771             fscal            = felec;
1772
1773             fscal            = _mm_and_ps(fscal,cutoff_mask);
1774
1775             /* Calculate temporary vectorial force */
1776             tx               = _mm_mul_ps(fscal,dx22);
1777             ty               = _mm_mul_ps(fscal,dy22);
1778             tz               = _mm_mul_ps(fscal,dz22);
1779
1780             /* Update vectorial force */
1781             fix2             = _mm_add_ps(fix2,tx);
1782             fiy2             = _mm_add_ps(fiy2,ty);
1783             fiz2             = _mm_add_ps(fiz2,tz);
1784
1785             fjx2             = _mm_add_ps(fjx2,tx);
1786             fjy2             = _mm_add_ps(fjy2,ty);
1787             fjz2             = _mm_add_ps(fjz2,tz);
1788
1789             }
1790
1791             /**************************
1792              * CALCULATE INTERACTIONS *
1793              **************************/
1794
1795             if (gmx_mm_any_lt(rsq23,rcutoff2))
1796             {
1797
1798             /* REACTION-FIELD ELECTROSTATICS */
1799             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1800
1801             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1802
1803             fscal            = felec;
1804
1805             fscal            = _mm_and_ps(fscal,cutoff_mask);
1806
1807             /* Calculate temporary vectorial force */
1808             tx               = _mm_mul_ps(fscal,dx23);
1809             ty               = _mm_mul_ps(fscal,dy23);
1810             tz               = _mm_mul_ps(fscal,dz23);
1811
1812             /* Update vectorial force */
1813             fix2             = _mm_add_ps(fix2,tx);
1814             fiy2             = _mm_add_ps(fiy2,ty);
1815             fiz2             = _mm_add_ps(fiz2,tz);
1816
1817             fjx3             = _mm_add_ps(fjx3,tx);
1818             fjy3             = _mm_add_ps(fjy3,ty);
1819             fjz3             = _mm_add_ps(fjz3,tz);
1820
1821             }
1822
1823             /**************************
1824              * CALCULATE INTERACTIONS *
1825              **************************/
1826
1827             if (gmx_mm_any_lt(rsq31,rcutoff2))
1828             {
1829
1830             /* REACTION-FIELD ELECTROSTATICS */
1831             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1832
1833             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1834
1835             fscal            = felec;
1836
1837             fscal            = _mm_and_ps(fscal,cutoff_mask);
1838
1839             /* Calculate temporary vectorial force */
1840             tx               = _mm_mul_ps(fscal,dx31);
1841             ty               = _mm_mul_ps(fscal,dy31);
1842             tz               = _mm_mul_ps(fscal,dz31);
1843
1844             /* Update vectorial force */
1845             fix3             = _mm_add_ps(fix3,tx);
1846             fiy3             = _mm_add_ps(fiy3,ty);
1847             fiz3             = _mm_add_ps(fiz3,tz);
1848
1849             fjx1             = _mm_add_ps(fjx1,tx);
1850             fjy1             = _mm_add_ps(fjy1,ty);
1851             fjz1             = _mm_add_ps(fjz1,tz);
1852
1853             }
1854
1855             /**************************
1856              * CALCULATE INTERACTIONS *
1857              **************************/
1858
1859             if (gmx_mm_any_lt(rsq32,rcutoff2))
1860             {
1861
1862             /* REACTION-FIELD ELECTROSTATICS */
1863             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1864
1865             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1866
1867             fscal            = felec;
1868
1869             fscal            = _mm_and_ps(fscal,cutoff_mask);
1870
1871             /* Calculate temporary vectorial force */
1872             tx               = _mm_mul_ps(fscal,dx32);
1873             ty               = _mm_mul_ps(fscal,dy32);
1874             tz               = _mm_mul_ps(fscal,dz32);
1875
1876             /* Update vectorial force */
1877             fix3             = _mm_add_ps(fix3,tx);
1878             fiy3             = _mm_add_ps(fiy3,ty);
1879             fiz3             = _mm_add_ps(fiz3,tz);
1880
1881             fjx2             = _mm_add_ps(fjx2,tx);
1882             fjy2             = _mm_add_ps(fjy2,ty);
1883             fjz2             = _mm_add_ps(fjz2,tz);
1884
1885             }
1886
1887             /**************************
1888              * CALCULATE INTERACTIONS *
1889              **************************/
1890
1891             if (gmx_mm_any_lt(rsq33,rcutoff2))
1892             {
1893
1894             /* REACTION-FIELD ELECTROSTATICS */
1895             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1896
1897             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1898
1899             fscal            = felec;
1900
1901             fscal            = _mm_and_ps(fscal,cutoff_mask);
1902
1903             /* Calculate temporary vectorial force */
1904             tx               = _mm_mul_ps(fscal,dx33);
1905             ty               = _mm_mul_ps(fscal,dy33);
1906             tz               = _mm_mul_ps(fscal,dz33);
1907
1908             /* Update vectorial force */
1909             fix3             = _mm_add_ps(fix3,tx);
1910             fiy3             = _mm_add_ps(fiy3,ty);
1911             fiz3             = _mm_add_ps(fiz3,tz);
1912
1913             fjx3             = _mm_add_ps(fjx3,tx);
1914             fjy3             = _mm_add_ps(fjy3,ty);
1915             fjz3             = _mm_add_ps(fjz3,tz);
1916
1917             }
1918
1919             fjptrA             = f+j_coord_offsetA;
1920             fjptrB             = f+j_coord_offsetB;
1921             fjptrC             = f+j_coord_offsetC;
1922             fjptrD             = f+j_coord_offsetD;
1923
1924             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1925                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1926                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1927
1928             /* Inner loop uses 324 flops */
1929         }
1930
1931         if(jidx<j_index_end)
1932         {
1933
1934             /* Get j neighbor index, and coordinate index */
1935             jnrlistA         = jjnr[jidx];
1936             jnrlistB         = jjnr[jidx+1];
1937             jnrlistC         = jjnr[jidx+2];
1938             jnrlistD         = jjnr[jidx+3];
1939             /* Sign of each element will be negative for non-real atoms.
1940              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1941              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1942              */
1943             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1944             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1945             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1946             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1947             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1948             j_coord_offsetA  = DIM*jnrA;
1949             j_coord_offsetB  = DIM*jnrB;
1950             j_coord_offsetC  = DIM*jnrC;
1951             j_coord_offsetD  = DIM*jnrD;
1952
1953             /* load j atom coordinates */
1954             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1955                                               x+j_coord_offsetC,x+j_coord_offsetD,
1956                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1957                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1958
1959             /* Calculate displacement vector */
1960             dx00             = _mm_sub_ps(ix0,jx0);
1961             dy00             = _mm_sub_ps(iy0,jy0);
1962             dz00             = _mm_sub_ps(iz0,jz0);
1963             dx11             = _mm_sub_ps(ix1,jx1);
1964             dy11             = _mm_sub_ps(iy1,jy1);
1965             dz11             = _mm_sub_ps(iz1,jz1);
1966             dx12             = _mm_sub_ps(ix1,jx2);
1967             dy12             = _mm_sub_ps(iy1,jy2);
1968             dz12             = _mm_sub_ps(iz1,jz2);
1969             dx13             = _mm_sub_ps(ix1,jx3);
1970             dy13             = _mm_sub_ps(iy1,jy3);
1971             dz13             = _mm_sub_ps(iz1,jz3);
1972             dx21             = _mm_sub_ps(ix2,jx1);
1973             dy21             = _mm_sub_ps(iy2,jy1);
1974             dz21             = _mm_sub_ps(iz2,jz1);
1975             dx22             = _mm_sub_ps(ix2,jx2);
1976             dy22             = _mm_sub_ps(iy2,jy2);
1977             dz22             = _mm_sub_ps(iz2,jz2);
1978             dx23             = _mm_sub_ps(ix2,jx3);
1979             dy23             = _mm_sub_ps(iy2,jy3);
1980             dz23             = _mm_sub_ps(iz2,jz3);
1981             dx31             = _mm_sub_ps(ix3,jx1);
1982             dy31             = _mm_sub_ps(iy3,jy1);
1983             dz31             = _mm_sub_ps(iz3,jz1);
1984             dx32             = _mm_sub_ps(ix3,jx2);
1985             dy32             = _mm_sub_ps(iy3,jy2);
1986             dz32             = _mm_sub_ps(iz3,jz2);
1987             dx33             = _mm_sub_ps(ix3,jx3);
1988             dy33             = _mm_sub_ps(iy3,jy3);
1989             dz33             = _mm_sub_ps(iz3,jz3);
1990
1991             /* Calculate squared distance and things based on it */
1992             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1993             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1994             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1995             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1996             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1997             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1998             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1999             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2000             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2001             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2002
2003             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2004             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2005             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2006             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2007             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2008             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2009             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2010             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2011             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2012             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2013
2014             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2015             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2016             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2017             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2018             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2019             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2020             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2021             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2022             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2023
2024             fjx0             = _mm_setzero_ps();
2025             fjy0             = _mm_setzero_ps();
2026             fjz0             = _mm_setzero_ps();
2027             fjx1             = _mm_setzero_ps();
2028             fjy1             = _mm_setzero_ps();
2029             fjz1             = _mm_setzero_ps();
2030             fjx2             = _mm_setzero_ps();
2031             fjy2             = _mm_setzero_ps();
2032             fjz2             = _mm_setzero_ps();
2033             fjx3             = _mm_setzero_ps();
2034             fjy3             = _mm_setzero_ps();
2035             fjz3             = _mm_setzero_ps();
2036
2037             /**************************
2038              * CALCULATE INTERACTIONS *
2039              **************************/
2040
2041             if (gmx_mm_any_lt(rsq00,rcutoff2))
2042             {
2043
2044             r00              = _mm_mul_ps(rsq00,rinv00);
2045             r00              = _mm_andnot_ps(dummy_mask,r00);
2046
2047             /* Calculate table index by multiplying r with table scale and truncate to integer */
2048             rt               = _mm_mul_ps(r00,vftabscale);
2049             vfitab           = _mm_cvttps_epi32(rt);
2050             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2051             vfitab           = _mm_slli_epi32(vfitab,3);
2052
2053             /* CUBIC SPLINE TABLE DISPERSION */
2054             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2055             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2056             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2057             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2058             _MM_TRANSPOSE4_PS(Y,F,G,H);
2059             Heps             = _mm_mul_ps(vfeps,H);
2060             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2061             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2062             fvdw6            = _mm_mul_ps(c6_00,FF);
2063
2064             /* CUBIC SPLINE TABLE REPULSION */
2065             vfitab           = _mm_add_epi32(vfitab,ifour);
2066             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2067             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2068             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2069             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2070             _MM_TRANSPOSE4_PS(Y,F,G,H);
2071             Heps             = _mm_mul_ps(vfeps,H);
2072             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2073             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2074             fvdw12           = _mm_mul_ps(c12_00,FF);
2075             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2076
2077             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2078
2079             fscal            = fvdw;
2080
2081             fscal            = _mm_and_ps(fscal,cutoff_mask);
2082
2083             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2084
2085             /* Calculate temporary vectorial force */
2086             tx               = _mm_mul_ps(fscal,dx00);
2087             ty               = _mm_mul_ps(fscal,dy00);
2088             tz               = _mm_mul_ps(fscal,dz00);
2089
2090             /* Update vectorial force */
2091             fix0             = _mm_add_ps(fix0,tx);
2092             fiy0             = _mm_add_ps(fiy0,ty);
2093             fiz0             = _mm_add_ps(fiz0,tz);
2094
2095             fjx0             = _mm_add_ps(fjx0,tx);
2096             fjy0             = _mm_add_ps(fjy0,ty);
2097             fjz0             = _mm_add_ps(fjz0,tz);
2098
2099             }
2100
2101             /**************************
2102              * CALCULATE INTERACTIONS *
2103              **************************/
2104
2105             if (gmx_mm_any_lt(rsq11,rcutoff2))
2106             {
2107
2108             /* REACTION-FIELD ELECTROSTATICS */
2109             felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2110
2111             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2112
2113             fscal            = felec;
2114
2115             fscal            = _mm_and_ps(fscal,cutoff_mask);
2116
2117             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2118
2119             /* Calculate temporary vectorial force */
2120             tx               = _mm_mul_ps(fscal,dx11);
2121             ty               = _mm_mul_ps(fscal,dy11);
2122             tz               = _mm_mul_ps(fscal,dz11);
2123
2124             /* Update vectorial force */
2125             fix1             = _mm_add_ps(fix1,tx);
2126             fiy1             = _mm_add_ps(fiy1,ty);
2127             fiz1             = _mm_add_ps(fiz1,tz);
2128
2129             fjx1             = _mm_add_ps(fjx1,tx);
2130             fjy1             = _mm_add_ps(fjy1,ty);
2131             fjz1             = _mm_add_ps(fjz1,tz);
2132
2133             }
2134
2135             /**************************
2136              * CALCULATE INTERACTIONS *
2137              **************************/
2138
2139             if (gmx_mm_any_lt(rsq12,rcutoff2))
2140             {
2141
2142             /* REACTION-FIELD ELECTROSTATICS */
2143             felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2144
2145             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2146
2147             fscal            = felec;
2148
2149             fscal            = _mm_and_ps(fscal,cutoff_mask);
2150
2151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2152
2153             /* Calculate temporary vectorial force */
2154             tx               = _mm_mul_ps(fscal,dx12);
2155             ty               = _mm_mul_ps(fscal,dy12);
2156             tz               = _mm_mul_ps(fscal,dz12);
2157
2158             /* Update vectorial force */
2159             fix1             = _mm_add_ps(fix1,tx);
2160             fiy1             = _mm_add_ps(fiy1,ty);
2161             fiz1             = _mm_add_ps(fiz1,tz);
2162
2163             fjx2             = _mm_add_ps(fjx2,tx);
2164             fjy2             = _mm_add_ps(fjy2,ty);
2165             fjz2             = _mm_add_ps(fjz2,tz);
2166
2167             }
2168
2169             /**************************
2170              * CALCULATE INTERACTIONS *
2171              **************************/
2172
2173             if (gmx_mm_any_lt(rsq13,rcutoff2))
2174             {
2175
2176             /* REACTION-FIELD ELECTROSTATICS */
2177             felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2178
2179             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2180
2181             fscal            = felec;
2182
2183             fscal            = _mm_and_ps(fscal,cutoff_mask);
2184
2185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2186
2187             /* Calculate temporary vectorial force */
2188             tx               = _mm_mul_ps(fscal,dx13);
2189             ty               = _mm_mul_ps(fscal,dy13);
2190             tz               = _mm_mul_ps(fscal,dz13);
2191
2192             /* Update vectorial force */
2193             fix1             = _mm_add_ps(fix1,tx);
2194             fiy1             = _mm_add_ps(fiy1,ty);
2195             fiz1             = _mm_add_ps(fiz1,tz);
2196
2197             fjx3             = _mm_add_ps(fjx3,tx);
2198             fjy3             = _mm_add_ps(fjy3,ty);
2199             fjz3             = _mm_add_ps(fjz3,tz);
2200
2201             }
2202
2203             /**************************
2204              * CALCULATE INTERACTIONS *
2205              **************************/
2206
2207             if (gmx_mm_any_lt(rsq21,rcutoff2))
2208             {
2209
2210             /* REACTION-FIELD ELECTROSTATICS */
2211             felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2212
2213             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2214
2215             fscal            = felec;
2216
2217             fscal            = _mm_and_ps(fscal,cutoff_mask);
2218
2219             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2220
2221             /* Calculate temporary vectorial force */
2222             tx               = _mm_mul_ps(fscal,dx21);
2223             ty               = _mm_mul_ps(fscal,dy21);
2224             tz               = _mm_mul_ps(fscal,dz21);
2225
2226             /* Update vectorial force */
2227             fix2             = _mm_add_ps(fix2,tx);
2228             fiy2             = _mm_add_ps(fiy2,ty);
2229             fiz2             = _mm_add_ps(fiz2,tz);
2230
2231             fjx1             = _mm_add_ps(fjx1,tx);
2232             fjy1             = _mm_add_ps(fjy1,ty);
2233             fjz1             = _mm_add_ps(fjz1,tz);
2234
2235             }
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             if (gmx_mm_any_lt(rsq22,rcutoff2))
2242             {
2243
2244             /* REACTION-FIELD ELECTROSTATICS */
2245             felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2246
2247             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2248
2249             fscal            = felec;
2250
2251             fscal            = _mm_and_ps(fscal,cutoff_mask);
2252
2253             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2254
2255             /* Calculate temporary vectorial force */
2256             tx               = _mm_mul_ps(fscal,dx22);
2257             ty               = _mm_mul_ps(fscal,dy22);
2258             tz               = _mm_mul_ps(fscal,dz22);
2259
2260             /* Update vectorial force */
2261             fix2             = _mm_add_ps(fix2,tx);
2262             fiy2             = _mm_add_ps(fiy2,ty);
2263             fiz2             = _mm_add_ps(fiz2,tz);
2264
2265             fjx2             = _mm_add_ps(fjx2,tx);
2266             fjy2             = _mm_add_ps(fjy2,ty);
2267             fjz2             = _mm_add_ps(fjz2,tz);
2268
2269             }
2270
2271             /**************************
2272              * CALCULATE INTERACTIONS *
2273              **************************/
2274
2275             if (gmx_mm_any_lt(rsq23,rcutoff2))
2276             {
2277
2278             /* REACTION-FIELD ELECTROSTATICS */
2279             felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2280
2281             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2282
2283             fscal            = felec;
2284
2285             fscal            = _mm_and_ps(fscal,cutoff_mask);
2286
2287             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2288
2289             /* Calculate temporary vectorial force */
2290             tx               = _mm_mul_ps(fscal,dx23);
2291             ty               = _mm_mul_ps(fscal,dy23);
2292             tz               = _mm_mul_ps(fscal,dz23);
2293
2294             /* Update vectorial force */
2295             fix2             = _mm_add_ps(fix2,tx);
2296             fiy2             = _mm_add_ps(fiy2,ty);
2297             fiz2             = _mm_add_ps(fiz2,tz);
2298
2299             fjx3             = _mm_add_ps(fjx3,tx);
2300             fjy3             = _mm_add_ps(fjy3,ty);
2301             fjz3             = _mm_add_ps(fjz3,tz);
2302
2303             }
2304
2305             /**************************
2306              * CALCULATE INTERACTIONS *
2307              **************************/
2308
2309             if (gmx_mm_any_lt(rsq31,rcutoff2))
2310             {
2311
2312             /* REACTION-FIELD ELECTROSTATICS */
2313             felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2314
2315             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2316
2317             fscal            = felec;
2318
2319             fscal            = _mm_and_ps(fscal,cutoff_mask);
2320
2321             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2322
2323             /* Calculate temporary vectorial force */
2324             tx               = _mm_mul_ps(fscal,dx31);
2325             ty               = _mm_mul_ps(fscal,dy31);
2326             tz               = _mm_mul_ps(fscal,dz31);
2327
2328             /* Update vectorial force */
2329             fix3             = _mm_add_ps(fix3,tx);
2330             fiy3             = _mm_add_ps(fiy3,ty);
2331             fiz3             = _mm_add_ps(fiz3,tz);
2332
2333             fjx1             = _mm_add_ps(fjx1,tx);
2334             fjy1             = _mm_add_ps(fjy1,ty);
2335             fjz1             = _mm_add_ps(fjz1,tz);
2336
2337             }
2338
2339             /**************************
2340              * CALCULATE INTERACTIONS *
2341              **************************/
2342
2343             if (gmx_mm_any_lt(rsq32,rcutoff2))
2344             {
2345
2346             /* REACTION-FIELD ELECTROSTATICS */
2347             felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2348
2349             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2350
2351             fscal            = felec;
2352
2353             fscal            = _mm_and_ps(fscal,cutoff_mask);
2354
2355             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2356
2357             /* Calculate temporary vectorial force */
2358             tx               = _mm_mul_ps(fscal,dx32);
2359             ty               = _mm_mul_ps(fscal,dy32);
2360             tz               = _mm_mul_ps(fscal,dz32);
2361
2362             /* Update vectorial force */
2363             fix3             = _mm_add_ps(fix3,tx);
2364             fiy3             = _mm_add_ps(fiy3,ty);
2365             fiz3             = _mm_add_ps(fiz3,tz);
2366
2367             fjx2             = _mm_add_ps(fjx2,tx);
2368             fjy2             = _mm_add_ps(fjy2,ty);
2369             fjz2             = _mm_add_ps(fjz2,tz);
2370
2371             }
2372
2373             /**************************
2374              * CALCULATE INTERACTIONS *
2375              **************************/
2376
2377             if (gmx_mm_any_lt(rsq33,rcutoff2))
2378             {
2379
2380             /* REACTION-FIELD ELECTROSTATICS */
2381             felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2382
2383             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2384
2385             fscal            = felec;
2386
2387             fscal            = _mm_and_ps(fscal,cutoff_mask);
2388
2389             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2390
2391             /* Calculate temporary vectorial force */
2392             tx               = _mm_mul_ps(fscal,dx33);
2393             ty               = _mm_mul_ps(fscal,dy33);
2394             tz               = _mm_mul_ps(fscal,dz33);
2395
2396             /* Update vectorial force */
2397             fix3             = _mm_add_ps(fix3,tx);
2398             fiy3             = _mm_add_ps(fiy3,ty);
2399             fiz3             = _mm_add_ps(fiz3,tz);
2400
2401             fjx3             = _mm_add_ps(fjx3,tx);
2402             fjy3             = _mm_add_ps(fjy3,ty);
2403             fjz3             = _mm_add_ps(fjz3,tz);
2404
2405             }
2406
2407             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2408             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2409             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2410             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2411
2412             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2413                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2414                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2415
2416             /* Inner loop uses 325 flops */
2417         }
2418
2419         /* End of innermost loop */
2420
2421         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2422                                               f+i_coord_offset,fshift+i_shift_offset);
2423
2424         /* Increment number of inner iterations */
2425         inneriter                  += j_index_end - j_index_start;
2426
2427         /* Outer loop uses 24 flops */
2428     }
2429
2430     /* Increment number of outer iterations */
2431     outeriter        += nri;
2432
2433     /* Update outer/inner flops */
2434
2435     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);
2436 }