ab9ab509c455d11f3daa2b028baaaafb48046139
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_single
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            None
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     __m128i          vfitab;
107     __m128i          ifour       = _mm_set1_epi32(4);
108     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
109     real             *vftab;
110     __m128           dummy_mask,cutoff_mask;
111     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112     __m128           one     = _mm_set1_ps(1.0);
113     __m128           two     = _mm_set1_ps(2.0);
114     x                = xx[0];
115     f                = ff[0];
116
117     nri              = nlist->nri;
118     iinr             = nlist->iinr;
119     jindex           = nlist->jindex;
120     jjnr             = nlist->jjnr;
121     shiftidx         = nlist->shift;
122     gid              = nlist->gid;
123     shiftvec         = fr->shift_vec[0];
124     fshift           = fr->fshift[0];
125     facel            = _mm_set1_ps(fr->epsfac);
126     charge           = mdatoms->chargeA;
127
128     vftab            = kernel_data->table_elec->data;
129     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
134     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136
137     jq0              = _mm_set1_ps(charge[inr+0]);
138     jq1              = _mm_set1_ps(charge[inr+1]);
139     jq2              = _mm_set1_ps(charge[inr+2]);
140     qq00             = _mm_mul_ps(iq0,jq0);
141     qq01             = _mm_mul_ps(iq0,jq1);
142     qq02             = _mm_mul_ps(iq0,jq2);
143     qq10             = _mm_mul_ps(iq1,jq0);
144     qq11             = _mm_mul_ps(iq1,jq1);
145     qq12             = _mm_mul_ps(iq1,jq2);
146     qq20             = _mm_mul_ps(iq2,jq0);
147     qq21             = _mm_mul_ps(iq2,jq1);
148     qq22             = _mm_mul_ps(iq2,jq2);
149
150     /* Avoid stupid compiler warnings */
151     jnrA = jnrB = jnrC = jnrD = 0;
152     j_coord_offsetA = 0;
153     j_coord_offsetB = 0;
154     j_coord_offsetC = 0;
155     j_coord_offsetD = 0;
156
157     outeriter        = 0;
158     inneriter        = 0;
159
160     for(iidx=0;iidx<4*DIM;iidx++)
161     {
162         scratch[iidx] = 0.0;
163     }
164
165     /* Start outer loop over neighborlists */
166     for(iidx=0; iidx<nri; iidx++)
167     {
168         /* Load shift vector for this list */
169         i_shift_offset   = DIM*shiftidx[iidx];
170
171         /* Load limits for loop over neighbors */
172         j_index_start    = jindex[iidx];
173         j_index_end      = jindex[iidx+1];
174
175         /* Get outer coordinate index */
176         inr              = iinr[iidx];
177         i_coord_offset   = DIM*inr;
178
179         /* Load i particle coords and add shift vector */
180         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
181                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
182
183         fix0             = _mm_setzero_ps();
184         fiy0             = _mm_setzero_ps();
185         fiz0             = _mm_setzero_ps();
186         fix1             = _mm_setzero_ps();
187         fiy1             = _mm_setzero_ps();
188         fiz1             = _mm_setzero_ps();
189         fix2             = _mm_setzero_ps();
190         fiy2             = _mm_setzero_ps();
191         fiz2             = _mm_setzero_ps();
192
193         /* Reset potential sums */
194         velecsum         = _mm_setzero_ps();
195
196         /* Start inner kernel loop */
197         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
198         {
199
200             /* Get j neighbor index, and coordinate index */
201             jnrA             = jjnr[jidx];
202             jnrB             = jjnr[jidx+1];
203             jnrC             = jjnr[jidx+2];
204             jnrD             = jjnr[jidx+3];
205             j_coord_offsetA  = DIM*jnrA;
206             j_coord_offsetB  = DIM*jnrB;
207             j_coord_offsetC  = DIM*jnrC;
208             j_coord_offsetD  = DIM*jnrD;
209
210             /* load j atom coordinates */
211             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212                                               x+j_coord_offsetC,x+j_coord_offsetD,
213                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
214
215             /* Calculate displacement vector */
216             dx00             = _mm_sub_ps(ix0,jx0);
217             dy00             = _mm_sub_ps(iy0,jy0);
218             dz00             = _mm_sub_ps(iz0,jz0);
219             dx01             = _mm_sub_ps(ix0,jx1);
220             dy01             = _mm_sub_ps(iy0,jy1);
221             dz01             = _mm_sub_ps(iz0,jz1);
222             dx02             = _mm_sub_ps(ix0,jx2);
223             dy02             = _mm_sub_ps(iy0,jy2);
224             dz02             = _mm_sub_ps(iz0,jz2);
225             dx10             = _mm_sub_ps(ix1,jx0);
226             dy10             = _mm_sub_ps(iy1,jy0);
227             dz10             = _mm_sub_ps(iz1,jz0);
228             dx11             = _mm_sub_ps(ix1,jx1);
229             dy11             = _mm_sub_ps(iy1,jy1);
230             dz11             = _mm_sub_ps(iz1,jz1);
231             dx12             = _mm_sub_ps(ix1,jx2);
232             dy12             = _mm_sub_ps(iy1,jy2);
233             dz12             = _mm_sub_ps(iz1,jz2);
234             dx20             = _mm_sub_ps(ix2,jx0);
235             dy20             = _mm_sub_ps(iy2,jy0);
236             dz20             = _mm_sub_ps(iz2,jz0);
237             dx21             = _mm_sub_ps(ix2,jx1);
238             dy21             = _mm_sub_ps(iy2,jy1);
239             dz21             = _mm_sub_ps(iz2,jz1);
240             dx22             = _mm_sub_ps(ix2,jx2);
241             dy22             = _mm_sub_ps(iy2,jy2);
242             dz22             = _mm_sub_ps(iz2,jz2);
243
244             /* Calculate squared distance and things based on it */
245             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
246             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
247             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
248             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
249             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
250             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
251             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
252             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
253             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
254
255             rinv00           = gmx_mm_invsqrt_ps(rsq00);
256             rinv01           = gmx_mm_invsqrt_ps(rsq01);
257             rinv02           = gmx_mm_invsqrt_ps(rsq02);
258             rinv10           = gmx_mm_invsqrt_ps(rsq10);
259             rinv11           = gmx_mm_invsqrt_ps(rsq11);
260             rinv12           = gmx_mm_invsqrt_ps(rsq12);
261             rinv20           = gmx_mm_invsqrt_ps(rsq20);
262             rinv21           = gmx_mm_invsqrt_ps(rsq21);
263             rinv22           = gmx_mm_invsqrt_ps(rsq22);
264
265             fjx0             = _mm_setzero_ps();
266             fjy0             = _mm_setzero_ps();
267             fjz0             = _mm_setzero_ps();
268             fjx1             = _mm_setzero_ps();
269             fjy1             = _mm_setzero_ps();
270             fjz1             = _mm_setzero_ps();
271             fjx2             = _mm_setzero_ps();
272             fjy2             = _mm_setzero_ps();
273             fjz2             = _mm_setzero_ps();
274
275             /**************************
276              * CALCULATE INTERACTIONS *
277              **************************/
278
279             r00              = _mm_mul_ps(rsq00,rinv00);
280
281             /* Calculate table index by multiplying r with table scale and truncate to integer */
282             rt               = _mm_mul_ps(r00,vftabscale);
283             vfitab           = _mm_cvttps_epi32(rt);
284             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
285             vfitab           = _mm_slli_epi32(vfitab,2);
286
287             /* CUBIC SPLINE TABLE ELECTROSTATICS */
288             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
289             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
290             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
291             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
292             _MM_TRANSPOSE4_PS(Y,F,G,H);
293             Heps             = _mm_mul_ps(vfeps,H);
294             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
295             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
296             velec            = _mm_mul_ps(qq00,VV);
297             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
298             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
299
300             /* Update potential sum for this i atom from the interaction with this j atom. */
301             velecsum         = _mm_add_ps(velecsum,velec);
302
303             fscal            = felec;
304
305             /* Calculate temporary vectorial force */
306             tx               = _mm_mul_ps(fscal,dx00);
307             ty               = _mm_mul_ps(fscal,dy00);
308             tz               = _mm_mul_ps(fscal,dz00);
309
310             /* Update vectorial force */
311             fix0             = _mm_add_ps(fix0,tx);
312             fiy0             = _mm_add_ps(fiy0,ty);
313             fiz0             = _mm_add_ps(fiz0,tz);
314
315             fjx0             = _mm_add_ps(fjx0,tx);
316             fjy0             = _mm_add_ps(fjy0,ty);
317             fjz0             = _mm_add_ps(fjz0,tz);
318
319             /**************************
320              * CALCULATE INTERACTIONS *
321              **************************/
322
323             r01              = _mm_mul_ps(rsq01,rinv01);
324
325             /* Calculate table index by multiplying r with table scale and truncate to integer */
326             rt               = _mm_mul_ps(r01,vftabscale);
327             vfitab           = _mm_cvttps_epi32(rt);
328             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
329             vfitab           = _mm_slli_epi32(vfitab,2);
330
331             /* CUBIC SPLINE TABLE ELECTROSTATICS */
332             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
333             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
334             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
335             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
336             _MM_TRANSPOSE4_PS(Y,F,G,H);
337             Heps             = _mm_mul_ps(vfeps,H);
338             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
339             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
340             velec            = _mm_mul_ps(qq01,VV);
341             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
342             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
343
344             /* Update potential sum for this i atom from the interaction with this j atom. */
345             velecsum         = _mm_add_ps(velecsum,velec);
346
347             fscal            = felec;
348
349             /* Calculate temporary vectorial force */
350             tx               = _mm_mul_ps(fscal,dx01);
351             ty               = _mm_mul_ps(fscal,dy01);
352             tz               = _mm_mul_ps(fscal,dz01);
353
354             /* Update vectorial force */
355             fix0             = _mm_add_ps(fix0,tx);
356             fiy0             = _mm_add_ps(fiy0,ty);
357             fiz0             = _mm_add_ps(fiz0,tz);
358
359             fjx1             = _mm_add_ps(fjx1,tx);
360             fjy1             = _mm_add_ps(fjy1,ty);
361             fjz1             = _mm_add_ps(fjz1,tz);
362
363             /**************************
364              * CALCULATE INTERACTIONS *
365              **************************/
366
367             r02              = _mm_mul_ps(rsq02,rinv02);
368
369             /* Calculate table index by multiplying r with table scale and truncate to integer */
370             rt               = _mm_mul_ps(r02,vftabscale);
371             vfitab           = _mm_cvttps_epi32(rt);
372             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
373             vfitab           = _mm_slli_epi32(vfitab,2);
374
375             /* CUBIC SPLINE TABLE ELECTROSTATICS */
376             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
377             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
378             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
379             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
380             _MM_TRANSPOSE4_PS(Y,F,G,H);
381             Heps             = _mm_mul_ps(vfeps,H);
382             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
383             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
384             velec            = _mm_mul_ps(qq02,VV);
385             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
386             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
387
388             /* Update potential sum for this i atom from the interaction with this j atom. */
389             velecsum         = _mm_add_ps(velecsum,velec);
390
391             fscal            = felec;
392
393             /* Calculate temporary vectorial force */
394             tx               = _mm_mul_ps(fscal,dx02);
395             ty               = _mm_mul_ps(fscal,dy02);
396             tz               = _mm_mul_ps(fscal,dz02);
397
398             /* Update vectorial force */
399             fix0             = _mm_add_ps(fix0,tx);
400             fiy0             = _mm_add_ps(fiy0,ty);
401             fiz0             = _mm_add_ps(fiz0,tz);
402
403             fjx2             = _mm_add_ps(fjx2,tx);
404             fjy2             = _mm_add_ps(fjy2,ty);
405             fjz2             = _mm_add_ps(fjz2,tz);
406
407             /**************************
408              * CALCULATE INTERACTIONS *
409              **************************/
410
411             r10              = _mm_mul_ps(rsq10,rinv10);
412
413             /* Calculate table index by multiplying r with table scale and truncate to integer */
414             rt               = _mm_mul_ps(r10,vftabscale);
415             vfitab           = _mm_cvttps_epi32(rt);
416             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
417             vfitab           = _mm_slli_epi32(vfitab,2);
418
419             /* CUBIC SPLINE TABLE ELECTROSTATICS */
420             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
421             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
422             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
423             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
424             _MM_TRANSPOSE4_PS(Y,F,G,H);
425             Heps             = _mm_mul_ps(vfeps,H);
426             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
427             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
428             velec            = _mm_mul_ps(qq10,VV);
429             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
430             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
431
432             /* Update potential sum for this i atom from the interaction with this j atom. */
433             velecsum         = _mm_add_ps(velecsum,velec);
434
435             fscal            = felec;
436
437             /* Calculate temporary vectorial force */
438             tx               = _mm_mul_ps(fscal,dx10);
439             ty               = _mm_mul_ps(fscal,dy10);
440             tz               = _mm_mul_ps(fscal,dz10);
441
442             /* Update vectorial force */
443             fix1             = _mm_add_ps(fix1,tx);
444             fiy1             = _mm_add_ps(fiy1,ty);
445             fiz1             = _mm_add_ps(fiz1,tz);
446
447             fjx0             = _mm_add_ps(fjx0,tx);
448             fjy0             = _mm_add_ps(fjy0,ty);
449             fjz0             = _mm_add_ps(fjz0,tz);
450
451             /**************************
452              * CALCULATE INTERACTIONS *
453              **************************/
454
455             r11              = _mm_mul_ps(rsq11,rinv11);
456
457             /* Calculate table index by multiplying r with table scale and truncate to integer */
458             rt               = _mm_mul_ps(r11,vftabscale);
459             vfitab           = _mm_cvttps_epi32(rt);
460             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
461             vfitab           = _mm_slli_epi32(vfitab,2);
462
463             /* CUBIC SPLINE TABLE ELECTROSTATICS */
464             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
465             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
466             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
467             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
468             _MM_TRANSPOSE4_PS(Y,F,G,H);
469             Heps             = _mm_mul_ps(vfeps,H);
470             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
471             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
472             velec            = _mm_mul_ps(qq11,VV);
473             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
474             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
475
476             /* Update potential sum for this i atom from the interaction with this j atom. */
477             velecsum         = _mm_add_ps(velecsum,velec);
478
479             fscal            = felec;
480
481             /* Calculate temporary vectorial force */
482             tx               = _mm_mul_ps(fscal,dx11);
483             ty               = _mm_mul_ps(fscal,dy11);
484             tz               = _mm_mul_ps(fscal,dz11);
485
486             /* Update vectorial force */
487             fix1             = _mm_add_ps(fix1,tx);
488             fiy1             = _mm_add_ps(fiy1,ty);
489             fiz1             = _mm_add_ps(fiz1,tz);
490
491             fjx1             = _mm_add_ps(fjx1,tx);
492             fjy1             = _mm_add_ps(fjy1,ty);
493             fjz1             = _mm_add_ps(fjz1,tz);
494
495             /**************************
496              * CALCULATE INTERACTIONS *
497              **************************/
498
499             r12              = _mm_mul_ps(rsq12,rinv12);
500
501             /* Calculate table index by multiplying r with table scale and truncate to integer */
502             rt               = _mm_mul_ps(r12,vftabscale);
503             vfitab           = _mm_cvttps_epi32(rt);
504             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
505             vfitab           = _mm_slli_epi32(vfitab,2);
506
507             /* CUBIC SPLINE TABLE ELECTROSTATICS */
508             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
509             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
510             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
511             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
512             _MM_TRANSPOSE4_PS(Y,F,G,H);
513             Heps             = _mm_mul_ps(vfeps,H);
514             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
515             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
516             velec            = _mm_mul_ps(qq12,VV);
517             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
518             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
519
520             /* Update potential sum for this i atom from the interaction with this j atom. */
521             velecsum         = _mm_add_ps(velecsum,velec);
522
523             fscal            = felec;
524
525             /* Calculate temporary vectorial force */
526             tx               = _mm_mul_ps(fscal,dx12);
527             ty               = _mm_mul_ps(fscal,dy12);
528             tz               = _mm_mul_ps(fscal,dz12);
529
530             /* Update vectorial force */
531             fix1             = _mm_add_ps(fix1,tx);
532             fiy1             = _mm_add_ps(fiy1,ty);
533             fiz1             = _mm_add_ps(fiz1,tz);
534
535             fjx2             = _mm_add_ps(fjx2,tx);
536             fjy2             = _mm_add_ps(fjy2,ty);
537             fjz2             = _mm_add_ps(fjz2,tz);
538
539             /**************************
540              * CALCULATE INTERACTIONS *
541              **************************/
542
543             r20              = _mm_mul_ps(rsq20,rinv20);
544
545             /* Calculate table index by multiplying r with table scale and truncate to integer */
546             rt               = _mm_mul_ps(r20,vftabscale);
547             vfitab           = _mm_cvttps_epi32(rt);
548             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
549             vfitab           = _mm_slli_epi32(vfitab,2);
550
551             /* CUBIC SPLINE TABLE ELECTROSTATICS */
552             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
553             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
554             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
555             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
556             _MM_TRANSPOSE4_PS(Y,F,G,H);
557             Heps             = _mm_mul_ps(vfeps,H);
558             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
559             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
560             velec            = _mm_mul_ps(qq20,VV);
561             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
562             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
563
564             /* Update potential sum for this i atom from the interaction with this j atom. */
565             velecsum         = _mm_add_ps(velecsum,velec);
566
567             fscal            = felec;
568
569             /* Calculate temporary vectorial force */
570             tx               = _mm_mul_ps(fscal,dx20);
571             ty               = _mm_mul_ps(fscal,dy20);
572             tz               = _mm_mul_ps(fscal,dz20);
573
574             /* Update vectorial force */
575             fix2             = _mm_add_ps(fix2,tx);
576             fiy2             = _mm_add_ps(fiy2,ty);
577             fiz2             = _mm_add_ps(fiz2,tz);
578
579             fjx0             = _mm_add_ps(fjx0,tx);
580             fjy0             = _mm_add_ps(fjy0,ty);
581             fjz0             = _mm_add_ps(fjz0,tz);
582
583             /**************************
584              * CALCULATE INTERACTIONS *
585              **************************/
586
587             r21              = _mm_mul_ps(rsq21,rinv21);
588
589             /* Calculate table index by multiplying r with table scale and truncate to integer */
590             rt               = _mm_mul_ps(r21,vftabscale);
591             vfitab           = _mm_cvttps_epi32(rt);
592             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
593             vfitab           = _mm_slli_epi32(vfitab,2);
594
595             /* CUBIC SPLINE TABLE ELECTROSTATICS */
596             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
597             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
598             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
599             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
600             _MM_TRANSPOSE4_PS(Y,F,G,H);
601             Heps             = _mm_mul_ps(vfeps,H);
602             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
603             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
604             velec            = _mm_mul_ps(qq21,VV);
605             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
606             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
607
608             /* Update potential sum for this i atom from the interaction with this j atom. */
609             velecsum         = _mm_add_ps(velecsum,velec);
610
611             fscal            = felec;
612
613             /* Calculate temporary vectorial force */
614             tx               = _mm_mul_ps(fscal,dx21);
615             ty               = _mm_mul_ps(fscal,dy21);
616             tz               = _mm_mul_ps(fscal,dz21);
617
618             /* Update vectorial force */
619             fix2             = _mm_add_ps(fix2,tx);
620             fiy2             = _mm_add_ps(fiy2,ty);
621             fiz2             = _mm_add_ps(fiz2,tz);
622
623             fjx1             = _mm_add_ps(fjx1,tx);
624             fjy1             = _mm_add_ps(fjy1,ty);
625             fjz1             = _mm_add_ps(fjz1,tz);
626
627             /**************************
628              * CALCULATE INTERACTIONS *
629              **************************/
630
631             r22              = _mm_mul_ps(rsq22,rinv22);
632
633             /* Calculate table index by multiplying r with table scale and truncate to integer */
634             rt               = _mm_mul_ps(r22,vftabscale);
635             vfitab           = _mm_cvttps_epi32(rt);
636             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
637             vfitab           = _mm_slli_epi32(vfitab,2);
638
639             /* CUBIC SPLINE TABLE ELECTROSTATICS */
640             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
641             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
642             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
643             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
644             _MM_TRANSPOSE4_PS(Y,F,G,H);
645             Heps             = _mm_mul_ps(vfeps,H);
646             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
647             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
648             velec            = _mm_mul_ps(qq22,VV);
649             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
650             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
651
652             /* Update potential sum for this i atom from the interaction with this j atom. */
653             velecsum         = _mm_add_ps(velecsum,velec);
654
655             fscal            = felec;
656
657             /* Calculate temporary vectorial force */
658             tx               = _mm_mul_ps(fscal,dx22);
659             ty               = _mm_mul_ps(fscal,dy22);
660             tz               = _mm_mul_ps(fscal,dz22);
661
662             /* Update vectorial force */
663             fix2             = _mm_add_ps(fix2,tx);
664             fiy2             = _mm_add_ps(fiy2,ty);
665             fiz2             = _mm_add_ps(fiz2,tz);
666
667             fjx2             = _mm_add_ps(fjx2,tx);
668             fjy2             = _mm_add_ps(fjy2,ty);
669             fjz2             = _mm_add_ps(fjz2,tz);
670
671             fjptrA             = f+j_coord_offsetA;
672             fjptrB             = f+j_coord_offsetB;
673             fjptrC             = f+j_coord_offsetC;
674             fjptrD             = f+j_coord_offsetD;
675
676             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
677                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
678
679             /* Inner loop uses 387 flops */
680         }
681
682         if(jidx<j_index_end)
683         {
684
685             /* Get j neighbor index, and coordinate index */
686             jnrlistA         = jjnr[jidx];
687             jnrlistB         = jjnr[jidx+1];
688             jnrlistC         = jjnr[jidx+2];
689             jnrlistD         = jjnr[jidx+3];
690             /* Sign of each element will be negative for non-real atoms.
691              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
692              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
693              */
694             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
695             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
696             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
697             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
698             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
699             j_coord_offsetA  = DIM*jnrA;
700             j_coord_offsetB  = DIM*jnrB;
701             j_coord_offsetC  = DIM*jnrC;
702             j_coord_offsetD  = DIM*jnrD;
703
704             /* load j atom coordinates */
705             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
706                                               x+j_coord_offsetC,x+j_coord_offsetD,
707                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
708
709             /* Calculate displacement vector */
710             dx00             = _mm_sub_ps(ix0,jx0);
711             dy00             = _mm_sub_ps(iy0,jy0);
712             dz00             = _mm_sub_ps(iz0,jz0);
713             dx01             = _mm_sub_ps(ix0,jx1);
714             dy01             = _mm_sub_ps(iy0,jy1);
715             dz01             = _mm_sub_ps(iz0,jz1);
716             dx02             = _mm_sub_ps(ix0,jx2);
717             dy02             = _mm_sub_ps(iy0,jy2);
718             dz02             = _mm_sub_ps(iz0,jz2);
719             dx10             = _mm_sub_ps(ix1,jx0);
720             dy10             = _mm_sub_ps(iy1,jy0);
721             dz10             = _mm_sub_ps(iz1,jz0);
722             dx11             = _mm_sub_ps(ix1,jx1);
723             dy11             = _mm_sub_ps(iy1,jy1);
724             dz11             = _mm_sub_ps(iz1,jz1);
725             dx12             = _mm_sub_ps(ix1,jx2);
726             dy12             = _mm_sub_ps(iy1,jy2);
727             dz12             = _mm_sub_ps(iz1,jz2);
728             dx20             = _mm_sub_ps(ix2,jx0);
729             dy20             = _mm_sub_ps(iy2,jy0);
730             dz20             = _mm_sub_ps(iz2,jz0);
731             dx21             = _mm_sub_ps(ix2,jx1);
732             dy21             = _mm_sub_ps(iy2,jy1);
733             dz21             = _mm_sub_ps(iz2,jz1);
734             dx22             = _mm_sub_ps(ix2,jx2);
735             dy22             = _mm_sub_ps(iy2,jy2);
736             dz22             = _mm_sub_ps(iz2,jz2);
737
738             /* Calculate squared distance and things based on it */
739             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
740             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
741             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
742             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
743             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
744             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
745             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
746             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
747             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
748
749             rinv00           = gmx_mm_invsqrt_ps(rsq00);
750             rinv01           = gmx_mm_invsqrt_ps(rsq01);
751             rinv02           = gmx_mm_invsqrt_ps(rsq02);
752             rinv10           = gmx_mm_invsqrt_ps(rsq10);
753             rinv11           = gmx_mm_invsqrt_ps(rsq11);
754             rinv12           = gmx_mm_invsqrt_ps(rsq12);
755             rinv20           = gmx_mm_invsqrt_ps(rsq20);
756             rinv21           = gmx_mm_invsqrt_ps(rsq21);
757             rinv22           = gmx_mm_invsqrt_ps(rsq22);
758
759             fjx0             = _mm_setzero_ps();
760             fjy0             = _mm_setzero_ps();
761             fjz0             = _mm_setzero_ps();
762             fjx1             = _mm_setzero_ps();
763             fjy1             = _mm_setzero_ps();
764             fjz1             = _mm_setzero_ps();
765             fjx2             = _mm_setzero_ps();
766             fjy2             = _mm_setzero_ps();
767             fjz2             = _mm_setzero_ps();
768
769             /**************************
770              * CALCULATE INTERACTIONS *
771              **************************/
772
773             r00              = _mm_mul_ps(rsq00,rinv00);
774             r00              = _mm_andnot_ps(dummy_mask,r00);
775
776             /* Calculate table index by multiplying r with table scale and truncate to integer */
777             rt               = _mm_mul_ps(r00,vftabscale);
778             vfitab           = _mm_cvttps_epi32(rt);
779             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
780             vfitab           = _mm_slli_epi32(vfitab,2);
781
782             /* CUBIC SPLINE TABLE ELECTROSTATICS */
783             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
784             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
785             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
786             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
787             _MM_TRANSPOSE4_PS(Y,F,G,H);
788             Heps             = _mm_mul_ps(vfeps,H);
789             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
790             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
791             velec            = _mm_mul_ps(qq00,VV);
792             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
793             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
794
795             /* Update potential sum for this i atom from the interaction with this j atom. */
796             velec            = _mm_andnot_ps(dummy_mask,velec);
797             velecsum         = _mm_add_ps(velecsum,velec);
798
799             fscal            = felec;
800
801             fscal            = _mm_andnot_ps(dummy_mask,fscal);
802
803             /* Calculate temporary vectorial force */
804             tx               = _mm_mul_ps(fscal,dx00);
805             ty               = _mm_mul_ps(fscal,dy00);
806             tz               = _mm_mul_ps(fscal,dz00);
807
808             /* Update vectorial force */
809             fix0             = _mm_add_ps(fix0,tx);
810             fiy0             = _mm_add_ps(fiy0,ty);
811             fiz0             = _mm_add_ps(fiz0,tz);
812
813             fjx0             = _mm_add_ps(fjx0,tx);
814             fjy0             = _mm_add_ps(fjy0,ty);
815             fjz0             = _mm_add_ps(fjz0,tz);
816
817             /**************************
818              * CALCULATE INTERACTIONS *
819              **************************/
820
821             r01              = _mm_mul_ps(rsq01,rinv01);
822             r01              = _mm_andnot_ps(dummy_mask,r01);
823
824             /* Calculate table index by multiplying r with table scale and truncate to integer */
825             rt               = _mm_mul_ps(r01,vftabscale);
826             vfitab           = _mm_cvttps_epi32(rt);
827             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
828             vfitab           = _mm_slli_epi32(vfitab,2);
829
830             /* CUBIC SPLINE TABLE ELECTROSTATICS */
831             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
832             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
833             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
834             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
835             _MM_TRANSPOSE4_PS(Y,F,G,H);
836             Heps             = _mm_mul_ps(vfeps,H);
837             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
838             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
839             velec            = _mm_mul_ps(qq01,VV);
840             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
841             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
842
843             /* Update potential sum for this i atom from the interaction with this j atom. */
844             velec            = _mm_andnot_ps(dummy_mask,velec);
845             velecsum         = _mm_add_ps(velecsum,velec);
846
847             fscal            = felec;
848
849             fscal            = _mm_andnot_ps(dummy_mask,fscal);
850
851             /* Calculate temporary vectorial force */
852             tx               = _mm_mul_ps(fscal,dx01);
853             ty               = _mm_mul_ps(fscal,dy01);
854             tz               = _mm_mul_ps(fscal,dz01);
855
856             /* Update vectorial force */
857             fix0             = _mm_add_ps(fix0,tx);
858             fiy0             = _mm_add_ps(fiy0,ty);
859             fiz0             = _mm_add_ps(fiz0,tz);
860
861             fjx1             = _mm_add_ps(fjx1,tx);
862             fjy1             = _mm_add_ps(fjy1,ty);
863             fjz1             = _mm_add_ps(fjz1,tz);
864
865             /**************************
866              * CALCULATE INTERACTIONS *
867              **************************/
868
869             r02              = _mm_mul_ps(rsq02,rinv02);
870             r02              = _mm_andnot_ps(dummy_mask,r02);
871
872             /* Calculate table index by multiplying r with table scale and truncate to integer */
873             rt               = _mm_mul_ps(r02,vftabscale);
874             vfitab           = _mm_cvttps_epi32(rt);
875             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
876             vfitab           = _mm_slli_epi32(vfitab,2);
877
878             /* CUBIC SPLINE TABLE ELECTROSTATICS */
879             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
880             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
881             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
882             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
883             _MM_TRANSPOSE4_PS(Y,F,G,H);
884             Heps             = _mm_mul_ps(vfeps,H);
885             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
886             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
887             velec            = _mm_mul_ps(qq02,VV);
888             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
889             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
890
891             /* Update potential sum for this i atom from the interaction with this j atom. */
892             velec            = _mm_andnot_ps(dummy_mask,velec);
893             velecsum         = _mm_add_ps(velecsum,velec);
894
895             fscal            = felec;
896
897             fscal            = _mm_andnot_ps(dummy_mask,fscal);
898
899             /* Calculate temporary vectorial force */
900             tx               = _mm_mul_ps(fscal,dx02);
901             ty               = _mm_mul_ps(fscal,dy02);
902             tz               = _mm_mul_ps(fscal,dz02);
903
904             /* Update vectorial force */
905             fix0             = _mm_add_ps(fix0,tx);
906             fiy0             = _mm_add_ps(fiy0,ty);
907             fiz0             = _mm_add_ps(fiz0,tz);
908
909             fjx2             = _mm_add_ps(fjx2,tx);
910             fjy2             = _mm_add_ps(fjy2,ty);
911             fjz2             = _mm_add_ps(fjz2,tz);
912
913             /**************************
914              * CALCULATE INTERACTIONS *
915              **************************/
916
917             r10              = _mm_mul_ps(rsq10,rinv10);
918             r10              = _mm_andnot_ps(dummy_mask,r10);
919
920             /* Calculate table index by multiplying r with table scale and truncate to integer */
921             rt               = _mm_mul_ps(r10,vftabscale);
922             vfitab           = _mm_cvttps_epi32(rt);
923             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
924             vfitab           = _mm_slli_epi32(vfitab,2);
925
926             /* CUBIC SPLINE TABLE ELECTROSTATICS */
927             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
928             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
929             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
930             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
931             _MM_TRANSPOSE4_PS(Y,F,G,H);
932             Heps             = _mm_mul_ps(vfeps,H);
933             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
934             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
935             velec            = _mm_mul_ps(qq10,VV);
936             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
937             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
938
939             /* Update potential sum for this i atom from the interaction with this j atom. */
940             velec            = _mm_andnot_ps(dummy_mask,velec);
941             velecsum         = _mm_add_ps(velecsum,velec);
942
943             fscal            = felec;
944
945             fscal            = _mm_andnot_ps(dummy_mask,fscal);
946
947             /* Calculate temporary vectorial force */
948             tx               = _mm_mul_ps(fscal,dx10);
949             ty               = _mm_mul_ps(fscal,dy10);
950             tz               = _mm_mul_ps(fscal,dz10);
951
952             /* Update vectorial force */
953             fix1             = _mm_add_ps(fix1,tx);
954             fiy1             = _mm_add_ps(fiy1,ty);
955             fiz1             = _mm_add_ps(fiz1,tz);
956
957             fjx0             = _mm_add_ps(fjx0,tx);
958             fjy0             = _mm_add_ps(fjy0,ty);
959             fjz0             = _mm_add_ps(fjz0,tz);
960
961             /**************************
962              * CALCULATE INTERACTIONS *
963              **************************/
964
965             r11              = _mm_mul_ps(rsq11,rinv11);
966             r11              = _mm_andnot_ps(dummy_mask,r11);
967
968             /* Calculate table index by multiplying r with table scale and truncate to integer */
969             rt               = _mm_mul_ps(r11,vftabscale);
970             vfitab           = _mm_cvttps_epi32(rt);
971             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
972             vfitab           = _mm_slli_epi32(vfitab,2);
973
974             /* CUBIC SPLINE TABLE ELECTROSTATICS */
975             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
976             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
977             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
978             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
979             _MM_TRANSPOSE4_PS(Y,F,G,H);
980             Heps             = _mm_mul_ps(vfeps,H);
981             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
982             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
983             velec            = _mm_mul_ps(qq11,VV);
984             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
985             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
986
987             /* Update potential sum for this i atom from the interaction with this j atom. */
988             velec            = _mm_andnot_ps(dummy_mask,velec);
989             velecsum         = _mm_add_ps(velecsum,velec);
990
991             fscal            = felec;
992
993             fscal            = _mm_andnot_ps(dummy_mask,fscal);
994
995             /* Calculate temporary vectorial force */
996             tx               = _mm_mul_ps(fscal,dx11);
997             ty               = _mm_mul_ps(fscal,dy11);
998             tz               = _mm_mul_ps(fscal,dz11);
999
1000             /* Update vectorial force */
1001             fix1             = _mm_add_ps(fix1,tx);
1002             fiy1             = _mm_add_ps(fiy1,ty);
1003             fiz1             = _mm_add_ps(fiz1,tz);
1004
1005             fjx1             = _mm_add_ps(fjx1,tx);
1006             fjy1             = _mm_add_ps(fjy1,ty);
1007             fjz1             = _mm_add_ps(fjz1,tz);
1008
1009             /**************************
1010              * CALCULATE INTERACTIONS *
1011              **************************/
1012
1013             r12              = _mm_mul_ps(rsq12,rinv12);
1014             r12              = _mm_andnot_ps(dummy_mask,r12);
1015
1016             /* Calculate table index by multiplying r with table scale and truncate to integer */
1017             rt               = _mm_mul_ps(r12,vftabscale);
1018             vfitab           = _mm_cvttps_epi32(rt);
1019             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1020             vfitab           = _mm_slli_epi32(vfitab,2);
1021
1022             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1023             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1024             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1025             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1026             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1027             _MM_TRANSPOSE4_PS(Y,F,G,H);
1028             Heps             = _mm_mul_ps(vfeps,H);
1029             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1030             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1031             velec            = _mm_mul_ps(qq12,VV);
1032             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1033             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1034
1035             /* Update potential sum for this i atom from the interaction with this j atom. */
1036             velec            = _mm_andnot_ps(dummy_mask,velec);
1037             velecsum         = _mm_add_ps(velecsum,velec);
1038
1039             fscal            = felec;
1040
1041             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1042
1043             /* Calculate temporary vectorial force */
1044             tx               = _mm_mul_ps(fscal,dx12);
1045             ty               = _mm_mul_ps(fscal,dy12);
1046             tz               = _mm_mul_ps(fscal,dz12);
1047
1048             /* Update vectorial force */
1049             fix1             = _mm_add_ps(fix1,tx);
1050             fiy1             = _mm_add_ps(fiy1,ty);
1051             fiz1             = _mm_add_ps(fiz1,tz);
1052
1053             fjx2             = _mm_add_ps(fjx2,tx);
1054             fjy2             = _mm_add_ps(fjy2,ty);
1055             fjz2             = _mm_add_ps(fjz2,tz);
1056
1057             /**************************
1058              * CALCULATE INTERACTIONS *
1059              **************************/
1060
1061             r20              = _mm_mul_ps(rsq20,rinv20);
1062             r20              = _mm_andnot_ps(dummy_mask,r20);
1063
1064             /* Calculate table index by multiplying r with table scale and truncate to integer */
1065             rt               = _mm_mul_ps(r20,vftabscale);
1066             vfitab           = _mm_cvttps_epi32(rt);
1067             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1068             vfitab           = _mm_slli_epi32(vfitab,2);
1069
1070             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1071             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1072             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1073             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1074             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1075             _MM_TRANSPOSE4_PS(Y,F,G,H);
1076             Heps             = _mm_mul_ps(vfeps,H);
1077             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1078             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1079             velec            = _mm_mul_ps(qq20,VV);
1080             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1081             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1082
1083             /* Update potential sum for this i atom from the interaction with this j atom. */
1084             velec            = _mm_andnot_ps(dummy_mask,velec);
1085             velecsum         = _mm_add_ps(velecsum,velec);
1086
1087             fscal            = felec;
1088
1089             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1090
1091             /* Calculate temporary vectorial force */
1092             tx               = _mm_mul_ps(fscal,dx20);
1093             ty               = _mm_mul_ps(fscal,dy20);
1094             tz               = _mm_mul_ps(fscal,dz20);
1095
1096             /* Update vectorial force */
1097             fix2             = _mm_add_ps(fix2,tx);
1098             fiy2             = _mm_add_ps(fiy2,ty);
1099             fiz2             = _mm_add_ps(fiz2,tz);
1100
1101             fjx0             = _mm_add_ps(fjx0,tx);
1102             fjy0             = _mm_add_ps(fjy0,ty);
1103             fjz0             = _mm_add_ps(fjz0,tz);
1104
1105             /**************************
1106              * CALCULATE INTERACTIONS *
1107              **************************/
1108
1109             r21              = _mm_mul_ps(rsq21,rinv21);
1110             r21              = _mm_andnot_ps(dummy_mask,r21);
1111
1112             /* Calculate table index by multiplying r with table scale and truncate to integer */
1113             rt               = _mm_mul_ps(r21,vftabscale);
1114             vfitab           = _mm_cvttps_epi32(rt);
1115             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1116             vfitab           = _mm_slli_epi32(vfitab,2);
1117
1118             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1119             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1120             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1121             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1122             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1123             _MM_TRANSPOSE4_PS(Y,F,G,H);
1124             Heps             = _mm_mul_ps(vfeps,H);
1125             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1126             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1127             velec            = _mm_mul_ps(qq21,VV);
1128             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1129             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1130
1131             /* Update potential sum for this i atom from the interaction with this j atom. */
1132             velec            = _mm_andnot_ps(dummy_mask,velec);
1133             velecsum         = _mm_add_ps(velecsum,velec);
1134
1135             fscal            = felec;
1136
1137             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1138
1139             /* Calculate temporary vectorial force */
1140             tx               = _mm_mul_ps(fscal,dx21);
1141             ty               = _mm_mul_ps(fscal,dy21);
1142             tz               = _mm_mul_ps(fscal,dz21);
1143
1144             /* Update vectorial force */
1145             fix2             = _mm_add_ps(fix2,tx);
1146             fiy2             = _mm_add_ps(fiy2,ty);
1147             fiz2             = _mm_add_ps(fiz2,tz);
1148
1149             fjx1             = _mm_add_ps(fjx1,tx);
1150             fjy1             = _mm_add_ps(fjy1,ty);
1151             fjz1             = _mm_add_ps(fjz1,tz);
1152
1153             /**************************
1154              * CALCULATE INTERACTIONS *
1155              **************************/
1156
1157             r22              = _mm_mul_ps(rsq22,rinv22);
1158             r22              = _mm_andnot_ps(dummy_mask,r22);
1159
1160             /* Calculate table index by multiplying r with table scale and truncate to integer */
1161             rt               = _mm_mul_ps(r22,vftabscale);
1162             vfitab           = _mm_cvttps_epi32(rt);
1163             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1164             vfitab           = _mm_slli_epi32(vfitab,2);
1165
1166             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1167             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1168             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1169             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1170             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1171             _MM_TRANSPOSE4_PS(Y,F,G,H);
1172             Heps             = _mm_mul_ps(vfeps,H);
1173             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1174             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1175             velec            = _mm_mul_ps(qq22,VV);
1176             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1177             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1178
1179             /* Update potential sum for this i atom from the interaction with this j atom. */
1180             velec            = _mm_andnot_ps(dummy_mask,velec);
1181             velecsum         = _mm_add_ps(velecsum,velec);
1182
1183             fscal            = felec;
1184
1185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1186
1187             /* Calculate temporary vectorial force */
1188             tx               = _mm_mul_ps(fscal,dx22);
1189             ty               = _mm_mul_ps(fscal,dy22);
1190             tz               = _mm_mul_ps(fscal,dz22);
1191
1192             /* Update vectorial force */
1193             fix2             = _mm_add_ps(fix2,tx);
1194             fiy2             = _mm_add_ps(fiy2,ty);
1195             fiz2             = _mm_add_ps(fiz2,tz);
1196
1197             fjx2             = _mm_add_ps(fjx2,tx);
1198             fjy2             = _mm_add_ps(fjy2,ty);
1199             fjz2             = _mm_add_ps(fjz2,tz);
1200
1201             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1202             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1203             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1204             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1205
1206             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1207                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1208
1209             /* Inner loop uses 396 flops */
1210         }
1211
1212         /* End of innermost loop */
1213
1214         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1215                                               f+i_coord_offset,fshift+i_shift_offset);
1216
1217         ggid                        = gid[iidx];
1218         /* Update potential energies */
1219         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1220
1221         /* Increment number of inner iterations */
1222         inneriter                  += j_index_end - j_index_start;
1223
1224         /* Outer loop uses 19 flops */
1225     }
1226
1227     /* Increment number of outer iterations */
1228     outeriter        += nri;
1229
1230     /* Update outer/inner flops */
1231
1232     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
1233 }
1234 /*
1235  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_single
1236  * Electrostatics interaction: CubicSplineTable
1237  * VdW interaction:            None
1238  * Geometry:                   Water3-Water3
1239  * Calculate force/pot:        Force
1240  */
1241 void
1242 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_single
1243                     (t_nblist                    * gmx_restrict       nlist,
1244                      rvec                        * gmx_restrict          xx,
1245                      rvec                        * gmx_restrict          ff,
1246                      t_forcerec                  * gmx_restrict          fr,
1247                      t_mdatoms                   * gmx_restrict     mdatoms,
1248                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1249                      t_nrnb                      * gmx_restrict        nrnb)
1250 {
1251     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1252      * just 0 for non-waters.
1253      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1254      * jnr indices corresponding to data put in the four positions in the SIMD register.
1255      */
1256     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1257     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1258     int              jnrA,jnrB,jnrC,jnrD;
1259     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1260     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1261     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1262     real             rcutoff_scalar;
1263     real             *shiftvec,*fshift,*x,*f;
1264     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1265     real             scratch[4*DIM];
1266     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1267     int              vdwioffset0;
1268     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1269     int              vdwioffset1;
1270     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1271     int              vdwioffset2;
1272     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1273     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1274     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1275     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1276     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1277     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1278     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1279     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1280     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1281     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1282     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1283     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1284     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1285     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1286     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1287     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1288     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1289     real             *charge;
1290     __m128i          vfitab;
1291     __m128i          ifour       = _mm_set1_epi32(4);
1292     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1293     real             *vftab;
1294     __m128           dummy_mask,cutoff_mask;
1295     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1296     __m128           one     = _mm_set1_ps(1.0);
1297     __m128           two     = _mm_set1_ps(2.0);
1298     x                = xx[0];
1299     f                = ff[0];
1300
1301     nri              = nlist->nri;
1302     iinr             = nlist->iinr;
1303     jindex           = nlist->jindex;
1304     jjnr             = nlist->jjnr;
1305     shiftidx         = nlist->shift;
1306     gid              = nlist->gid;
1307     shiftvec         = fr->shift_vec[0];
1308     fshift           = fr->fshift[0];
1309     facel            = _mm_set1_ps(fr->epsfac);
1310     charge           = mdatoms->chargeA;
1311
1312     vftab            = kernel_data->table_elec->data;
1313     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1314
1315     /* Setup water-specific parameters */
1316     inr              = nlist->iinr[0];
1317     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1318     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1319     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1320
1321     jq0              = _mm_set1_ps(charge[inr+0]);
1322     jq1              = _mm_set1_ps(charge[inr+1]);
1323     jq2              = _mm_set1_ps(charge[inr+2]);
1324     qq00             = _mm_mul_ps(iq0,jq0);
1325     qq01             = _mm_mul_ps(iq0,jq1);
1326     qq02             = _mm_mul_ps(iq0,jq2);
1327     qq10             = _mm_mul_ps(iq1,jq0);
1328     qq11             = _mm_mul_ps(iq1,jq1);
1329     qq12             = _mm_mul_ps(iq1,jq2);
1330     qq20             = _mm_mul_ps(iq2,jq0);
1331     qq21             = _mm_mul_ps(iq2,jq1);
1332     qq22             = _mm_mul_ps(iq2,jq2);
1333
1334     /* Avoid stupid compiler warnings */
1335     jnrA = jnrB = jnrC = jnrD = 0;
1336     j_coord_offsetA = 0;
1337     j_coord_offsetB = 0;
1338     j_coord_offsetC = 0;
1339     j_coord_offsetD = 0;
1340
1341     outeriter        = 0;
1342     inneriter        = 0;
1343
1344     for(iidx=0;iidx<4*DIM;iidx++)
1345     {
1346         scratch[iidx] = 0.0;
1347     }
1348
1349     /* Start outer loop over neighborlists */
1350     for(iidx=0; iidx<nri; iidx++)
1351     {
1352         /* Load shift vector for this list */
1353         i_shift_offset   = DIM*shiftidx[iidx];
1354
1355         /* Load limits for loop over neighbors */
1356         j_index_start    = jindex[iidx];
1357         j_index_end      = jindex[iidx+1];
1358
1359         /* Get outer coordinate index */
1360         inr              = iinr[iidx];
1361         i_coord_offset   = DIM*inr;
1362
1363         /* Load i particle coords and add shift vector */
1364         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1365                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1366
1367         fix0             = _mm_setzero_ps();
1368         fiy0             = _mm_setzero_ps();
1369         fiz0             = _mm_setzero_ps();
1370         fix1             = _mm_setzero_ps();
1371         fiy1             = _mm_setzero_ps();
1372         fiz1             = _mm_setzero_ps();
1373         fix2             = _mm_setzero_ps();
1374         fiy2             = _mm_setzero_ps();
1375         fiz2             = _mm_setzero_ps();
1376
1377         /* Start inner kernel loop */
1378         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1379         {
1380
1381             /* Get j neighbor index, and coordinate index */
1382             jnrA             = jjnr[jidx];
1383             jnrB             = jjnr[jidx+1];
1384             jnrC             = jjnr[jidx+2];
1385             jnrD             = jjnr[jidx+3];
1386             j_coord_offsetA  = DIM*jnrA;
1387             j_coord_offsetB  = DIM*jnrB;
1388             j_coord_offsetC  = DIM*jnrC;
1389             j_coord_offsetD  = DIM*jnrD;
1390
1391             /* load j atom coordinates */
1392             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1393                                               x+j_coord_offsetC,x+j_coord_offsetD,
1394                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1395
1396             /* Calculate displacement vector */
1397             dx00             = _mm_sub_ps(ix0,jx0);
1398             dy00             = _mm_sub_ps(iy0,jy0);
1399             dz00             = _mm_sub_ps(iz0,jz0);
1400             dx01             = _mm_sub_ps(ix0,jx1);
1401             dy01             = _mm_sub_ps(iy0,jy1);
1402             dz01             = _mm_sub_ps(iz0,jz1);
1403             dx02             = _mm_sub_ps(ix0,jx2);
1404             dy02             = _mm_sub_ps(iy0,jy2);
1405             dz02             = _mm_sub_ps(iz0,jz2);
1406             dx10             = _mm_sub_ps(ix1,jx0);
1407             dy10             = _mm_sub_ps(iy1,jy0);
1408             dz10             = _mm_sub_ps(iz1,jz0);
1409             dx11             = _mm_sub_ps(ix1,jx1);
1410             dy11             = _mm_sub_ps(iy1,jy1);
1411             dz11             = _mm_sub_ps(iz1,jz1);
1412             dx12             = _mm_sub_ps(ix1,jx2);
1413             dy12             = _mm_sub_ps(iy1,jy2);
1414             dz12             = _mm_sub_ps(iz1,jz2);
1415             dx20             = _mm_sub_ps(ix2,jx0);
1416             dy20             = _mm_sub_ps(iy2,jy0);
1417             dz20             = _mm_sub_ps(iz2,jz0);
1418             dx21             = _mm_sub_ps(ix2,jx1);
1419             dy21             = _mm_sub_ps(iy2,jy1);
1420             dz21             = _mm_sub_ps(iz2,jz1);
1421             dx22             = _mm_sub_ps(ix2,jx2);
1422             dy22             = _mm_sub_ps(iy2,jy2);
1423             dz22             = _mm_sub_ps(iz2,jz2);
1424
1425             /* Calculate squared distance and things based on it */
1426             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1427             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1428             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1429             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1430             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1431             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1432             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1433             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1434             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1435
1436             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1437             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1438             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1439             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1440             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1441             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1442             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1443             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1444             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1445
1446             fjx0             = _mm_setzero_ps();
1447             fjy0             = _mm_setzero_ps();
1448             fjz0             = _mm_setzero_ps();
1449             fjx1             = _mm_setzero_ps();
1450             fjy1             = _mm_setzero_ps();
1451             fjz1             = _mm_setzero_ps();
1452             fjx2             = _mm_setzero_ps();
1453             fjy2             = _mm_setzero_ps();
1454             fjz2             = _mm_setzero_ps();
1455
1456             /**************************
1457              * CALCULATE INTERACTIONS *
1458              **************************/
1459
1460             r00              = _mm_mul_ps(rsq00,rinv00);
1461
1462             /* Calculate table index by multiplying r with table scale and truncate to integer */
1463             rt               = _mm_mul_ps(r00,vftabscale);
1464             vfitab           = _mm_cvttps_epi32(rt);
1465             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1466             vfitab           = _mm_slli_epi32(vfitab,2);
1467
1468             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1469             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1470             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1471             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1472             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1473             _MM_TRANSPOSE4_PS(Y,F,G,H);
1474             Heps             = _mm_mul_ps(vfeps,H);
1475             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1476             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1477             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1478
1479             fscal            = felec;
1480
1481             /* Calculate temporary vectorial force */
1482             tx               = _mm_mul_ps(fscal,dx00);
1483             ty               = _mm_mul_ps(fscal,dy00);
1484             tz               = _mm_mul_ps(fscal,dz00);
1485
1486             /* Update vectorial force */
1487             fix0             = _mm_add_ps(fix0,tx);
1488             fiy0             = _mm_add_ps(fiy0,ty);
1489             fiz0             = _mm_add_ps(fiz0,tz);
1490
1491             fjx0             = _mm_add_ps(fjx0,tx);
1492             fjy0             = _mm_add_ps(fjy0,ty);
1493             fjz0             = _mm_add_ps(fjz0,tz);
1494
1495             /**************************
1496              * CALCULATE INTERACTIONS *
1497              **************************/
1498
1499             r01              = _mm_mul_ps(rsq01,rinv01);
1500
1501             /* Calculate table index by multiplying r with table scale and truncate to integer */
1502             rt               = _mm_mul_ps(r01,vftabscale);
1503             vfitab           = _mm_cvttps_epi32(rt);
1504             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1505             vfitab           = _mm_slli_epi32(vfitab,2);
1506
1507             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1508             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1509             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1510             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1511             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1512             _MM_TRANSPOSE4_PS(Y,F,G,H);
1513             Heps             = _mm_mul_ps(vfeps,H);
1514             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1515             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1516             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1517
1518             fscal            = felec;
1519
1520             /* Calculate temporary vectorial force */
1521             tx               = _mm_mul_ps(fscal,dx01);
1522             ty               = _mm_mul_ps(fscal,dy01);
1523             tz               = _mm_mul_ps(fscal,dz01);
1524
1525             /* Update vectorial force */
1526             fix0             = _mm_add_ps(fix0,tx);
1527             fiy0             = _mm_add_ps(fiy0,ty);
1528             fiz0             = _mm_add_ps(fiz0,tz);
1529
1530             fjx1             = _mm_add_ps(fjx1,tx);
1531             fjy1             = _mm_add_ps(fjy1,ty);
1532             fjz1             = _mm_add_ps(fjz1,tz);
1533
1534             /**************************
1535              * CALCULATE INTERACTIONS *
1536              **************************/
1537
1538             r02              = _mm_mul_ps(rsq02,rinv02);
1539
1540             /* Calculate table index by multiplying r with table scale and truncate to integer */
1541             rt               = _mm_mul_ps(r02,vftabscale);
1542             vfitab           = _mm_cvttps_epi32(rt);
1543             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1544             vfitab           = _mm_slli_epi32(vfitab,2);
1545
1546             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1547             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1548             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1549             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1550             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1551             _MM_TRANSPOSE4_PS(Y,F,G,H);
1552             Heps             = _mm_mul_ps(vfeps,H);
1553             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1554             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1555             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1556
1557             fscal            = felec;
1558
1559             /* Calculate temporary vectorial force */
1560             tx               = _mm_mul_ps(fscal,dx02);
1561             ty               = _mm_mul_ps(fscal,dy02);
1562             tz               = _mm_mul_ps(fscal,dz02);
1563
1564             /* Update vectorial force */
1565             fix0             = _mm_add_ps(fix0,tx);
1566             fiy0             = _mm_add_ps(fiy0,ty);
1567             fiz0             = _mm_add_ps(fiz0,tz);
1568
1569             fjx2             = _mm_add_ps(fjx2,tx);
1570             fjy2             = _mm_add_ps(fjy2,ty);
1571             fjz2             = _mm_add_ps(fjz2,tz);
1572
1573             /**************************
1574              * CALCULATE INTERACTIONS *
1575              **************************/
1576
1577             r10              = _mm_mul_ps(rsq10,rinv10);
1578
1579             /* Calculate table index by multiplying r with table scale and truncate to integer */
1580             rt               = _mm_mul_ps(r10,vftabscale);
1581             vfitab           = _mm_cvttps_epi32(rt);
1582             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1583             vfitab           = _mm_slli_epi32(vfitab,2);
1584
1585             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1586             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1587             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1588             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1589             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1590             _MM_TRANSPOSE4_PS(Y,F,G,H);
1591             Heps             = _mm_mul_ps(vfeps,H);
1592             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1593             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1594             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1595
1596             fscal            = felec;
1597
1598             /* Calculate temporary vectorial force */
1599             tx               = _mm_mul_ps(fscal,dx10);
1600             ty               = _mm_mul_ps(fscal,dy10);
1601             tz               = _mm_mul_ps(fscal,dz10);
1602
1603             /* Update vectorial force */
1604             fix1             = _mm_add_ps(fix1,tx);
1605             fiy1             = _mm_add_ps(fiy1,ty);
1606             fiz1             = _mm_add_ps(fiz1,tz);
1607
1608             fjx0             = _mm_add_ps(fjx0,tx);
1609             fjy0             = _mm_add_ps(fjy0,ty);
1610             fjz0             = _mm_add_ps(fjz0,tz);
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             r11              = _mm_mul_ps(rsq11,rinv11);
1617
1618             /* Calculate table index by multiplying r with table scale and truncate to integer */
1619             rt               = _mm_mul_ps(r11,vftabscale);
1620             vfitab           = _mm_cvttps_epi32(rt);
1621             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1622             vfitab           = _mm_slli_epi32(vfitab,2);
1623
1624             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1625             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1626             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1627             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1628             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1629             _MM_TRANSPOSE4_PS(Y,F,G,H);
1630             Heps             = _mm_mul_ps(vfeps,H);
1631             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1632             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1633             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1634
1635             fscal            = felec;
1636
1637             /* Calculate temporary vectorial force */
1638             tx               = _mm_mul_ps(fscal,dx11);
1639             ty               = _mm_mul_ps(fscal,dy11);
1640             tz               = _mm_mul_ps(fscal,dz11);
1641
1642             /* Update vectorial force */
1643             fix1             = _mm_add_ps(fix1,tx);
1644             fiy1             = _mm_add_ps(fiy1,ty);
1645             fiz1             = _mm_add_ps(fiz1,tz);
1646
1647             fjx1             = _mm_add_ps(fjx1,tx);
1648             fjy1             = _mm_add_ps(fjy1,ty);
1649             fjz1             = _mm_add_ps(fjz1,tz);
1650
1651             /**************************
1652              * CALCULATE INTERACTIONS *
1653              **************************/
1654
1655             r12              = _mm_mul_ps(rsq12,rinv12);
1656
1657             /* Calculate table index by multiplying r with table scale and truncate to integer */
1658             rt               = _mm_mul_ps(r12,vftabscale);
1659             vfitab           = _mm_cvttps_epi32(rt);
1660             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1661             vfitab           = _mm_slli_epi32(vfitab,2);
1662
1663             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1664             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1665             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1666             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1667             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1668             _MM_TRANSPOSE4_PS(Y,F,G,H);
1669             Heps             = _mm_mul_ps(vfeps,H);
1670             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1671             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1672             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1673
1674             fscal            = felec;
1675
1676             /* Calculate temporary vectorial force */
1677             tx               = _mm_mul_ps(fscal,dx12);
1678             ty               = _mm_mul_ps(fscal,dy12);
1679             tz               = _mm_mul_ps(fscal,dz12);
1680
1681             /* Update vectorial force */
1682             fix1             = _mm_add_ps(fix1,tx);
1683             fiy1             = _mm_add_ps(fiy1,ty);
1684             fiz1             = _mm_add_ps(fiz1,tz);
1685
1686             fjx2             = _mm_add_ps(fjx2,tx);
1687             fjy2             = _mm_add_ps(fjy2,ty);
1688             fjz2             = _mm_add_ps(fjz2,tz);
1689
1690             /**************************
1691              * CALCULATE INTERACTIONS *
1692              **************************/
1693
1694             r20              = _mm_mul_ps(rsq20,rinv20);
1695
1696             /* Calculate table index by multiplying r with table scale and truncate to integer */
1697             rt               = _mm_mul_ps(r20,vftabscale);
1698             vfitab           = _mm_cvttps_epi32(rt);
1699             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1700             vfitab           = _mm_slli_epi32(vfitab,2);
1701
1702             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1703             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1704             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1705             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1706             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1707             _MM_TRANSPOSE4_PS(Y,F,G,H);
1708             Heps             = _mm_mul_ps(vfeps,H);
1709             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1710             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1711             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1712
1713             fscal            = felec;
1714
1715             /* Calculate temporary vectorial force */
1716             tx               = _mm_mul_ps(fscal,dx20);
1717             ty               = _mm_mul_ps(fscal,dy20);
1718             tz               = _mm_mul_ps(fscal,dz20);
1719
1720             /* Update vectorial force */
1721             fix2             = _mm_add_ps(fix2,tx);
1722             fiy2             = _mm_add_ps(fiy2,ty);
1723             fiz2             = _mm_add_ps(fiz2,tz);
1724
1725             fjx0             = _mm_add_ps(fjx0,tx);
1726             fjy0             = _mm_add_ps(fjy0,ty);
1727             fjz0             = _mm_add_ps(fjz0,tz);
1728
1729             /**************************
1730              * CALCULATE INTERACTIONS *
1731              **************************/
1732
1733             r21              = _mm_mul_ps(rsq21,rinv21);
1734
1735             /* Calculate table index by multiplying r with table scale and truncate to integer */
1736             rt               = _mm_mul_ps(r21,vftabscale);
1737             vfitab           = _mm_cvttps_epi32(rt);
1738             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1739             vfitab           = _mm_slli_epi32(vfitab,2);
1740
1741             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1743             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1744             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1745             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1746             _MM_TRANSPOSE4_PS(Y,F,G,H);
1747             Heps             = _mm_mul_ps(vfeps,H);
1748             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1749             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1750             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1751
1752             fscal            = felec;
1753
1754             /* Calculate temporary vectorial force */
1755             tx               = _mm_mul_ps(fscal,dx21);
1756             ty               = _mm_mul_ps(fscal,dy21);
1757             tz               = _mm_mul_ps(fscal,dz21);
1758
1759             /* Update vectorial force */
1760             fix2             = _mm_add_ps(fix2,tx);
1761             fiy2             = _mm_add_ps(fiy2,ty);
1762             fiz2             = _mm_add_ps(fiz2,tz);
1763
1764             fjx1             = _mm_add_ps(fjx1,tx);
1765             fjy1             = _mm_add_ps(fjy1,ty);
1766             fjz1             = _mm_add_ps(fjz1,tz);
1767
1768             /**************************
1769              * CALCULATE INTERACTIONS *
1770              **************************/
1771
1772             r22              = _mm_mul_ps(rsq22,rinv22);
1773
1774             /* Calculate table index by multiplying r with table scale and truncate to integer */
1775             rt               = _mm_mul_ps(r22,vftabscale);
1776             vfitab           = _mm_cvttps_epi32(rt);
1777             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1778             vfitab           = _mm_slli_epi32(vfitab,2);
1779
1780             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1781             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1782             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1783             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1784             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1785             _MM_TRANSPOSE4_PS(Y,F,G,H);
1786             Heps             = _mm_mul_ps(vfeps,H);
1787             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1788             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1789             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1790
1791             fscal            = felec;
1792
1793             /* Calculate temporary vectorial force */
1794             tx               = _mm_mul_ps(fscal,dx22);
1795             ty               = _mm_mul_ps(fscal,dy22);
1796             tz               = _mm_mul_ps(fscal,dz22);
1797
1798             /* Update vectorial force */
1799             fix2             = _mm_add_ps(fix2,tx);
1800             fiy2             = _mm_add_ps(fiy2,ty);
1801             fiz2             = _mm_add_ps(fiz2,tz);
1802
1803             fjx2             = _mm_add_ps(fjx2,tx);
1804             fjy2             = _mm_add_ps(fjy2,ty);
1805             fjz2             = _mm_add_ps(fjz2,tz);
1806
1807             fjptrA             = f+j_coord_offsetA;
1808             fjptrB             = f+j_coord_offsetB;
1809             fjptrC             = f+j_coord_offsetC;
1810             fjptrD             = f+j_coord_offsetD;
1811
1812             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1813                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1814
1815             /* Inner loop uses 351 flops */
1816         }
1817
1818         if(jidx<j_index_end)
1819         {
1820
1821             /* Get j neighbor index, and coordinate index */
1822             jnrlistA         = jjnr[jidx];
1823             jnrlistB         = jjnr[jidx+1];
1824             jnrlistC         = jjnr[jidx+2];
1825             jnrlistD         = jjnr[jidx+3];
1826             /* Sign of each element will be negative for non-real atoms.
1827              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1828              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1829              */
1830             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1831             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1832             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1833             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1834             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1835             j_coord_offsetA  = DIM*jnrA;
1836             j_coord_offsetB  = DIM*jnrB;
1837             j_coord_offsetC  = DIM*jnrC;
1838             j_coord_offsetD  = DIM*jnrD;
1839
1840             /* load j atom coordinates */
1841             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1842                                               x+j_coord_offsetC,x+j_coord_offsetD,
1843                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1844
1845             /* Calculate displacement vector */
1846             dx00             = _mm_sub_ps(ix0,jx0);
1847             dy00             = _mm_sub_ps(iy0,jy0);
1848             dz00             = _mm_sub_ps(iz0,jz0);
1849             dx01             = _mm_sub_ps(ix0,jx1);
1850             dy01             = _mm_sub_ps(iy0,jy1);
1851             dz01             = _mm_sub_ps(iz0,jz1);
1852             dx02             = _mm_sub_ps(ix0,jx2);
1853             dy02             = _mm_sub_ps(iy0,jy2);
1854             dz02             = _mm_sub_ps(iz0,jz2);
1855             dx10             = _mm_sub_ps(ix1,jx0);
1856             dy10             = _mm_sub_ps(iy1,jy0);
1857             dz10             = _mm_sub_ps(iz1,jz0);
1858             dx11             = _mm_sub_ps(ix1,jx1);
1859             dy11             = _mm_sub_ps(iy1,jy1);
1860             dz11             = _mm_sub_ps(iz1,jz1);
1861             dx12             = _mm_sub_ps(ix1,jx2);
1862             dy12             = _mm_sub_ps(iy1,jy2);
1863             dz12             = _mm_sub_ps(iz1,jz2);
1864             dx20             = _mm_sub_ps(ix2,jx0);
1865             dy20             = _mm_sub_ps(iy2,jy0);
1866             dz20             = _mm_sub_ps(iz2,jz0);
1867             dx21             = _mm_sub_ps(ix2,jx1);
1868             dy21             = _mm_sub_ps(iy2,jy1);
1869             dz21             = _mm_sub_ps(iz2,jz1);
1870             dx22             = _mm_sub_ps(ix2,jx2);
1871             dy22             = _mm_sub_ps(iy2,jy2);
1872             dz22             = _mm_sub_ps(iz2,jz2);
1873
1874             /* Calculate squared distance and things based on it */
1875             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1876             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1877             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1878             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1879             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1880             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1881             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1882             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1883             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1884
1885             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1886             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1887             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1888             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1889             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1890             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1891             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1892             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1893             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1894
1895             fjx0             = _mm_setzero_ps();
1896             fjy0             = _mm_setzero_ps();
1897             fjz0             = _mm_setzero_ps();
1898             fjx1             = _mm_setzero_ps();
1899             fjy1             = _mm_setzero_ps();
1900             fjz1             = _mm_setzero_ps();
1901             fjx2             = _mm_setzero_ps();
1902             fjy2             = _mm_setzero_ps();
1903             fjz2             = _mm_setzero_ps();
1904
1905             /**************************
1906              * CALCULATE INTERACTIONS *
1907              **************************/
1908
1909             r00              = _mm_mul_ps(rsq00,rinv00);
1910             r00              = _mm_andnot_ps(dummy_mask,r00);
1911
1912             /* Calculate table index by multiplying r with table scale and truncate to integer */
1913             rt               = _mm_mul_ps(r00,vftabscale);
1914             vfitab           = _mm_cvttps_epi32(rt);
1915             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1916             vfitab           = _mm_slli_epi32(vfitab,2);
1917
1918             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1919             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1920             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1921             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1922             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1923             _MM_TRANSPOSE4_PS(Y,F,G,H);
1924             Heps             = _mm_mul_ps(vfeps,H);
1925             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1926             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1927             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1928
1929             fscal            = felec;
1930
1931             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1932
1933             /* Calculate temporary vectorial force */
1934             tx               = _mm_mul_ps(fscal,dx00);
1935             ty               = _mm_mul_ps(fscal,dy00);
1936             tz               = _mm_mul_ps(fscal,dz00);
1937
1938             /* Update vectorial force */
1939             fix0             = _mm_add_ps(fix0,tx);
1940             fiy0             = _mm_add_ps(fiy0,ty);
1941             fiz0             = _mm_add_ps(fiz0,tz);
1942
1943             fjx0             = _mm_add_ps(fjx0,tx);
1944             fjy0             = _mm_add_ps(fjy0,ty);
1945             fjz0             = _mm_add_ps(fjz0,tz);
1946
1947             /**************************
1948              * CALCULATE INTERACTIONS *
1949              **************************/
1950
1951             r01              = _mm_mul_ps(rsq01,rinv01);
1952             r01              = _mm_andnot_ps(dummy_mask,r01);
1953
1954             /* Calculate table index by multiplying r with table scale and truncate to integer */
1955             rt               = _mm_mul_ps(r01,vftabscale);
1956             vfitab           = _mm_cvttps_epi32(rt);
1957             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1958             vfitab           = _mm_slli_epi32(vfitab,2);
1959
1960             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1961             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1962             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1963             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1964             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1965             _MM_TRANSPOSE4_PS(Y,F,G,H);
1966             Heps             = _mm_mul_ps(vfeps,H);
1967             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1968             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1969             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1970
1971             fscal            = felec;
1972
1973             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1974
1975             /* Calculate temporary vectorial force */
1976             tx               = _mm_mul_ps(fscal,dx01);
1977             ty               = _mm_mul_ps(fscal,dy01);
1978             tz               = _mm_mul_ps(fscal,dz01);
1979
1980             /* Update vectorial force */
1981             fix0             = _mm_add_ps(fix0,tx);
1982             fiy0             = _mm_add_ps(fiy0,ty);
1983             fiz0             = _mm_add_ps(fiz0,tz);
1984
1985             fjx1             = _mm_add_ps(fjx1,tx);
1986             fjy1             = _mm_add_ps(fjy1,ty);
1987             fjz1             = _mm_add_ps(fjz1,tz);
1988
1989             /**************************
1990              * CALCULATE INTERACTIONS *
1991              **************************/
1992
1993             r02              = _mm_mul_ps(rsq02,rinv02);
1994             r02              = _mm_andnot_ps(dummy_mask,r02);
1995
1996             /* Calculate table index by multiplying r with table scale and truncate to integer */
1997             rt               = _mm_mul_ps(r02,vftabscale);
1998             vfitab           = _mm_cvttps_epi32(rt);
1999             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2000             vfitab           = _mm_slli_epi32(vfitab,2);
2001
2002             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2003             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2004             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2005             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2006             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2007             _MM_TRANSPOSE4_PS(Y,F,G,H);
2008             Heps             = _mm_mul_ps(vfeps,H);
2009             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2010             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2011             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2012
2013             fscal            = felec;
2014
2015             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2016
2017             /* Calculate temporary vectorial force */
2018             tx               = _mm_mul_ps(fscal,dx02);
2019             ty               = _mm_mul_ps(fscal,dy02);
2020             tz               = _mm_mul_ps(fscal,dz02);
2021
2022             /* Update vectorial force */
2023             fix0             = _mm_add_ps(fix0,tx);
2024             fiy0             = _mm_add_ps(fiy0,ty);
2025             fiz0             = _mm_add_ps(fiz0,tz);
2026
2027             fjx2             = _mm_add_ps(fjx2,tx);
2028             fjy2             = _mm_add_ps(fjy2,ty);
2029             fjz2             = _mm_add_ps(fjz2,tz);
2030
2031             /**************************
2032              * CALCULATE INTERACTIONS *
2033              **************************/
2034
2035             r10              = _mm_mul_ps(rsq10,rinv10);
2036             r10              = _mm_andnot_ps(dummy_mask,r10);
2037
2038             /* Calculate table index by multiplying r with table scale and truncate to integer */
2039             rt               = _mm_mul_ps(r10,vftabscale);
2040             vfitab           = _mm_cvttps_epi32(rt);
2041             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2042             vfitab           = _mm_slli_epi32(vfitab,2);
2043
2044             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2045             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2046             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2047             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2048             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2049             _MM_TRANSPOSE4_PS(Y,F,G,H);
2050             Heps             = _mm_mul_ps(vfeps,H);
2051             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2052             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2053             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2054
2055             fscal            = felec;
2056
2057             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2058
2059             /* Calculate temporary vectorial force */
2060             tx               = _mm_mul_ps(fscal,dx10);
2061             ty               = _mm_mul_ps(fscal,dy10);
2062             tz               = _mm_mul_ps(fscal,dz10);
2063
2064             /* Update vectorial force */
2065             fix1             = _mm_add_ps(fix1,tx);
2066             fiy1             = _mm_add_ps(fiy1,ty);
2067             fiz1             = _mm_add_ps(fiz1,tz);
2068
2069             fjx0             = _mm_add_ps(fjx0,tx);
2070             fjy0             = _mm_add_ps(fjy0,ty);
2071             fjz0             = _mm_add_ps(fjz0,tz);
2072
2073             /**************************
2074              * CALCULATE INTERACTIONS *
2075              **************************/
2076
2077             r11              = _mm_mul_ps(rsq11,rinv11);
2078             r11              = _mm_andnot_ps(dummy_mask,r11);
2079
2080             /* Calculate table index by multiplying r with table scale and truncate to integer */
2081             rt               = _mm_mul_ps(r11,vftabscale);
2082             vfitab           = _mm_cvttps_epi32(rt);
2083             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2084             vfitab           = _mm_slli_epi32(vfitab,2);
2085
2086             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2087             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2088             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2089             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2090             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2091             _MM_TRANSPOSE4_PS(Y,F,G,H);
2092             Heps             = _mm_mul_ps(vfeps,H);
2093             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2094             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2095             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2096
2097             fscal            = felec;
2098
2099             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2100
2101             /* Calculate temporary vectorial force */
2102             tx               = _mm_mul_ps(fscal,dx11);
2103             ty               = _mm_mul_ps(fscal,dy11);
2104             tz               = _mm_mul_ps(fscal,dz11);
2105
2106             /* Update vectorial force */
2107             fix1             = _mm_add_ps(fix1,tx);
2108             fiy1             = _mm_add_ps(fiy1,ty);
2109             fiz1             = _mm_add_ps(fiz1,tz);
2110
2111             fjx1             = _mm_add_ps(fjx1,tx);
2112             fjy1             = _mm_add_ps(fjy1,ty);
2113             fjz1             = _mm_add_ps(fjz1,tz);
2114
2115             /**************************
2116              * CALCULATE INTERACTIONS *
2117              **************************/
2118
2119             r12              = _mm_mul_ps(rsq12,rinv12);
2120             r12              = _mm_andnot_ps(dummy_mask,r12);
2121
2122             /* Calculate table index by multiplying r with table scale and truncate to integer */
2123             rt               = _mm_mul_ps(r12,vftabscale);
2124             vfitab           = _mm_cvttps_epi32(rt);
2125             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2126             vfitab           = _mm_slli_epi32(vfitab,2);
2127
2128             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2129             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2130             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2131             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2132             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2133             _MM_TRANSPOSE4_PS(Y,F,G,H);
2134             Heps             = _mm_mul_ps(vfeps,H);
2135             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2136             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2137             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2138
2139             fscal            = felec;
2140
2141             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2142
2143             /* Calculate temporary vectorial force */
2144             tx               = _mm_mul_ps(fscal,dx12);
2145             ty               = _mm_mul_ps(fscal,dy12);
2146             tz               = _mm_mul_ps(fscal,dz12);
2147
2148             /* Update vectorial force */
2149             fix1             = _mm_add_ps(fix1,tx);
2150             fiy1             = _mm_add_ps(fiy1,ty);
2151             fiz1             = _mm_add_ps(fiz1,tz);
2152
2153             fjx2             = _mm_add_ps(fjx2,tx);
2154             fjy2             = _mm_add_ps(fjy2,ty);
2155             fjz2             = _mm_add_ps(fjz2,tz);
2156
2157             /**************************
2158              * CALCULATE INTERACTIONS *
2159              **************************/
2160
2161             r20              = _mm_mul_ps(rsq20,rinv20);
2162             r20              = _mm_andnot_ps(dummy_mask,r20);
2163
2164             /* Calculate table index by multiplying r with table scale and truncate to integer */
2165             rt               = _mm_mul_ps(r20,vftabscale);
2166             vfitab           = _mm_cvttps_epi32(rt);
2167             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2168             vfitab           = _mm_slli_epi32(vfitab,2);
2169
2170             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2171             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2172             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2173             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2174             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2175             _MM_TRANSPOSE4_PS(Y,F,G,H);
2176             Heps             = _mm_mul_ps(vfeps,H);
2177             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2178             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2179             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2180
2181             fscal            = felec;
2182
2183             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2184
2185             /* Calculate temporary vectorial force */
2186             tx               = _mm_mul_ps(fscal,dx20);
2187             ty               = _mm_mul_ps(fscal,dy20);
2188             tz               = _mm_mul_ps(fscal,dz20);
2189
2190             /* Update vectorial force */
2191             fix2             = _mm_add_ps(fix2,tx);
2192             fiy2             = _mm_add_ps(fiy2,ty);
2193             fiz2             = _mm_add_ps(fiz2,tz);
2194
2195             fjx0             = _mm_add_ps(fjx0,tx);
2196             fjy0             = _mm_add_ps(fjy0,ty);
2197             fjz0             = _mm_add_ps(fjz0,tz);
2198
2199             /**************************
2200              * CALCULATE INTERACTIONS *
2201              **************************/
2202
2203             r21              = _mm_mul_ps(rsq21,rinv21);
2204             r21              = _mm_andnot_ps(dummy_mask,r21);
2205
2206             /* Calculate table index by multiplying r with table scale and truncate to integer */
2207             rt               = _mm_mul_ps(r21,vftabscale);
2208             vfitab           = _mm_cvttps_epi32(rt);
2209             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2210             vfitab           = _mm_slli_epi32(vfitab,2);
2211
2212             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2213             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2214             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2215             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2216             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2217             _MM_TRANSPOSE4_PS(Y,F,G,H);
2218             Heps             = _mm_mul_ps(vfeps,H);
2219             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2220             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2221             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2222
2223             fscal            = felec;
2224
2225             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2226
2227             /* Calculate temporary vectorial force */
2228             tx               = _mm_mul_ps(fscal,dx21);
2229             ty               = _mm_mul_ps(fscal,dy21);
2230             tz               = _mm_mul_ps(fscal,dz21);
2231
2232             /* Update vectorial force */
2233             fix2             = _mm_add_ps(fix2,tx);
2234             fiy2             = _mm_add_ps(fiy2,ty);
2235             fiz2             = _mm_add_ps(fiz2,tz);
2236
2237             fjx1             = _mm_add_ps(fjx1,tx);
2238             fjy1             = _mm_add_ps(fjy1,ty);
2239             fjz1             = _mm_add_ps(fjz1,tz);
2240
2241             /**************************
2242              * CALCULATE INTERACTIONS *
2243              **************************/
2244
2245             r22              = _mm_mul_ps(rsq22,rinv22);
2246             r22              = _mm_andnot_ps(dummy_mask,r22);
2247
2248             /* Calculate table index by multiplying r with table scale and truncate to integer */
2249             rt               = _mm_mul_ps(r22,vftabscale);
2250             vfitab           = _mm_cvttps_epi32(rt);
2251             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2252             vfitab           = _mm_slli_epi32(vfitab,2);
2253
2254             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2255             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2256             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2257             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2258             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2259             _MM_TRANSPOSE4_PS(Y,F,G,H);
2260             Heps             = _mm_mul_ps(vfeps,H);
2261             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2262             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2263             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2264
2265             fscal            = felec;
2266
2267             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2268
2269             /* Calculate temporary vectorial force */
2270             tx               = _mm_mul_ps(fscal,dx22);
2271             ty               = _mm_mul_ps(fscal,dy22);
2272             tz               = _mm_mul_ps(fscal,dz22);
2273
2274             /* Update vectorial force */
2275             fix2             = _mm_add_ps(fix2,tx);
2276             fiy2             = _mm_add_ps(fiy2,ty);
2277             fiz2             = _mm_add_ps(fiz2,tz);
2278
2279             fjx2             = _mm_add_ps(fjx2,tx);
2280             fjy2             = _mm_add_ps(fjy2,ty);
2281             fjz2             = _mm_add_ps(fjz2,tz);
2282
2283             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2284             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2285             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2286             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2287
2288             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2289                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2290
2291             /* Inner loop uses 360 flops */
2292         }
2293
2294         /* End of innermost loop */
2295
2296         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2297                                               f+i_coord_offset,fshift+i_shift_offset);
2298
2299         /* Increment number of inner iterations */
2300         inneriter                  += j_index_end - j_index_start;
2301
2302         /* Outer loop uses 18 flops */
2303     }
2304
2305     /* Increment number of outer iterations */
2306     outeriter        += nri;
2307
2308     /* Update outer/inner flops */
2309
2310     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
2311 }