32748779e39803bde5d4a6d244d8e00629611fa1
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water4-Particle
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse4_1_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96     __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
97     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
98     real             *charge;
99     int              nvdwtype;
100     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101     int              *vdwtype;
102     real             *vdwparam;
103     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
104     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
105     __m128i          vfitab;
106     __m128i          ifour       = _mm_set1_epi32(4);
107     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
108     real             *vftab;
109     __m128           dummy_mask,cutoff_mask;
110     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111     __m128           one     = _mm_set1_ps(1.0);
112     __m128           two     = _mm_set1_ps(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm_set1_ps(fr->epsfac);
125     charge           = mdatoms->chargeA;
126     krf              = _mm_set1_ps(fr->ic->k_rf);
127     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
128     crf              = _mm_set1_ps(fr->ic->c_rf);
129     nvdwtype         = fr->ntype;
130     vdwparam         = fr->nbfp;
131     vdwtype          = mdatoms->typeA;
132
133     vftab            = kernel_data->table_vdw->data;
134     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
141     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
142
143     /* Avoid stupid compiler warnings */
144     jnrA = jnrB = jnrC = jnrD = 0;
145     j_coord_offsetA = 0;
146     j_coord_offsetB = 0;
147     j_coord_offsetC = 0;
148     j_coord_offsetD = 0;
149
150     outeriter        = 0;
151     inneriter        = 0;
152
153     for(iidx=0;iidx<4*DIM;iidx++)
154     {
155         scratch[iidx] = 0.0;
156     }
157
158     /* Start outer loop over neighborlists */
159     for(iidx=0; iidx<nri; iidx++)
160     {
161         /* Load shift vector for this list */
162         i_shift_offset   = DIM*shiftidx[iidx];
163
164         /* Load limits for loop over neighbors */
165         j_index_start    = jindex[iidx];
166         j_index_end      = jindex[iidx+1];
167
168         /* Get outer coordinate index */
169         inr              = iinr[iidx];
170         i_coord_offset   = DIM*inr;
171
172         /* Load i particle coords and add shift vector */
173         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
174                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
175
176         fix0             = _mm_setzero_ps();
177         fiy0             = _mm_setzero_ps();
178         fiz0             = _mm_setzero_ps();
179         fix1             = _mm_setzero_ps();
180         fiy1             = _mm_setzero_ps();
181         fiz1             = _mm_setzero_ps();
182         fix2             = _mm_setzero_ps();
183         fiy2             = _mm_setzero_ps();
184         fiz2             = _mm_setzero_ps();
185         fix3             = _mm_setzero_ps();
186         fiy3             = _mm_setzero_ps();
187         fiz3             = _mm_setzero_ps();
188
189         /* Reset potential sums */
190         velecsum         = _mm_setzero_ps();
191         vvdwsum          = _mm_setzero_ps();
192
193         /* Start inner kernel loop */
194         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
195         {
196
197             /* Get j neighbor index, and coordinate index */
198             jnrA             = jjnr[jidx];
199             jnrB             = jjnr[jidx+1];
200             jnrC             = jjnr[jidx+2];
201             jnrD             = jjnr[jidx+3];
202             j_coord_offsetA  = DIM*jnrA;
203             j_coord_offsetB  = DIM*jnrB;
204             j_coord_offsetC  = DIM*jnrC;
205             j_coord_offsetD  = DIM*jnrD;
206
207             /* load j atom coordinates */
208             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
209                                               x+j_coord_offsetC,x+j_coord_offsetD,
210                                               &jx0,&jy0,&jz0);
211
212             /* Calculate displacement vector */
213             dx00             = _mm_sub_ps(ix0,jx0);
214             dy00             = _mm_sub_ps(iy0,jy0);
215             dz00             = _mm_sub_ps(iz0,jz0);
216             dx10             = _mm_sub_ps(ix1,jx0);
217             dy10             = _mm_sub_ps(iy1,jy0);
218             dz10             = _mm_sub_ps(iz1,jz0);
219             dx20             = _mm_sub_ps(ix2,jx0);
220             dy20             = _mm_sub_ps(iy2,jy0);
221             dz20             = _mm_sub_ps(iz2,jz0);
222             dx30             = _mm_sub_ps(ix3,jx0);
223             dy30             = _mm_sub_ps(iy3,jy0);
224             dz30             = _mm_sub_ps(iz3,jz0);
225
226             /* Calculate squared distance and things based on it */
227             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
228             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
229             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
230             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
231
232             rinv00           = gmx_mm_invsqrt_ps(rsq00);
233             rinv10           = gmx_mm_invsqrt_ps(rsq10);
234             rinv20           = gmx_mm_invsqrt_ps(rsq20);
235             rinv30           = gmx_mm_invsqrt_ps(rsq30);
236
237             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
238             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
239             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
240
241             /* Load parameters for j particles */
242             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
243                                                               charge+jnrC+0,charge+jnrD+0);
244             vdwjidx0A        = 2*vdwtype[jnrA+0];
245             vdwjidx0B        = 2*vdwtype[jnrB+0];
246             vdwjidx0C        = 2*vdwtype[jnrC+0];
247             vdwjidx0D        = 2*vdwtype[jnrD+0];
248
249             fjx0             = _mm_setzero_ps();
250             fjy0             = _mm_setzero_ps();
251             fjz0             = _mm_setzero_ps();
252
253             /**************************
254              * CALCULATE INTERACTIONS *
255              **************************/
256
257             r00              = _mm_mul_ps(rsq00,rinv00);
258
259             /* Compute parameters for interactions between i and j atoms */
260             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
261                                          vdwparam+vdwioffset0+vdwjidx0B,
262                                          vdwparam+vdwioffset0+vdwjidx0C,
263                                          vdwparam+vdwioffset0+vdwjidx0D,
264                                          &c6_00,&c12_00);
265
266             /* Calculate table index by multiplying r with table scale and truncate to integer */
267             rt               = _mm_mul_ps(r00,vftabscale);
268             vfitab           = _mm_cvttps_epi32(rt);
269             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
270             vfitab           = _mm_slli_epi32(vfitab,3);
271
272             /* CUBIC SPLINE TABLE DISPERSION */
273             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
274             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
275             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
276             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
277             _MM_TRANSPOSE4_PS(Y,F,G,H);
278             Heps             = _mm_mul_ps(vfeps,H);
279             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
280             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
281             vvdw6            = _mm_mul_ps(c6_00,VV);
282             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
283             fvdw6            = _mm_mul_ps(c6_00,FF);
284
285             /* CUBIC SPLINE TABLE REPULSION */
286             vfitab           = _mm_add_epi32(vfitab,ifour);
287             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
288             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
289             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
290             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
291             _MM_TRANSPOSE4_PS(Y,F,G,H);
292             Heps             = _mm_mul_ps(vfeps,H);
293             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
294             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
295             vvdw12           = _mm_mul_ps(c12_00,VV);
296             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
297             fvdw12           = _mm_mul_ps(c12_00,FF);
298             vvdw             = _mm_add_ps(vvdw12,vvdw6);
299             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
300
301             /* Update potential sum for this i atom from the interaction with this j atom. */
302             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
303
304             fscal            = fvdw;
305
306             /* Calculate temporary vectorial force */
307             tx               = _mm_mul_ps(fscal,dx00);
308             ty               = _mm_mul_ps(fscal,dy00);
309             tz               = _mm_mul_ps(fscal,dz00);
310
311             /* Update vectorial force */
312             fix0             = _mm_add_ps(fix0,tx);
313             fiy0             = _mm_add_ps(fiy0,ty);
314             fiz0             = _mm_add_ps(fiz0,tz);
315
316             fjx0             = _mm_add_ps(fjx0,tx);
317             fjy0             = _mm_add_ps(fjy0,ty);
318             fjz0             = _mm_add_ps(fjz0,tz);
319
320             /**************************
321              * CALCULATE INTERACTIONS *
322              **************************/
323
324             /* Compute parameters for interactions between i and j atoms */
325             qq10             = _mm_mul_ps(iq1,jq0);
326
327             /* REACTION-FIELD ELECTROSTATICS */
328             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
329             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
330
331             /* Update potential sum for this i atom from the interaction with this j atom. */
332             velecsum         = _mm_add_ps(velecsum,velec);
333
334             fscal            = felec;
335
336             /* Calculate temporary vectorial force */
337             tx               = _mm_mul_ps(fscal,dx10);
338             ty               = _mm_mul_ps(fscal,dy10);
339             tz               = _mm_mul_ps(fscal,dz10);
340
341             /* Update vectorial force */
342             fix1             = _mm_add_ps(fix1,tx);
343             fiy1             = _mm_add_ps(fiy1,ty);
344             fiz1             = _mm_add_ps(fiz1,tz);
345
346             fjx0             = _mm_add_ps(fjx0,tx);
347             fjy0             = _mm_add_ps(fjy0,ty);
348             fjz0             = _mm_add_ps(fjz0,tz);
349
350             /**************************
351              * CALCULATE INTERACTIONS *
352              **************************/
353
354             /* Compute parameters for interactions between i and j atoms */
355             qq20             = _mm_mul_ps(iq2,jq0);
356
357             /* REACTION-FIELD ELECTROSTATICS */
358             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
359             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             velecsum         = _mm_add_ps(velecsum,velec);
363
364             fscal            = felec;
365
366             /* Calculate temporary vectorial force */
367             tx               = _mm_mul_ps(fscal,dx20);
368             ty               = _mm_mul_ps(fscal,dy20);
369             tz               = _mm_mul_ps(fscal,dz20);
370
371             /* Update vectorial force */
372             fix2             = _mm_add_ps(fix2,tx);
373             fiy2             = _mm_add_ps(fiy2,ty);
374             fiz2             = _mm_add_ps(fiz2,tz);
375
376             fjx0             = _mm_add_ps(fjx0,tx);
377             fjy0             = _mm_add_ps(fjy0,ty);
378             fjz0             = _mm_add_ps(fjz0,tz);
379
380             /**************************
381              * CALCULATE INTERACTIONS *
382              **************************/
383
384             /* Compute parameters for interactions between i and j atoms */
385             qq30             = _mm_mul_ps(iq3,jq0);
386
387             /* REACTION-FIELD ELECTROSTATICS */
388             velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
389             felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
390
391             /* Update potential sum for this i atom from the interaction with this j atom. */
392             velecsum         = _mm_add_ps(velecsum,velec);
393
394             fscal            = felec;
395
396             /* Calculate temporary vectorial force */
397             tx               = _mm_mul_ps(fscal,dx30);
398             ty               = _mm_mul_ps(fscal,dy30);
399             tz               = _mm_mul_ps(fscal,dz30);
400
401             /* Update vectorial force */
402             fix3             = _mm_add_ps(fix3,tx);
403             fiy3             = _mm_add_ps(fiy3,ty);
404             fiz3             = _mm_add_ps(fiz3,tz);
405
406             fjx0             = _mm_add_ps(fjx0,tx);
407             fjy0             = _mm_add_ps(fjy0,ty);
408             fjz0             = _mm_add_ps(fjz0,tz);
409
410             fjptrA             = f+j_coord_offsetA;
411             fjptrB             = f+j_coord_offsetB;
412             fjptrC             = f+j_coord_offsetC;
413             fjptrD             = f+j_coord_offsetD;
414
415             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
416
417             /* Inner loop uses 152 flops */
418         }
419
420         if(jidx<j_index_end)
421         {
422
423             /* Get j neighbor index, and coordinate index */
424             jnrlistA         = jjnr[jidx];
425             jnrlistB         = jjnr[jidx+1];
426             jnrlistC         = jjnr[jidx+2];
427             jnrlistD         = jjnr[jidx+3];
428             /* Sign of each element will be negative for non-real atoms.
429              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
430              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
431              */
432             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
433             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
434             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
435             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
436             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
437             j_coord_offsetA  = DIM*jnrA;
438             j_coord_offsetB  = DIM*jnrB;
439             j_coord_offsetC  = DIM*jnrC;
440             j_coord_offsetD  = DIM*jnrD;
441
442             /* load j atom coordinates */
443             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
444                                               x+j_coord_offsetC,x+j_coord_offsetD,
445                                               &jx0,&jy0,&jz0);
446
447             /* Calculate displacement vector */
448             dx00             = _mm_sub_ps(ix0,jx0);
449             dy00             = _mm_sub_ps(iy0,jy0);
450             dz00             = _mm_sub_ps(iz0,jz0);
451             dx10             = _mm_sub_ps(ix1,jx0);
452             dy10             = _mm_sub_ps(iy1,jy0);
453             dz10             = _mm_sub_ps(iz1,jz0);
454             dx20             = _mm_sub_ps(ix2,jx0);
455             dy20             = _mm_sub_ps(iy2,jy0);
456             dz20             = _mm_sub_ps(iz2,jz0);
457             dx30             = _mm_sub_ps(ix3,jx0);
458             dy30             = _mm_sub_ps(iy3,jy0);
459             dz30             = _mm_sub_ps(iz3,jz0);
460
461             /* Calculate squared distance and things based on it */
462             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
463             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
464             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
465             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
466
467             rinv00           = gmx_mm_invsqrt_ps(rsq00);
468             rinv10           = gmx_mm_invsqrt_ps(rsq10);
469             rinv20           = gmx_mm_invsqrt_ps(rsq20);
470             rinv30           = gmx_mm_invsqrt_ps(rsq30);
471
472             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
473             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
474             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
475
476             /* Load parameters for j particles */
477             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
478                                                               charge+jnrC+0,charge+jnrD+0);
479             vdwjidx0A        = 2*vdwtype[jnrA+0];
480             vdwjidx0B        = 2*vdwtype[jnrB+0];
481             vdwjidx0C        = 2*vdwtype[jnrC+0];
482             vdwjidx0D        = 2*vdwtype[jnrD+0];
483
484             fjx0             = _mm_setzero_ps();
485             fjy0             = _mm_setzero_ps();
486             fjz0             = _mm_setzero_ps();
487
488             /**************************
489              * CALCULATE INTERACTIONS *
490              **************************/
491
492             r00              = _mm_mul_ps(rsq00,rinv00);
493             r00              = _mm_andnot_ps(dummy_mask,r00);
494
495             /* Compute parameters for interactions between i and j atoms */
496             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
497                                          vdwparam+vdwioffset0+vdwjidx0B,
498                                          vdwparam+vdwioffset0+vdwjidx0C,
499                                          vdwparam+vdwioffset0+vdwjidx0D,
500                                          &c6_00,&c12_00);
501
502             /* Calculate table index by multiplying r with table scale and truncate to integer */
503             rt               = _mm_mul_ps(r00,vftabscale);
504             vfitab           = _mm_cvttps_epi32(rt);
505             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
506             vfitab           = _mm_slli_epi32(vfitab,3);
507
508             /* CUBIC SPLINE TABLE DISPERSION */
509             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
510             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
511             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
512             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
513             _MM_TRANSPOSE4_PS(Y,F,G,H);
514             Heps             = _mm_mul_ps(vfeps,H);
515             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
516             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
517             vvdw6            = _mm_mul_ps(c6_00,VV);
518             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
519             fvdw6            = _mm_mul_ps(c6_00,FF);
520
521             /* CUBIC SPLINE TABLE REPULSION */
522             vfitab           = _mm_add_epi32(vfitab,ifour);
523             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
524             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
525             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
526             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
527             _MM_TRANSPOSE4_PS(Y,F,G,H);
528             Heps             = _mm_mul_ps(vfeps,H);
529             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
530             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
531             vvdw12           = _mm_mul_ps(c12_00,VV);
532             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
533             fvdw12           = _mm_mul_ps(c12_00,FF);
534             vvdw             = _mm_add_ps(vvdw12,vvdw6);
535             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
536
537             /* Update potential sum for this i atom from the interaction with this j atom. */
538             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
539             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
540
541             fscal            = fvdw;
542
543             fscal            = _mm_andnot_ps(dummy_mask,fscal);
544
545             /* Calculate temporary vectorial force */
546             tx               = _mm_mul_ps(fscal,dx00);
547             ty               = _mm_mul_ps(fscal,dy00);
548             tz               = _mm_mul_ps(fscal,dz00);
549
550             /* Update vectorial force */
551             fix0             = _mm_add_ps(fix0,tx);
552             fiy0             = _mm_add_ps(fiy0,ty);
553             fiz0             = _mm_add_ps(fiz0,tz);
554
555             fjx0             = _mm_add_ps(fjx0,tx);
556             fjy0             = _mm_add_ps(fjy0,ty);
557             fjz0             = _mm_add_ps(fjz0,tz);
558
559             /**************************
560              * CALCULATE INTERACTIONS *
561              **************************/
562
563             /* Compute parameters for interactions between i and j atoms */
564             qq10             = _mm_mul_ps(iq1,jq0);
565
566             /* REACTION-FIELD ELECTROSTATICS */
567             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
568             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
569
570             /* Update potential sum for this i atom from the interaction with this j atom. */
571             velec            = _mm_andnot_ps(dummy_mask,velec);
572             velecsum         = _mm_add_ps(velecsum,velec);
573
574             fscal            = felec;
575
576             fscal            = _mm_andnot_ps(dummy_mask,fscal);
577
578             /* Calculate temporary vectorial force */
579             tx               = _mm_mul_ps(fscal,dx10);
580             ty               = _mm_mul_ps(fscal,dy10);
581             tz               = _mm_mul_ps(fscal,dz10);
582
583             /* Update vectorial force */
584             fix1             = _mm_add_ps(fix1,tx);
585             fiy1             = _mm_add_ps(fiy1,ty);
586             fiz1             = _mm_add_ps(fiz1,tz);
587
588             fjx0             = _mm_add_ps(fjx0,tx);
589             fjy0             = _mm_add_ps(fjy0,ty);
590             fjz0             = _mm_add_ps(fjz0,tz);
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             /* Compute parameters for interactions between i and j atoms */
597             qq20             = _mm_mul_ps(iq2,jq0);
598
599             /* REACTION-FIELD ELECTROSTATICS */
600             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
601             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
602
603             /* Update potential sum for this i atom from the interaction with this j atom. */
604             velec            = _mm_andnot_ps(dummy_mask,velec);
605             velecsum         = _mm_add_ps(velecsum,velec);
606
607             fscal            = felec;
608
609             fscal            = _mm_andnot_ps(dummy_mask,fscal);
610
611             /* Calculate temporary vectorial force */
612             tx               = _mm_mul_ps(fscal,dx20);
613             ty               = _mm_mul_ps(fscal,dy20);
614             tz               = _mm_mul_ps(fscal,dz20);
615
616             /* Update vectorial force */
617             fix2             = _mm_add_ps(fix2,tx);
618             fiy2             = _mm_add_ps(fiy2,ty);
619             fiz2             = _mm_add_ps(fiz2,tz);
620
621             fjx0             = _mm_add_ps(fjx0,tx);
622             fjy0             = _mm_add_ps(fjy0,ty);
623             fjz0             = _mm_add_ps(fjz0,tz);
624
625             /**************************
626              * CALCULATE INTERACTIONS *
627              **************************/
628
629             /* Compute parameters for interactions between i and j atoms */
630             qq30             = _mm_mul_ps(iq3,jq0);
631
632             /* REACTION-FIELD ELECTROSTATICS */
633             velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
634             felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
635
636             /* Update potential sum for this i atom from the interaction with this j atom. */
637             velec            = _mm_andnot_ps(dummy_mask,velec);
638             velecsum         = _mm_add_ps(velecsum,velec);
639
640             fscal            = felec;
641
642             fscal            = _mm_andnot_ps(dummy_mask,fscal);
643
644             /* Calculate temporary vectorial force */
645             tx               = _mm_mul_ps(fscal,dx30);
646             ty               = _mm_mul_ps(fscal,dy30);
647             tz               = _mm_mul_ps(fscal,dz30);
648
649             /* Update vectorial force */
650             fix3             = _mm_add_ps(fix3,tx);
651             fiy3             = _mm_add_ps(fiy3,ty);
652             fiz3             = _mm_add_ps(fiz3,tz);
653
654             fjx0             = _mm_add_ps(fjx0,tx);
655             fjy0             = _mm_add_ps(fjy0,ty);
656             fjz0             = _mm_add_ps(fjz0,tz);
657
658             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
659             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
660             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
661             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
662
663             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
664
665             /* Inner loop uses 153 flops */
666         }
667
668         /* End of innermost loop */
669
670         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
671                                               f+i_coord_offset,fshift+i_shift_offset);
672
673         ggid                        = gid[iidx];
674         /* Update potential energies */
675         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
676         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
677
678         /* Increment number of inner iterations */
679         inneriter                  += j_index_end - j_index_start;
680
681         /* Outer loop uses 26 flops */
682     }
683
684     /* Increment number of outer iterations */
685     outeriter        += nri;
686
687     /* Update outer/inner flops */
688
689     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*153);
690 }
691 /*
692  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
693  * Electrostatics interaction: ReactionField
694  * VdW interaction:            CubicSplineTable
695  * Geometry:                   Water4-Particle
696  * Calculate force/pot:        Force
697  */
698 void
699 nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
700                     (t_nblist                    * gmx_restrict       nlist,
701                      rvec                        * gmx_restrict          xx,
702                      rvec                        * gmx_restrict          ff,
703                      t_forcerec                  * gmx_restrict          fr,
704                      t_mdatoms                   * gmx_restrict     mdatoms,
705                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
706                      t_nrnb                      * gmx_restrict        nrnb)
707 {
708     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
709      * just 0 for non-waters.
710      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
711      * jnr indices corresponding to data put in the four positions in the SIMD register.
712      */
713     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
714     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
715     int              jnrA,jnrB,jnrC,jnrD;
716     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
717     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
718     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
719     real             rcutoff_scalar;
720     real             *shiftvec,*fshift,*x,*f;
721     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
722     real             scratch[4*DIM];
723     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
724     int              vdwioffset0;
725     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
726     int              vdwioffset1;
727     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
728     int              vdwioffset2;
729     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
730     int              vdwioffset3;
731     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
732     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
733     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
734     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
735     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
736     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
737     __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
738     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
739     real             *charge;
740     int              nvdwtype;
741     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
742     int              *vdwtype;
743     real             *vdwparam;
744     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
745     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
746     __m128i          vfitab;
747     __m128i          ifour       = _mm_set1_epi32(4);
748     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
749     real             *vftab;
750     __m128           dummy_mask,cutoff_mask;
751     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
752     __m128           one     = _mm_set1_ps(1.0);
753     __m128           two     = _mm_set1_ps(2.0);
754     x                = xx[0];
755     f                = ff[0];
756
757     nri              = nlist->nri;
758     iinr             = nlist->iinr;
759     jindex           = nlist->jindex;
760     jjnr             = nlist->jjnr;
761     shiftidx         = nlist->shift;
762     gid              = nlist->gid;
763     shiftvec         = fr->shift_vec[0];
764     fshift           = fr->fshift[0];
765     facel            = _mm_set1_ps(fr->epsfac);
766     charge           = mdatoms->chargeA;
767     krf              = _mm_set1_ps(fr->ic->k_rf);
768     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
769     crf              = _mm_set1_ps(fr->ic->c_rf);
770     nvdwtype         = fr->ntype;
771     vdwparam         = fr->nbfp;
772     vdwtype          = mdatoms->typeA;
773
774     vftab            = kernel_data->table_vdw->data;
775     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
776
777     /* Setup water-specific parameters */
778     inr              = nlist->iinr[0];
779     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
780     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
781     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
782     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
783
784     /* Avoid stupid compiler warnings */
785     jnrA = jnrB = jnrC = jnrD = 0;
786     j_coord_offsetA = 0;
787     j_coord_offsetB = 0;
788     j_coord_offsetC = 0;
789     j_coord_offsetD = 0;
790
791     outeriter        = 0;
792     inneriter        = 0;
793
794     for(iidx=0;iidx<4*DIM;iidx++)
795     {
796         scratch[iidx] = 0.0;
797     }
798
799     /* Start outer loop over neighborlists */
800     for(iidx=0; iidx<nri; iidx++)
801     {
802         /* Load shift vector for this list */
803         i_shift_offset   = DIM*shiftidx[iidx];
804
805         /* Load limits for loop over neighbors */
806         j_index_start    = jindex[iidx];
807         j_index_end      = jindex[iidx+1];
808
809         /* Get outer coordinate index */
810         inr              = iinr[iidx];
811         i_coord_offset   = DIM*inr;
812
813         /* Load i particle coords and add shift vector */
814         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
815                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
816
817         fix0             = _mm_setzero_ps();
818         fiy0             = _mm_setzero_ps();
819         fiz0             = _mm_setzero_ps();
820         fix1             = _mm_setzero_ps();
821         fiy1             = _mm_setzero_ps();
822         fiz1             = _mm_setzero_ps();
823         fix2             = _mm_setzero_ps();
824         fiy2             = _mm_setzero_ps();
825         fiz2             = _mm_setzero_ps();
826         fix3             = _mm_setzero_ps();
827         fiy3             = _mm_setzero_ps();
828         fiz3             = _mm_setzero_ps();
829
830         /* Start inner kernel loop */
831         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
832         {
833
834             /* Get j neighbor index, and coordinate index */
835             jnrA             = jjnr[jidx];
836             jnrB             = jjnr[jidx+1];
837             jnrC             = jjnr[jidx+2];
838             jnrD             = jjnr[jidx+3];
839             j_coord_offsetA  = DIM*jnrA;
840             j_coord_offsetB  = DIM*jnrB;
841             j_coord_offsetC  = DIM*jnrC;
842             j_coord_offsetD  = DIM*jnrD;
843
844             /* load j atom coordinates */
845             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
846                                               x+j_coord_offsetC,x+j_coord_offsetD,
847                                               &jx0,&jy0,&jz0);
848
849             /* Calculate displacement vector */
850             dx00             = _mm_sub_ps(ix0,jx0);
851             dy00             = _mm_sub_ps(iy0,jy0);
852             dz00             = _mm_sub_ps(iz0,jz0);
853             dx10             = _mm_sub_ps(ix1,jx0);
854             dy10             = _mm_sub_ps(iy1,jy0);
855             dz10             = _mm_sub_ps(iz1,jz0);
856             dx20             = _mm_sub_ps(ix2,jx0);
857             dy20             = _mm_sub_ps(iy2,jy0);
858             dz20             = _mm_sub_ps(iz2,jz0);
859             dx30             = _mm_sub_ps(ix3,jx0);
860             dy30             = _mm_sub_ps(iy3,jy0);
861             dz30             = _mm_sub_ps(iz3,jz0);
862
863             /* Calculate squared distance and things based on it */
864             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
865             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
866             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
867             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
868
869             rinv00           = gmx_mm_invsqrt_ps(rsq00);
870             rinv10           = gmx_mm_invsqrt_ps(rsq10);
871             rinv20           = gmx_mm_invsqrt_ps(rsq20);
872             rinv30           = gmx_mm_invsqrt_ps(rsq30);
873
874             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
875             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
876             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
877
878             /* Load parameters for j particles */
879             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
880                                                               charge+jnrC+0,charge+jnrD+0);
881             vdwjidx0A        = 2*vdwtype[jnrA+0];
882             vdwjidx0B        = 2*vdwtype[jnrB+0];
883             vdwjidx0C        = 2*vdwtype[jnrC+0];
884             vdwjidx0D        = 2*vdwtype[jnrD+0];
885
886             fjx0             = _mm_setzero_ps();
887             fjy0             = _mm_setzero_ps();
888             fjz0             = _mm_setzero_ps();
889
890             /**************************
891              * CALCULATE INTERACTIONS *
892              **************************/
893
894             r00              = _mm_mul_ps(rsq00,rinv00);
895
896             /* Compute parameters for interactions between i and j atoms */
897             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
898                                          vdwparam+vdwioffset0+vdwjidx0B,
899                                          vdwparam+vdwioffset0+vdwjidx0C,
900                                          vdwparam+vdwioffset0+vdwjidx0D,
901                                          &c6_00,&c12_00);
902
903             /* Calculate table index by multiplying r with table scale and truncate to integer */
904             rt               = _mm_mul_ps(r00,vftabscale);
905             vfitab           = _mm_cvttps_epi32(rt);
906             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
907             vfitab           = _mm_slli_epi32(vfitab,3);
908
909             /* CUBIC SPLINE TABLE DISPERSION */
910             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
911             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
912             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
913             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
914             _MM_TRANSPOSE4_PS(Y,F,G,H);
915             Heps             = _mm_mul_ps(vfeps,H);
916             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
917             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
918             fvdw6            = _mm_mul_ps(c6_00,FF);
919
920             /* CUBIC SPLINE TABLE REPULSION */
921             vfitab           = _mm_add_epi32(vfitab,ifour);
922             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
923             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
924             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
925             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
926             _MM_TRANSPOSE4_PS(Y,F,G,H);
927             Heps             = _mm_mul_ps(vfeps,H);
928             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
929             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
930             fvdw12           = _mm_mul_ps(c12_00,FF);
931             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
932
933             fscal            = fvdw;
934
935             /* Calculate temporary vectorial force */
936             tx               = _mm_mul_ps(fscal,dx00);
937             ty               = _mm_mul_ps(fscal,dy00);
938             tz               = _mm_mul_ps(fscal,dz00);
939
940             /* Update vectorial force */
941             fix0             = _mm_add_ps(fix0,tx);
942             fiy0             = _mm_add_ps(fiy0,ty);
943             fiz0             = _mm_add_ps(fiz0,tz);
944
945             fjx0             = _mm_add_ps(fjx0,tx);
946             fjy0             = _mm_add_ps(fjy0,ty);
947             fjz0             = _mm_add_ps(fjz0,tz);
948
949             /**************************
950              * CALCULATE INTERACTIONS *
951              **************************/
952
953             /* Compute parameters for interactions between i and j atoms */
954             qq10             = _mm_mul_ps(iq1,jq0);
955
956             /* REACTION-FIELD ELECTROSTATICS */
957             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
958
959             fscal            = felec;
960
961             /* Calculate temporary vectorial force */
962             tx               = _mm_mul_ps(fscal,dx10);
963             ty               = _mm_mul_ps(fscal,dy10);
964             tz               = _mm_mul_ps(fscal,dz10);
965
966             /* Update vectorial force */
967             fix1             = _mm_add_ps(fix1,tx);
968             fiy1             = _mm_add_ps(fiy1,ty);
969             fiz1             = _mm_add_ps(fiz1,tz);
970
971             fjx0             = _mm_add_ps(fjx0,tx);
972             fjy0             = _mm_add_ps(fjy0,ty);
973             fjz0             = _mm_add_ps(fjz0,tz);
974
975             /**************************
976              * CALCULATE INTERACTIONS *
977              **************************/
978
979             /* Compute parameters for interactions between i and j atoms */
980             qq20             = _mm_mul_ps(iq2,jq0);
981
982             /* REACTION-FIELD ELECTROSTATICS */
983             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
984
985             fscal            = felec;
986
987             /* Calculate temporary vectorial force */
988             tx               = _mm_mul_ps(fscal,dx20);
989             ty               = _mm_mul_ps(fscal,dy20);
990             tz               = _mm_mul_ps(fscal,dz20);
991
992             /* Update vectorial force */
993             fix2             = _mm_add_ps(fix2,tx);
994             fiy2             = _mm_add_ps(fiy2,ty);
995             fiz2             = _mm_add_ps(fiz2,tz);
996
997             fjx0             = _mm_add_ps(fjx0,tx);
998             fjy0             = _mm_add_ps(fjy0,ty);
999             fjz0             = _mm_add_ps(fjz0,tz);
1000
1001             /**************************
1002              * CALCULATE INTERACTIONS *
1003              **************************/
1004
1005             /* Compute parameters for interactions between i and j atoms */
1006             qq30             = _mm_mul_ps(iq3,jq0);
1007
1008             /* REACTION-FIELD ELECTROSTATICS */
1009             felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1010
1011             fscal            = felec;
1012
1013             /* Calculate temporary vectorial force */
1014             tx               = _mm_mul_ps(fscal,dx30);
1015             ty               = _mm_mul_ps(fscal,dy30);
1016             tz               = _mm_mul_ps(fscal,dz30);
1017
1018             /* Update vectorial force */
1019             fix3             = _mm_add_ps(fix3,tx);
1020             fiy3             = _mm_add_ps(fiy3,ty);
1021             fiz3             = _mm_add_ps(fiz3,tz);
1022
1023             fjx0             = _mm_add_ps(fjx0,tx);
1024             fjy0             = _mm_add_ps(fjy0,ty);
1025             fjz0             = _mm_add_ps(fjz0,tz);
1026
1027             fjptrA             = f+j_coord_offsetA;
1028             fjptrB             = f+j_coord_offsetB;
1029             fjptrC             = f+j_coord_offsetC;
1030             fjptrD             = f+j_coord_offsetD;
1031
1032             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1033
1034             /* Inner loop uses 129 flops */
1035         }
1036
1037         if(jidx<j_index_end)
1038         {
1039
1040             /* Get j neighbor index, and coordinate index */
1041             jnrlistA         = jjnr[jidx];
1042             jnrlistB         = jjnr[jidx+1];
1043             jnrlistC         = jjnr[jidx+2];
1044             jnrlistD         = jjnr[jidx+3];
1045             /* Sign of each element will be negative for non-real atoms.
1046              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1047              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1048              */
1049             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1050             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1051             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1052             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1053             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1054             j_coord_offsetA  = DIM*jnrA;
1055             j_coord_offsetB  = DIM*jnrB;
1056             j_coord_offsetC  = DIM*jnrC;
1057             j_coord_offsetD  = DIM*jnrD;
1058
1059             /* load j atom coordinates */
1060             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1061                                               x+j_coord_offsetC,x+j_coord_offsetD,
1062                                               &jx0,&jy0,&jz0);
1063
1064             /* Calculate displacement vector */
1065             dx00             = _mm_sub_ps(ix0,jx0);
1066             dy00             = _mm_sub_ps(iy0,jy0);
1067             dz00             = _mm_sub_ps(iz0,jz0);
1068             dx10             = _mm_sub_ps(ix1,jx0);
1069             dy10             = _mm_sub_ps(iy1,jy0);
1070             dz10             = _mm_sub_ps(iz1,jz0);
1071             dx20             = _mm_sub_ps(ix2,jx0);
1072             dy20             = _mm_sub_ps(iy2,jy0);
1073             dz20             = _mm_sub_ps(iz2,jz0);
1074             dx30             = _mm_sub_ps(ix3,jx0);
1075             dy30             = _mm_sub_ps(iy3,jy0);
1076             dz30             = _mm_sub_ps(iz3,jz0);
1077
1078             /* Calculate squared distance and things based on it */
1079             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1080             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1081             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1082             rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1083
1084             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1085             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1086             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1087             rinv30           = gmx_mm_invsqrt_ps(rsq30);
1088
1089             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1090             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1091             rinvsq30         = _mm_mul_ps(rinv30,rinv30);
1092
1093             /* Load parameters for j particles */
1094             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1095                                                               charge+jnrC+0,charge+jnrD+0);
1096             vdwjidx0A        = 2*vdwtype[jnrA+0];
1097             vdwjidx0B        = 2*vdwtype[jnrB+0];
1098             vdwjidx0C        = 2*vdwtype[jnrC+0];
1099             vdwjidx0D        = 2*vdwtype[jnrD+0];
1100
1101             fjx0             = _mm_setzero_ps();
1102             fjy0             = _mm_setzero_ps();
1103             fjz0             = _mm_setzero_ps();
1104
1105             /**************************
1106              * CALCULATE INTERACTIONS *
1107              **************************/
1108
1109             r00              = _mm_mul_ps(rsq00,rinv00);
1110             r00              = _mm_andnot_ps(dummy_mask,r00);
1111
1112             /* Compute parameters for interactions between i and j atoms */
1113             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1114                                          vdwparam+vdwioffset0+vdwjidx0B,
1115                                          vdwparam+vdwioffset0+vdwjidx0C,
1116                                          vdwparam+vdwioffset0+vdwjidx0D,
1117                                          &c6_00,&c12_00);
1118
1119             /* Calculate table index by multiplying r with table scale and truncate to integer */
1120             rt               = _mm_mul_ps(r00,vftabscale);
1121             vfitab           = _mm_cvttps_epi32(rt);
1122             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1123             vfitab           = _mm_slli_epi32(vfitab,3);
1124
1125             /* CUBIC SPLINE TABLE DISPERSION */
1126             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1127             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1128             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1129             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1130             _MM_TRANSPOSE4_PS(Y,F,G,H);
1131             Heps             = _mm_mul_ps(vfeps,H);
1132             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1133             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1134             fvdw6            = _mm_mul_ps(c6_00,FF);
1135
1136             /* CUBIC SPLINE TABLE REPULSION */
1137             vfitab           = _mm_add_epi32(vfitab,ifour);
1138             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1139             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1140             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1141             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1142             _MM_TRANSPOSE4_PS(Y,F,G,H);
1143             Heps             = _mm_mul_ps(vfeps,H);
1144             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1145             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1146             fvdw12           = _mm_mul_ps(c12_00,FF);
1147             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1148
1149             fscal            = fvdw;
1150
1151             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1152
1153             /* Calculate temporary vectorial force */
1154             tx               = _mm_mul_ps(fscal,dx00);
1155             ty               = _mm_mul_ps(fscal,dy00);
1156             tz               = _mm_mul_ps(fscal,dz00);
1157
1158             /* Update vectorial force */
1159             fix0             = _mm_add_ps(fix0,tx);
1160             fiy0             = _mm_add_ps(fiy0,ty);
1161             fiz0             = _mm_add_ps(fiz0,tz);
1162
1163             fjx0             = _mm_add_ps(fjx0,tx);
1164             fjy0             = _mm_add_ps(fjy0,ty);
1165             fjz0             = _mm_add_ps(fjz0,tz);
1166
1167             /**************************
1168              * CALCULATE INTERACTIONS *
1169              **************************/
1170
1171             /* Compute parameters for interactions between i and j atoms */
1172             qq10             = _mm_mul_ps(iq1,jq0);
1173
1174             /* REACTION-FIELD ELECTROSTATICS */
1175             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1176
1177             fscal            = felec;
1178
1179             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1180
1181             /* Calculate temporary vectorial force */
1182             tx               = _mm_mul_ps(fscal,dx10);
1183             ty               = _mm_mul_ps(fscal,dy10);
1184             tz               = _mm_mul_ps(fscal,dz10);
1185
1186             /* Update vectorial force */
1187             fix1             = _mm_add_ps(fix1,tx);
1188             fiy1             = _mm_add_ps(fiy1,ty);
1189             fiz1             = _mm_add_ps(fiz1,tz);
1190
1191             fjx0             = _mm_add_ps(fjx0,tx);
1192             fjy0             = _mm_add_ps(fjy0,ty);
1193             fjz0             = _mm_add_ps(fjz0,tz);
1194
1195             /**************************
1196              * CALCULATE INTERACTIONS *
1197              **************************/
1198
1199             /* Compute parameters for interactions between i and j atoms */
1200             qq20             = _mm_mul_ps(iq2,jq0);
1201
1202             /* REACTION-FIELD ELECTROSTATICS */
1203             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1204
1205             fscal            = felec;
1206
1207             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1208
1209             /* Calculate temporary vectorial force */
1210             tx               = _mm_mul_ps(fscal,dx20);
1211             ty               = _mm_mul_ps(fscal,dy20);
1212             tz               = _mm_mul_ps(fscal,dz20);
1213
1214             /* Update vectorial force */
1215             fix2             = _mm_add_ps(fix2,tx);
1216             fiy2             = _mm_add_ps(fiy2,ty);
1217             fiz2             = _mm_add_ps(fiz2,tz);
1218
1219             fjx0             = _mm_add_ps(fjx0,tx);
1220             fjy0             = _mm_add_ps(fjy0,ty);
1221             fjz0             = _mm_add_ps(fjz0,tz);
1222
1223             /**************************
1224              * CALCULATE INTERACTIONS *
1225              **************************/
1226
1227             /* Compute parameters for interactions between i and j atoms */
1228             qq30             = _mm_mul_ps(iq3,jq0);
1229
1230             /* REACTION-FIELD ELECTROSTATICS */
1231             felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1232
1233             fscal            = felec;
1234
1235             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1236
1237             /* Calculate temporary vectorial force */
1238             tx               = _mm_mul_ps(fscal,dx30);
1239             ty               = _mm_mul_ps(fscal,dy30);
1240             tz               = _mm_mul_ps(fscal,dz30);
1241
1242             /* Update vectorial force */
1243             fix3             = _mm_add_ps(fix3,tx);
1244             fiy3             = _mm_add_ps(fiy3,ty);
1245             fiz3             = _mm_add_ps(fiz3,tz);
1246
1247             fjx0             = _mm_add_ps(fjx0,tx);
1248             fjy0             = _mm_add_ps(fjy0,ty);
1249             fjz0             = _mm_add_ps(fjz0,tz);
1250
1251             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1252             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1253             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1254             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1255
1256             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1257
1258             /* Inner loop uses 130 flops */
1259         }
1260
1261         /* End of innermost loop */
1262
1263         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1264                                               f+i_coord_offset,fshift+i_shift_offset);
1265
1266         /* Increment number of inner iterations */
1267         inneriter                  += j_index_end - j_index_start;
1268
1269         /* Outer loop uses 24 flops */
1270     }
1271
1272     /* Increment number of outer iterations */
1273     outeriter        += nri;
1274
1275     /* Update outer/inner flops */
1276
1277     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*130);
1278 }