Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse4_1_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Particle
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse4_1_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
97     real             *charge;
98     int              nvdwtype;
99     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
100     int              *vdwtype;
101     real             *vdwparam;
102     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
103     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
104     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
105     real             rswitch_scalar,d_scalar;
106     __m128           dummy_mask,cutoff_mask;
107     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
108     __m128           one     = _mm_set1_ps(1.0);
109     __m128           two     = _mm_set1_ps(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm_set1_ps(fr->epsfac);
122     charge           = mdatoms->chargeA;
123     krf              = _mm_set1_ps(fr->ic->k_rf);
124     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
125     crf              = _mm_set1_ps(fr->ic->c_rf);
126     nvdwtype         = fr->ntype;
127     vdwparam         = fr->nbfp;
128     vdwtype          = mdatoms->typeA;
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
133     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
136
137     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
138     rcutoff_scalar   = fr->rcoulomb;
139     rcutoff          = _mm_set1_ps(rcutoff_scalar);
140     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
141
142     rswitch_scalar   = fr->rvdw_switch;
143     rswitch          = _mm_set1_ps(rswitch_scalar);
144     /* Setup switch parameters */
145     d_scalar         = rcutoff_scalar-rswitch_scalar;
146     d                = _mm_set1_ps(d_scalar);
147     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
148     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
149     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
150     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
151     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
152     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
153
154     /* Avoid stupid compiler warnings */
155     jnrA = jnrB = jnrC = jnrD = 0;
156     j_coord_offsetA = 0;
157     j_coord_offsetB = 0;
158     j_coord_offsetC = 0;
159     j_coord_offsetD = 0;
160
161     outeriter        = 0;
162     inneriter        = 0;
163
164     for(iidx=0;iidx<4*DIM;iidx++)
165     {
166         scratch[iidx] = 0.0;
167     }
168
169     /* Start outer loop over neighborlists */
170     for(iidx=0; iidx<nri; iidx++)
171     {
172         /* Load shift vector for this list */
173         i_shift_offset   = DIM*shiftidx[iidx];
174
175         /* Load limits for loop over neighbors */
176         j_index_start    = jindex[iidx];
177         j_index_end      = jindex[iidx+1];
178
179         /* Get outer coordinate index */
180         inr              = iinr[iidx];
181         i_coord_offset   = DIM*inr;
182
183         /* Load i particle coords and add shift vector */
184         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
185                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
186
187         fix0             = _mm_setzero_ps();
188         fiy0             = _mm_setzero_ps();
189         fiz0             = _mm_setzero_ps();
190         fix1             = _mm_setzero_ps();
191         fiy1             = _mm_setzero_ps();
192         fiz1             = _mm_setzero_ps();
193         fix2             = _mm_setzero_ps();
194         fiy2             = _mm_setzero_ps();
195         fiz2             = _mm_setzero_ps();
196
197         /* Reset potential sums */
198         velecsum         = _mm_setzero_ps();
199         vvdwsum          = _mm_setzero_ps();
200
201         /* Start inner kernel loop */
202         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
203         {
204
205             /* Get j neighbor index, and coordinate index */
206             jnrA             = jjnr[jidx];
207             jnrB             = jjnr[jidx+1];
208             jnrC             = jjnr[jidx+2];
209             jnrD             = jjnr[jidx+3];
210             j_coord_offsetA  = DIM*jnrA;
211             j_coord_offsetB  = DIM*jnrB;
212             j_coord_offsetC  = DIM*jnrC;
213             j_coord_offsetD  = DIM*jnrD;
214
215             /* load j atom coordinates */
216             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
217                                               x+j_coord_offsetC,x+j_coord_offsetD,
218                                               &jx0,&jy0,&jz0);
219
220             /* Calculate displacement vector */
221             dx00             = _mm_sub_ps(ix0,jx0);
222             dy00             = _mm_sub_ps(iy0,jy0);
223             dz00             = _mm_sub_ps(iz0,jz0);
224             dx10             = _mm_sub_ps(ix1,jx0);
225             dy10             = _mm_sub_ps(iy1,jy0);
226             dz10             = _mm_sub_ps(iz1,jz0);
227             dx20             = _mm_sub_ps(ix2,jx0);
228             dy20             = _mm_sub_ps(iy2,jy0);
229             dz20             = _mm_sub_ps(iz2,jz0);
230
231             /* Calculate squared distance and things based on it */
232             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
233             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
234             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
235
236             rinv00           = gmx_mm_invsqrt_ps(rsq00);
237             rinv10           = gmx_mm_invsqrt_ps(rsq10);
238             rinv20           = gmx_mm_invsqrt_ps(rsq20);
239
240             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
241             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
242             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
243
244             /* Load parameters for j particles */
245             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
246                                                               charge+jnrC+0,charge+jnrD+0);
247             vdwjidx0A        = 2*vdwtype[jnrA+0];
248             vdwjidx0B        = 2*vdwtype[jnrB+0];
249             vdwjidx0C        = 2*vdwtype[jnrC+0];
250             vdwjidx0D        = 2*vdwtype[jnrD+0];
251
252             fjx0             = _mm_setzero_ps();
253             fjy0             = _mm_setzero_ps();
254             fjz0             = _mm_setzero_ps();
255
256             /**************************
257              * CALCULATE INTERACTIONS *
258              **************************/
259
260             if (gmx_mm_any_lt(rsq00,rcutoff2))
261             {
262
263             r00              = _mm_mul_ps(rsq00,rinv00);
264
265             /* Compute parameters for interactions between i and j atoms */
266             qq00             = _mm_mul_ps(iq0,jq0);
267             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
268                                          vdwparam+vdwioffset0+vdwjidx0B,
269                                          vdwparam+vdwioffset0+vdwjidx0C,
270                                          vdwparam+vdwioffset0+vdwjidx0D,
271                                          &c6_00,&c12_00);
272
273             /* REACTION-FIELD ELECTROSTATICS */
274             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
275             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
276
277             /* LENNARD-JONES DISPERSION/REPULSION */
278
279             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
280             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
281             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
282             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
283             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
284
285             d                = _mm_sub_ps(r00,rswitch);
286             d                = _mm_max_ps(d,_mm_setzero_ps());
287             d2               = _mm_mul_ps(d,d);
288             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
289
290             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
291
292             /* Evaluate switch function */
293             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
294             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
295             vvdw             = _mm_mul_ps(vvdw,sw);
296             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
297
298             /* Update potential sum for this i atom from the interaction with this j atom. */
299             velec            = _mm_and_ps(velec,cutoff_mask);
300             velecsum         = _mm_add_ps(velecsum,velec);
301             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
302             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
303
304             fscal            = _mm_add_ps(felec,fvdw);
305
306             fscal            = _mm_and_ps(fscal,cutoff_mask);
307
308             /* Calculate temporary vectorial force */
309             tx               = _mm_mul_ps(fscal,dx00);
310             ty               = _mm_mul_ps(fscal,dy00);
311             tz               = _mm_mul_ps(fscal,dz00);
312
313             /* Update vectorial force */
314             fix0             = _mm_add_ps(fix0,tx);
315             fiy0             = _mm_add_ps(fiy0,ty);
316             fiz0             = _mm_add_ps(fiz0,tz);
317
318             fjx0             = _mm_add_ps(fjx0,tx);
319             fjy0             = _mm_add_ps(fjy0,ty);
320             fjz0             = _mm_add_ps(fjz0,tz);
321
322             }
323
324             /**************************
325              * CALCULATE INTERACTIONS *
326              **************************/
327
328             if (gmx_mm_any_lt(rsq10,rcutoff2))
329             {
330
331             /* Compute parameters for interactions between i and j atoms */
332             qq10             = _mm_mul_ps(iq1,jq0);
333
334             /* REACTION-FIELD ELECTROSTATICS */
335             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
336             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
337
338             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
339
340             /* Update potential sum for this i atom from the interaction with this j atom. */
341             velec            = _mm_and_ps(velec,cutoff_mask);
342             velecsum         = _mm_add_ps(velecsum,velec);
343
344             fscal            = felec;
345
346             fscal            = _mm_and_ps(fscal,cutoff_mask);
347
348             /* Calculate temporary vectorial force */
349             tx               = _mm_mul_ps(fscal,dx10);
350             ty               = _mm_mul_ps(fscal,dy10);
351             tz               = _mm_mul_ps(fscal,dz10);
352
353             /* Update vectorial force */
354             fix1             = _mm_add_ps(fix1,tx);
355             fiy1             = _mm_add_ps(fiy1,ty);
356             fiz1             = _mm_add_ps(fiz1,tz);
357
358             fjx0             = _mm_add_ps(fjx0,tx);
359             fjy0             = _mm_add_ps(fjy0,ty);
360             fjz0             = _mm_add_ps(fjz0,tz);
361
362             }
363
364             /**************************
365              * CALCULATE INTERACTIONS *
366              **************************/
367
368             if (gmx_mm_any_lt(rsq20,rcutoff2))
369             {
370
371             /* Compute parameters for interactions between i and j atoms */
372             qq20             = _mm_mul_ps(iq2,jq0);
373
374             /* REACTION-FIELD ELECTROSTATICS */
375             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
376             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
377
378             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
379
380             /* Update potential sum for this i atom from the interaction with this j atom. */
381             velec            = _mm_and_ps(velec,cutoff_mask);
382             velecsum         = _mm_add_ps(velecsum,velec);
383
384             fscal            = felec;
385
386             fscal            = _mm_and_ps(fscal,cutoff_mask);
387
388             /* Calculate temporary vectorial force */
389             tx               = _mm_mul_ps(fscal,dx20);
390             ty               = _mm_mul_ps(fscal,dy20);
391             tz               = _mm_mul_ps(fscal,dz20);
392
393             /* Update vectorial force */
394             fix2             = _mm_add_ps(fix2,tx);
395             fiy2             = _mm_add_ps(fiy2,ty);
396             fiz2             = _mm_add_ps(fiz2,tz);
397
398             fjx0             = _mm_add_ps(fjx0,tx);
399             fjy0             = _mm_add_ps(fjy0,ty);
400             fjz0             = _mm_add_ps(fjz0,tz);
401
402             }
403
404             fjptrA             = f+j_coord_offsetA;
405             fjptrB             = f+j_coord_offsetB;
406             fjptrC             = f+j_coord_offsetC;
407             fjptrD             = f+j_coord_offsetD;
408
409             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
410
411             /* Inner loop uses 142 flops */
412         }
413
414         if(jidx<j_index_end)
415         {
416
417             /* Get j neighbor index, and coordinate index */
418             jnrlistA         = jjnr[jidx];
419             jnrlistB         = jjnr[jidx+1];
420             jnrlistC         = jjnr[jidx+2];
421             jnrlistD         = jjnr[jidx+3];
422             /* Sign of each element will be negative for non-real atoms.
423              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
424              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
425              */
426             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
427             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
428             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
429             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
430             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
431             j_coord_offsetA  = DIM*jnrA;
432             j_coord_offsetB  = DIM*jnrB;
433             j_coord_offsetC  = DIM*jnrC;
434             j_coord_offsetD  = DIM*jnrD;
435
436             /* load j atom coordinates */
437             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
438                                               x+j_coord_offsetC,x+j_coord_offsetD,
439                                               &jx0,&jy0,&jz0);
440
441             /* Calculate displacement vector */
442             dx00             = _mm_sub_ps(ix0,jx0);
443             dy00             = _mm_sub_ps(iy0,jy0);
444             dz00             = _mm_sub_ps(iz0,jz0);
445             dx10             = _mm_sub_ps(ix1,jx0);
446             dy10             = _mm_sub_ps(iy1,jy0);
447             dz10             = _mm_sub_ps(iz1,jz0);
448             dx20             = _mm_sub_ps(ix2,jx0);
449             dy20             = _mm_sub_ps(iy2,jy0);
450             dz20             = _mm_sub_ps(iz2,jz0);
451
452             /* Calculate squared distance and things based on it */
453             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
454             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
455             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
456
457             rinv00           = gmx_mm_invsqrt_ps(rsq00);
458             rinv10           = gmx_mm_invsqrt_ps(rsq10);
459             rinv20           = gmx_mm_invsqrt_ps(rsq20);
460
461             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
462             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
463             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
464
465             /* Load parameters for j particles */
466             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
467                                                               charge+jnrC+0,charge+jnrD+0);
468             vdwjidx0A        = 2*vdwtype[jnrA+0];
469             vdwjidx0B        = 2*vdwtype[jnrB+0];
470             vdwjidx0C        = 2*vdwtype[jnrC+0];
471             vdwjidx0D        = 2*vdwtype[jnrD+0];
472
473             fjx0             = _mm_setzero_ps();
474             fjy0             = _mm_setzero_ps();
475             fjz0             = _mm_setzero_ps();
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             if (gmx_mm_any_lt(rsq00,rcutoff2))
482             {
483
484             r00              = _mm_mul_ps(rsq00,rinv00);
485             r00              = _mm_andnot_ps(dummy_mask,r00);
486
487             /* Compute parameters for interactions between i and j atoms */
488             qq00             = _mm_mul_ps(iq0,jq0);
489             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
490                                          vdwparam+vdwioffset0+vdwjidx0B,
491                                          vdwparam+vdwioffset0+vdwjidx0C,
492                                          vdwparam+vdwioffset0+vdwjidx0D,
493                                          &c6_00,&c12_00);
494
495             /* REACTION-FIELD ELECTROSTATICS */
496             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
497             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
498
499             /* LENNARD-JONES DISPERSION/REPULSION */
500
501             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
502             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
503             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
504             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
505             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
506
507             d                = _mm_sub_ps(r00,rswitch);
508             d                = _mm_max_ps(d,_mm_setzero_ps());
509             d2               = _mm_mul_ps(d,d);
510             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
511
512             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
513
514             /* Evaluate switch function */
515             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
516             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
517             vvdw             = _mm_mul_ps(vvdw,sw);
518             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
519
520             /* Update potential sum for this i atom from the interaction with this j atom. */
521             velec            = _mm_and_ps(velec,cutoff_mask);
522             velec            = _mm_andnot_ps(dummy_mask,velec);
523             velecsum         = _mm_add_ps(velecsum,velec);
524             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
525             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
526             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
527
528             fscal            = _mm_add_ps(felec,fvdw);
529
530             fscal            = _mm_and_ps(fscal,cutoff_mask);
531
532             fscal            = _mm_andnot_ps(dummy_mask,fscal);
533
534             /* Calculate temporary vectorial force */
535             tx               = _mm_mul_ps(fscal,dx00);
536             ty               = _mm_mul_ps(fscal,dy00);
537             tz               = _mm_mul_ps(fscal,dz00);
538
539             /* Update vectorial force */
540             fix0             = _mm_add_ps(fix0,tx);
541             fiy0             = _mm_add_ps(fiy0,ty);
542             fiz0             = _mm_add_ps(fiz0,tz);
543
544             fjx0             = _mm_add_ps(fjx0,tx);
545             fjy0             = _mm_add_ps(fjy0,ty);
546             fjz0             = _mm_add_ps(fjz0,tz);
547
548             }
549
550             /**************************
551              * CALCULATE INTERACTIONS *
552              **************************/
553
554             if (gmx_mm_any_lt(rsq10,rcutoff2))
555             {
556
557             /* Compute parameters for interactions between i and j atoms */
558             qq10             = _mm_mul_ps(iq1,jq0);
559
560             /* REACTION-FIELD ELECTROSTATICS */
561             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
562             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
563
564             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velec            = _mm_and_ps(velec,cutoff_mask);
568             velec            = _mm_andnot_ps(dummy_mask,velec);
569             velecsum         = _mm_add_ps(velecsum,velec);
570
571             fscal            = felec;
572
573             fscal            = _mm_and_ps(fscal,cutoff_mask);
574
575             fscal            = _mm_andnot_ps(dummy_mask,fscal);
576
577             /* Calculate temporary vectorial force */
578             tx               = _mm_mul_ps(fscal,dx10);
579             ty               = _mm_mul_ps(fscal,dy10);
580             tz               = _mm_mul_ps(fscal,dz10);
581
582             /* Update vectorial force */
583             fix1             = _mm_add_ps(fix1,tx);
584             fiy1             = _mm_add_ps(fiy1,ty);
585             fiz1             = _mm_add_ps(fiz1,tz);
586
587             fjx0             = _mm_add_ps(fjx0,tx);
588             fjy0             = _mm_add_ps(fjy0,ty);
589             fjz0             = _mm_add_ps(fjz0,tz);
590
591             }
592
593             /**************************
594              * CALCULATE INTERACTIONS *
595              **************************/
596
597             if (gmx_mm_any_lt(rsq20,rcutoff2))
598             {
599
600             /* Compute parameters for interactions between i and j atoms */
601             qq20             = _mm_mul_ps(iq2,jq0);
602
603             /* REACTION-FIELD ELECTROSTATICS */
604             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
605             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
606
607             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
608
609             /* Update potential sum for this i atom from the interaction with this j atom. */
610             velec            = _mm_and_ps(velec,cutoff_mask);
611             velec            = _mm_andnot_ps(dummy_mask,velec);
612             velecsum         = _mm_add_ps(velecsum,velec);
613
614             fscal            = felec;
615
616             fscal            = _mm_and_ps(fscal,cutoff_mask);
617
618             fscal            = _mm_andnot_ps(dummy_mask,fscal);
619
620             /* Calculate temporary vectorial force */
621             tx               = _mm_mul_ps(fscal,dx20);
622             ty               = _mm_mul_ps(fscal,dy20);
623             tz               = _mm_mul_ps(fscal,dz20);
624
625             /* Update vectorial force */
626             fix2             = _mm_add_ps(fix2,tx);
627             fiy2             = _mm_add_ps(fiy2,ty);
628             fiz2             = _mm_add_ps(fiz2,tz);
629
630             fjx0             = _mm_add_ps(fjx0,tx);
631             fjy0             = _mm_add_ps(fjy0,ty);
632             fjz0             = _mm_add_ps(fjz0,tz);
633
634             }
635
636             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
637             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
638             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
639             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
640
641             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
642
643             /* Inner loop uses 143 flops */
644         }
645
646         /* End of innermost loop */
647
648         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
649                                               f+i_coord_offset,fshift+i_shift_offset);
650
651         ggid                        = gid[iidx];
652         /* Update potential energies */
653         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
654         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
655
656         /* Increment number of inner iterations */
657         inneriter                  += j_index_end - j_index_start;
658
659         /* Outer loop uses 20 flops */
660     }
661
662     /* Increment number of outer iterations */
663     outeriter        += nri;
664
665     /* Update outer/inner flops */
666
667     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*143);
668 }
669 /*
670  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
671  * Electrostatics interaction: ReactionField
672  * VdW interaction:            LennardJones
673  * Geometry:                   Water3-Particle
674  * Calculate force/pot:        Force
675  */
676 void
677 nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
678                     (t_nblist                    * gmx_restrict       nlist,
679                      rvec                        * gmx_restrict          xx,
680                      rvec                        * gmx_restrict          ff,
681                      t_forcerec                  * gmx_restrict          fr,
682                      t_mdatoms                   * gmx_restrict     mdatoms,
683                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
684                      t_nrnb                      * gmx_restrict        nrnb)
685 {
686     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
687      * just 0 for non-waters.
688      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
689      * jnr indices corresponding to data put in the four positions in the SIMD register.
690      */
691     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
692     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
693     int              jnrA,jnrB,jnrC,jnrD;
694     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
695     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
696     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
697     real             rcutoff_scalar;
698     real             *shiftvec,*fshift,*x,*f;
699     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
700     real             scratch[4*DIM];
701     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
702     int              vdwioffset0;
703     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
704     int              vdwioffset1;
705     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
706     int              vdwioffset2;
707     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
708     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
709     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
710     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
711     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
712     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
713     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
714     real             *charge;
715     int              nvdwtype;
716     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
717     int              *vdwtype;
718     real             *vdwparam;
719     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
720     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
721     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
722     real             rswitch_scalar,d_scalar;
723     __m128           dummy_mask,cutoff_mask;
724     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
725     __m128           one     = _mm_set1_ps(1.0);
726     __m128           two     = _mm_set1_ps(2.0);
727     x                = xx[0];
728     f                = ff[0];
729
730     nri              = nlist->nri;
731     iinr             = nlist->iinr;
732     jindex           = nlist->jindex;
733     jjnr             = nlist->jjnr;
734     shiftidx         = nlist->shift;
735     gid              = nlist->gid;
736     shiftvec         = fr->shift_vec[0];
737     fshift           = fr->fshift[0];
738     facel            = _mm_set1_ps(fr->epsfac);
739     charge           = mdatoms->chargeA;
740     krf              = _mm_set1_ps(fr->ic->k_rf);
741     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
742     crf              = _mm_set1_ps(fr->ic->c_rf);
743     nvdwtype         = fr->ntype;
744     vdwparam         = fr->nbfp;
745     vdwtype          = mdatoms->typeA;
746
747     /* Setup water-specific parameters */
748     inr              = nlist->iinr[0];
749     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
750     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
751     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
752     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
753
754     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
755     rcutoff_scalar   = fr->rcoulomb;
756     rcutoff          = _mm_set1_ps(rcutoff_scalar);
757     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
758
759     rswitch_scalar   = fr->rvdw_switch;
760     rswitch          = _mm_set1_ps(rswitch_scalar);
761     /* Setup switch parameters */
762     d_scalar         = rcutoff_scalar-rswitch_scalar;
763     d                = _mm_set1_ps(d_scalar);
764     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
765     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
766     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
767     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
768     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
769     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
770
771     /* Avoid stupid compiler warnings */
772     jnrA = jnrB = jnrC = jnrD = 0;
773     j_coord_offsetA = 0;
774     j_coord_offsetB = 0;
775     j_coord_offsetC = 0;
776     j_coord_offsetD = 0;
777
778     outeriter        = 0;
779     inneriter        = 0;
780
781     for(iidx=0;iidx<4*DIM;iidx++)
782     {
783         scratch[iidx] = 0.0;
784     }
785
786     /* Start outer loop over neighborlists */
787     for(iidx=0; iidx<nri; iidx++)
788     {
789         /* Load shift vector for this list */
790         i_shift_offset   = DIM*shiftidx[iidx];
791
792         /* Load limits for loop over neighbors */
793         j_index_start    = jindex[iidx];
794         j_index_end      = jindex[iidx+1];
795
796         /* Get outer coordinate index */
797         inr              = iinr[iidx];
798         i_coord_offset   = DIM*inr;
799
800         /* Load i particle coords and add shift vector */
801         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
802                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
803
804         fix0             = _mm_setzero_ps();
805         fiy0             = _mm_setzero_ps();
806         fiz0             = _mm_setzero_ps();
807         fix1             = _mm_setzero_ps();
808         fiy1             = _mm_setzero_ps();
809         fiz1             = _mm_setzero_ps();
810         fix2             = _mm_setzero_ps();
811         fiy2             = _mm_setzero_ps();
812         fiz2             = _mm_setzero_ps();
813
814         /* Start inner kernel loop */
815         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
816         {
817
818             /* Get j neighbor index, and coordinate index */
819             jnrA             = jjnr[jidx];
820             jnrB             = jjnr[jidx+1];
821             jnrC             = jjnr[jidx+2];
822             jnrD             = jjnr[jidx+3];
823             j_coord_offsetA  = DIM*jnrA;
824             j_coord_offsetB  = DIM*jnrB;
825             j_coord_offsetC  = DIM*jnrC;
826             j_coord_offsetD  = DIM*jnrD;
827
828             /* load j atom coordinates */
829             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
830                                               x+j_coord_offsetC,x+j_coord_offsetD,
831                                               &jx0,&jy0,&jz0);
832
833             /* Calculate displacement vector */
834             dx00             = _mm_sub_ps(ix0,jx0);
835             dy00             = _mm_sub_ps(iy0,jy0);
836             dz00             = _mm_sub_ps(iz0,jz0);
837             dx10             = _mm_sub_ps(ix1,jx0);
838             dy10             = _mm_sub_ps(iy1,jy0);
839             dz10             = _mm_sub_ps(iz1,jz0);
840             dx20             = _mm_sub_ps(ix2,jx0);
841             dy20             = _mm_sub_ps(iy2,jy0);
842             dz20             = _mm_sub_ps(iz2,jz0);
843
844             /* Calculate squared distance and things based on it */
845             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
846             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
847             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
848
849             rinv00           = gmx_mm_invsqrt_ps(rsq00);
850             rinv10           = gmx_mm_invsqrt_ps(rsq10);
851             rinv20           = gmx_mm_invsqrt_ps(rsq20);
852
853             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
854             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
855             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
856
857             /* Load parameters for j particles */
858             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
859                                                               charge+jnrC+0,charge+jnrD+0);
860             vdwjidx0A        = 2*vdwtype[jnrA+0];
861             vdwjidx0B        = 2*vdwtype[jnrB+0];
862             vdwjidx0C        = 2*vdwtype[jnrC+0];
863             vdwjidx0D        = 2*vdwtype[jnrD+0];
864
865             fjx0             = _mm_setzero_ps();
866             fjy0             = _mm_setzero_ps();
867             fjz0             = _mm_setzero_ps();
868
869             /**************************
870              * CALCULATE INTERACTIONS *
871              **************************/
872
873             if (gmx_mm_any_lt(rsq00,rcutoff2))
874             {
875
876             r00              = _mm_mul_ps(rsq00,rinv00);
877
878             /* Compute parameters for interactions between i and j atoms */
879             qq00             = _mm_mul_ps(iq0,jq0);
880             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
881                                          vdwparam+vdwioffset0+vdwjidx0B,
882                                          vdwparam+vdwioffset0+vdwjidx0C,
883                                          vdwparam+vdwioffset0+vdwjidx0D,
884                                          &c6_00,&c12_00);
885
886             /* REACTION-FIELD ELECTROSTATICS */
887             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
888
889             /* LENNARD-JONES DISPERSION/REPULSION */
890
891             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
892             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
893             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
894             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
895             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
896
897             d                = _mm_sub_ps(r00,rswitch);
898             d                = _mm_max_ps(d,_mm_setzero_ps());
899             d2               = _mm_mul_ps(d,d);
900             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
901
902             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
903
904             /* Evaluate switch function */
905             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
906             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
907             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
908
909             fscal            = _mm_add_ps(felec,fvdw);
910
911             fscal            = _mm_and_ps(fscal,cutoff_mask);
912
913             /* Calculate temporary vectorial force */
914             tx               = _mm_mul_ps(fscal,dx00);
915             ty               = _mm_mul_ps(fscal,dy00);
916             tz               = _mm_mul_ps(fscal,dz00);
917
918             /* Update vectorial force */
919             fix0             = _mm_add_ps(fix0,tx);
920             fiy0             = _mm_add_ps(fiy0,ty);
921             fiz0             = _mm_add_ps(fiz0,tz);
922
923             fjx0             = _mm_add_ps(fjx0,tx);
924             fjy0             = _mm_add_ps(fjy0,ty);
925             fjz0             = _mm_add_ps(fjz0,tz);
926
927             }
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             if (gmx_mm_any_lt(rsq10,rcutoff2))
934             {
935
936             /* Compute parameters for interactions between i and j atoms */
937             qq10             = _mm_mul_ps(iq1,jq0);
938
939             /* REACTION-FIELD ELECTROSTATICS */
940             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
941
942             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
943
944             fscal            = felec;
945
946             fscal            = _mm_and_ps(fscal,cutoff_mask);
947
948             /* Calculate temporary vectorial force */
949             tx               = _mm_mul_ps(fscal,dx10);
950             ty               = _mm_mul_ps(fscal,dy10);
951             tz               = _mm_mul_ps(fscal,dz10);
952
953             /* Update vectorial force */
954             fix1             = _mm_add_ps(fix1,tx);
955             fiy1             = _mm_add_ps(fiy1,ty);
956             fiz1             = _mm_add_ps(fiz1,tz);
957
958             fjx0             = _mm_add_ps(fjx0,tx);
959             fjy0             = _mm_add_ps(fjy0,ty);
960             fjz0             = _mm_add_ps(fjz0,tz);
961
962             }
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             if (gmx_mm_any_lt(rsq20,rcutoff2))
969             {
970
971             /* Compute parameters for interactions between i and j atoms */
972             qq20             = _mm_mul_ps(iq2,jq0);
973
974             /* REACTION-FIELD ELECTROSTATICS */
975             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
976
977             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
978
979             fscal            = felec;
980
981             fscal            = _mm_and_ps(fscal,cutoff_mask);
982
983             /* Calculate temporary vectorial force */
984             tx               = _mm_mul_ps(fscal,dx20);
985             ty               = _mm_mul_ps(fscal,dy20);
986             tz               = _mm_mul_ps(fscal,dz20);
987
988             /* Update vectorial force */
989             fix2             = _mm_add_ps(fix2,tx);
990             fiy2             = _mm_add_ps(fiy2,ty);
991             fiz2             = _mm_add_ps(fiz2,tz);
992
993             fjx0             = _mm_add_ps(fjx0,tx);
994             fjy0             = _mm_add_ps(fjy0,ty);
995             fjz0             = _mm_add_ps(fjz0,tz);
996
997             }
998
999             fjptrA             = f+j_coord_offsetA;
1000             fjptrB             = f+j_coord_offsetB;
1001             fjptrC             = f+j_coord_offsetC;
1002             fjptrD             = f+j_coord_offsetD;
1003
1004             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1005
1006             /* Inner loop uses 121 flops */
1007         }
1008
1009         if(jidx<j_index_end)
1010         {
1011
1012             /* Get j neighbor index, and coordinate index */
1013             jnrlistA         = jjnr[jidx];
1014             jnrlistB         = jjnr[jidx+1];
1015             jnrlistC         = jjnr[jidx+2];
1016             jnrlistD         = jjnr[jidx+3];
1017             /* Sign of each element will be negative for non-real atoms.
1018              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1019              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1020              */
1021             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1022             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1023             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1024             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1025             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1026             j_coord_offsetA  = DIM*jnrA;
1027             j_coord_offsetB  = DIM*jnrB;
1028             j_coord_offsetC  = DIM*jnrC;
1029             j_coord_offsetD  = DIM*jnrD;
1030
1031             /* load j atom coordinates */
1032             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1033                                               x+j_coord_offsetC,x+j_coord_offsetD,
1034                                               &jx0,&jy0,&jz0);
1035
1036             /* Calculate displacement vector */
1037             dx00             = _mm_sub_ps(ix0,jx0);
1038             dy00             = _mm_sub_ps(iy0,jy0);
1039             dz00             = _mm_sub_ps(iz0,jz0);
1040             dx10             = _mm_sub_ps(ix1,jx0);
1041             dy10             = _mm_sub_ps(iy1,jy0);
1042             dz10             = _mm_sub_ps(iz1,jz0);
1043             dx20             = _mm_sub_ps(ix2,jx0);
1044             dy20             = _mm_sub_ps(iy2,jy0);
1045             dz20             = _mm_sub_ps(iz2,jz0);
1046
1047             /* Calculate squared distance and things based on it */
1048             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1049             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1050             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1051
1052             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1053             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1054             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1055
1056             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1057             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1058             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1059
1060             /* Load parameters for j particles */
1061             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1062                                                               charge+jnrC+0,charge+jnrD+0);
1063             vdwjidx0A        = 2*vdwtype[jnrA+0];
1064             vdwjidx0B        = 2*vdwtype[jnrB+0];
1065             vdwjidx0C        = 2*vdwtype[jnrC+0];
1066             vdwjidx0D        = 2*vdwtype[jnrD+0];
1067
1068             fjx0             = _mm_setzero_ps();
1069             fjy0             = _mm_setzero_ps();
1070             fjz0             = _mm_setzero_ps();
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             if (gmx_mm_any_lt(rsq00,rcutoff2))
1077             {
1078
1079             r00              = _mm_mul_ps(rsq00,rinv00);
1080             r00              = _mm_andnot_ps(dummy_mask,r00);
1081
1082             /* Compute parameters for interactions between i and j atoms */
1083             qq00             = _mm_mul_ps(iq0,jq0);
1084             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1085                                          vdwparam+vdwioffset0+vdwjidx0B,
1086                                          vdwparam+vdwioffset0+vdwjidx0C,
1087                                          vdwparam+vdwioffset0+vdwjidx0D,
1088                                          &c6_00,&c12_00);
1089
1090             /* REACTION-FIELD ELECTROSTATICS */
1091             felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1092
1093             /* LENNARD-JONES DISPERSION/REPULSION */
1094
1095             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1096             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1097             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1098             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1099             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1100
1101             d                = _mm_sub_ps(r00,rswitch);
1102             d                = _mm_max_ps(d,_mm_setzero_ps());
1103             d2               = _mm_mul_ps(d,d);
1104             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1105
1106             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1107
1108             /* Evaluate switch function */
1109             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1110             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1111             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1112
1113             fscal            = _mm_add_ps(felec,fvdw);
1114
1115             fscal            = _mm_and_ps(fscal,cutoff_mask);
1116
1117             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1118
1119             /* Calculate temporary vectorial force */
1120             tx               = _mm_mul_ps(fscal,dx00);
1121             ty               = _mm_mul_ps(fscal,dy00);
1122             tz               = _mm_mul_ps(fscal,dz00);
1123
1124             /* Update vectorial force */
1125             fix0             = _mm_add_ps(fix0,tx);
1126             fiy0             = _mm_add_ps(fiy0,ty);
1127             fiz0             = _mm_add_ps(fiz0,tz);
1128
1129             fjx0             = _mm_add_ps(fjx0,tx);
1130             fjy0             = _mm_add_ps(fjy0,ty);
1131             fjz0             = _mm_add_ps(fjz0,tz);
1132
1133             }
1134
1135             /**************************
1136              * CALCULATE INTERACTIONS *
1137              **************************/
1138
1139             if (gmx_mm_any_lt(rsq10,rcutoff2))
1140             {
1141
1142             /* Compute parameters for interactions between i and j atoms */
1143             qq10             = _mm_mul_ps(iq1,jq0);
1144
1145             /* REACTION-FIELD ELECTROSTATICS */
1146             felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1147
1148             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1149
1150             fscal            = felec;
1151
1152             fscal            = _mm_and_ps(fscal,cutoff_mask);
1153
1154             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1155
1156             /* Calculate temporary vectorial force */
1157             tx               = _mm_mul_ps(fscal,dx10);
1158             ty               = _mm_mul_ps(fscal,dy10);
1159             tz               = _mm_mul_ps(fscal,dz10);
1160
1161             /* Update vectorial force */
1162             fix1             = _mm_add_ps(fix1,tx);
1163             fiy1             = _mm_add_ps(fiy1,ty);
1164             fiz1             = _mm_add_ps(fiz1,tz);
1165
1166             fjx0             = _mm_add_ps(fjx0,tx);
1167             fjy0             = _mm_add_ps(fjy0,ty);
1168             fjz0             = _mm_add_ps(fjz0,tz);
1169
1170             }
1171
1172             /**************************
1173              * CALCULATE INTERACTIONS *
1174              **************************/
1175
1176             if (gmx_mm_any_lt(rsq20,rcutoff2))
1177             {
1178
1179             /* Compute parameters for interactions between i and j atoms */
1180             qq20             = _mm_mul_ps(iq2,jq0);
1181
1182             /* REACTION-FIELD ELECTROSTATICS */
1183             felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1184
1185             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1186
1187             fscal            = felec;
1188
1189             fscal            = _mm_and_ps(fscal,cutoff_mask);
1190
1191             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1192
1193             /* Calculate temporary vectorial force */
1194             tx               = _mm_mul_ps(fscal,dx20);
1195             ty               = _mm_mul_ps(fscal,dy20);
1196             tz               = _mm_mul_ps(fscal,dz20);
1197
1198             /* Update vectorial force */
1199             fix2             = _mm_add_ps(fix2,tx);
1200             fiy2             = _mm_add_ps(fiy2,ty);
1201             fiz2             = _mm_add_ps(fiz2,tz);
1202
1203             fjx0             = _mm_add_ps(fjx0,tx);
1204             fjy0             = _mm_add_ps(fjy0,ty);
1205             fjz0             = _mm_add_ps(fjz0,tz);
1206
1207             }
1208
1209             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1210             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1211             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1212             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1213
1214             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1215
1216             /* Inner loop uses 122 flops */
1217         }
1218
1219         /* End of innermost loop */
1220
1221         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1222                                               f+i_coord_offset,fshift+i_shift_offset);
1223
1224         /* Increment number of inner iterations */
1225         inneriter                  += j_index_end - j_index_start;
1226
1227         /* Outer loop uses 18 flops */
1228     }
1229
1230     /* Increment number of outer iterations */
1231     outeriter        += nri;
1232
1233     /* Update outer/inner flops */
1234
1235     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*122);
1236 }