2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
86 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
87 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
88 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
89 real rswitch_scalar,d_scalar;
90 __m128 dummy_mask,cutoff_mask;
91 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
92 __m128 one = _mm_set1_ps(1.0);
93 __m128 two = _mm_set1_ps(2.0);
99 jindex = nlist->jindex;
101 shiftidx = nlist->shift;
103 shiftvec = fr->shift_vec[0];
104 fshift = fr->fshift[0];
105 facel = _mm_set1_ps(fr->epsfac);
106 charge = mdatoms->chargeA;
107 krf = _mm_set1_ps(fr->ic->k_rf);
108 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
109 crf = _mm_set1_ps(fr->ic->c_rf);
110 nvdwtype = fr->ntype;
112 vdwtype = mdatoms->typeA;
114 /* Setup water-specific parameters */
115 inr = nlist->iinr[0];
116 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
117 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
118 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
119 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
121 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
122 rcutoff_scalar = fr->rcoulomb;
123 rcutoff = _mm_set1_ps(rcutoff_scalar);
124 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
126 rswitch_scalar = fr->rvdw_switch;
127 rswitch = _mm_set1_ps(rswitch_scalar);
128 /* Setup switch parameters */
129 d_scalar = rcutoff_scalar-rswitch_scalar;
130 d = _mm_set1_ps(d_scalar);
131 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
132 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
133 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
134 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
135 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
136 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
138 /* Avoid stupid compiler warnings */
139 jnrA = jnrB = jnrC = jnrD = 0;
148 /* Start outer loop over neighborlists */
149 for(iidx=0; iidx<nri; iidx++)
151 /* Load shift vector for this list */
152 i_shift_offset = DIM*shiftidx[iidx];
153 shX = shiftvec[i_shift_offset+XX];
154 shY = shiftvec[i_shift_offset+YY];
155 shZ = shiftvec[i_shift_offset+ZZ];
157 /* Load limits for loop over neighbors */
158 j_index_start = jindex[iidx];
159 j_index_end = jindex[iidx+1];
161 /* Get outer coordinate index */
163 i_coord_offset = DIM*inr;
165 /* Load i particle coords and add shift vector */
166 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
167 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
168 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
169 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
170 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
171 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
172 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
173 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
174 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
175 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
176 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
177 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
179 fix0 = _mm_setzero_ps();
180 fiy0 = _mm_setzero_ps();
181 fiz0 = _mm_setzero_ps();
182 fix1 = _mm_setzero_ps();
183 fiy1 = _mm_setzero_ps();
184 fiz1 = _mm_setzero_ps();
185 fix2 = _mm_setzero_ps();
186 fiy2 = _mm_setzero_ps();
187 fiz2 = _mm_setzero_ps();
188 fix3 = _mm_setzero_ps();
189 fiy3 = _mm_setzero_ps();
190 fiz3 = _mm_setzero_ps();
192 /* Reset potential sums */
193 velecsum = _mm_setzero_ps();
194 vvdwsum = _mm_setzero_ps();
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200 /* Get j neighbor index, and coordinate index */
206 j_coord_offsetA = DIM*jnrA;
207 j_coord_offsetB = DIM*jnrB;
208 j_coord_offsetC = DIM*jnrC;
209 j_coord_offsetD = DIM*jnrD;
211 /* load j atom coordinates */
212 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
213 x+j_coord_offsetC,x+j_coord_offsetD,
216 /* Calculate displacement vector */
217 dx00 = _mm_sub_ps(ix0,jx0);
218 dy00 = _mm_sub_ps(iy0,jy0);
219 dz00 = _mm_sub_ps(iz0,jz0);
220 dx10 = _mm_sub_ps(ix1,jx0);
221 dy10 = _mm_sub_ps(iy1,jy0);
222 dz10 = _mm_sub_ps(iz1,jz0);
223 dx20 = _mm_sub_ps(ix2,jx0);
224 dy20 = _mm_sub_ps(iy2,jy0);
225 dz20 = _mm_sub_ps(iz2,jz0);
226 dx30 = _mm_sub_ps(ix3,jx0);
227 dy30 = _mm_sub_ps(iy3,jy0);
228 dz30 = _mm_sub_ps(iz3,jz0);
230 /* Calculate squared distance and things based on it */
231 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
232 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
233 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
234 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
236 rinv00 = gmx_mm_invsqrt_ps(rsq00);
237 rinv10 = gmx_mm_invsqrt_ps(rsq10);
238 rinv20 = gmx_mm_invsqrt_ps(rsq20);
239 rinv30 = gmx_mm_invsqrt_ps(rsq30);
241 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
242 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
243 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
244 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
246 /* Load parameters for j particles */
247 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
248 charge+jnrC+0,charge+jnrD+0);
249 vdwjidx0A = 2*vdwtype[jnrA+0];
250 vdwjidx0B = 2*vdwtype[jnrB+0];
251 vdwjidx0C = 2*vdwtype[jnrC+0];
252 vdwjidx0D = 2*vdwtype[jnrD+0];
254 /**************************
255 * CALCULATE INTERACTIONS *
256 **************************/
258 if (gmx_mm_any_lt(rsq00,rcutoff2))
261 r00 = _mm_mul_ps(rsq00,rinv00);
263 /* Compute parameters for interactions between i and j atoms */
264 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
265 vdwparam+vdwioffset0+vdwjidx0B,
266 vdwparam+vdwioffset0+vdwjidx0C,
267 vdwparam+vdwioffset0+vdwjidx0D,
270 /* LENNARD-JONES DISPERSION/REPULSION */
272 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
273 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
274 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
275 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
276 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
278 d = _mm_sub_ps(r00,rswitch);
279 d = _mm_max_ps(d,_mm_setzero_ps());
280 d2 = _mm_mul_ps(d,d);
281 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
283 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
285 /* Evaluate switch function */
286 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
287 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
288 vvdw = _mm_mul_ps(vvdw,sw);
289 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
291 /* Update potential sum for this i atom from the interaction with this j atom. */
292 vvdw = _mm_and_ps(vvdw,cutoff_mask);
293 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
297 fscal = _mm_and_ps(fscal,cutoff_mask);
299 /* Calculate temporary vectorial force */
300 tx = _mm_mul_ps(fscal,dx00);
301 ty = _mm_mul_ps(fscal,dy00);
302 tz = _mm_mul_ps(fscal,dz00);
304 /* Update vectorial force */
305 fix0 = _mm_add_ps(fix0,tx);
306 fiy0 = _mm_add_ps(fiy0,ty);
307 fiz0 = _mm_add_ps(fiz0,tz);
309 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
310 f+j_coord_offsetC,f+j_coord_offsetD,
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
319 if (gmx_mm_any_lt(rsq10,rcutoff2))
322 /* Compute parameters for interactions between i and j atoms */
323 qq10 = _mm_mul_ps(iq1,jq0);
325 /* REACTION-FIELD ELECTROSTATICS */
326 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
327 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
329 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
331 /* Update potential sum for this i atom from the interaction with this j atom. */
332 velec = _mm_and_ps(velec,cutoff_mask);
333 velecsum = _mm_add_ps(velecsum,velec);
337 fscal = _mm_and_ps(fscal,cutoff_mask);
339 /* Calculate temporary vectorial force */
340 tx = _mm_mul_ps(fscal,dx10);
341 ty = _mm_mul_ps(fscal,dy10);
342 tz = _mm_mul_ps(fscal,dz10);
344 /* Update vectorial force */
345 fix1 = _mm_add_ps(fix1,tx);
346 fiy1 = _mm_add_ps(fiy1,ty);
347 fiz1 = _mm_add_ps(fiz1,tz);
349 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
350 f+j_coord_offsetC,f+j_coord_offsetD,
355 /**************************
356 * CALCULATE INTERACTIONS *
357 **************************/
359 if (gmx_mm_any_lt(rsq20,rcutoff2))
362 /* Compute parameters for interactions between i and j atoms */
363 qq20 = _mm_mul_ps(iq2,jq0);
365 /* REACTION-FIELD ELECTROSTATICS */
366 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
367 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
369 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 velec = _mm_and_ps(velec,cutoff_mask);
373 velecsum = _mm_add_ps(velecsum,velec);
377 fscal = _mm_and_ps(fscal,cutoff_mask);
379 /* Calculate temporary vectorial force */
380 tx = _mm_mul_ps(fscal,dx20);
381 ty = _mm_mul_ps(fscal,dy20);
382 tz = _mm_mul_ps(fscal,dz20);
384 /* Update vectorial force */
385 fix2 = _mm_add_ps(fix2,tx);
386 fiy2 = _mm_add_ps(fiy2,ty);
387 fiz2 = _mm_add_ps(fiz2,tz);
389 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
390 f+j_coord_offsetC,f+j_coord_offsetD,
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
399 if (gmx_mm_any_lt(rsq30,rcutoff2))
402 /* Compute parameters for interactions between i and j atoms */
403 qq30 = _mm_mul_ps(iq3,jq0);
405 /* REACTION-FIELD ELECTROSTATICS */
406 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
407 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
409 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velec = _mm_and_ps(velec,cutoff_mask);
413 velecsum = _mm_add_ps(velecsum,velec);
417 fscal = _mm_and_ps(fscal,cutoff_mask);
419 /* Calculate temporary vectorial force */
420 tx = _mm_mul_ps(fscal,dx30);
421 ty = _mm_mul_ps(fscal,dy30);
422 tz = _mm_mul_ps(fscal,dz30);
424 /* Update vectorial force */
425 fix3 = _mm_add_ps(fix3,tx);
426 fiy3 = _mm_add_ps(fiy3,ty);
427 fiz3 = _mm_add_ps(fiz3,tz);
429 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
430 f+j_coord_offsetC,f+j_coord_offsetD,
435 /* Inner loop uses 167 flops */
441 /* Get j neighbor index, and coordinate index */
447 /* Sign of each element will be negative for non-real atoms.
448 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
449 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
451 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
452 jnrA = (jnrA>=0) ? jnrA : 0;
453 jnrB = (jnrB>=0) ? jnrB : 0;
454 jnrC = (jnrC>=0) ? jnrC : 0;
455 jnrD = (jnrD>=0) ? jnrD : 0;
457 j_coord_offsetA = DIM*jnrA;
458 j_coord_offsetB = DIM*jnrB;
459 j_coord_offsetC = DIM*jnrC;
460 j_coord_offsetD = DIM*jnrD;
462 /* load j atom coordinates */
463 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
464 x+j_coord_offsetC,x+j_coord_offsetD,
467 /* Calculate displacement vector */
468 dx00 = _mm_sub_ps(ix0,jx0);
469 dy00 = _mm_sub_ps(iy0,jy0);
470 dz00 = _mm_sub_ps(iz0,jz0);
471 dx10 = _mm_sub_ps(ix1,jx0);
472 dy10 = _mm_sub_ps(iy1,jy0);
473 dz10 = _mm_sub_ps(iz1,jz0);
474 dx20 = _mm_sub_ps(ix2,jx0);
475 dy20 = _mm_sub_ps(iy2,jy0);
476 dz20 = _mm_sub_ps(iz2,jz0);
477 dx30 = _mm_sub_ps(ix3,jx0);
478 dy30 = _mm_sub_ps(iy3,jy0);
479 dz30 = _mm_sub_ps(iz3,jz0);
481 /* Calculate squared distance and things based on it */
482 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
483 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
484 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
485 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
487 rinv00 = gmx_mm_invsqrt_ps(rsq00);
488 rinv10 = gmx_mm_invsqrt_ps(rsq10);
489 rinv20 = gmx_mm_invsqrt_ps(rsq20);
490 rinv30 = gmx_mm_invsqrt_ps(rsq30);
492 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
493 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
494 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
495 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
497 /* Load parameters for j particles */
498 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
499 charge+jnrC+0,charge+jnrD+0);
500 vdwjidx0A = 2*vdwtype[jnrA+0];
501 vdwjidx0B = 2*vdwtype[jnrB+0];
502 vdwjidx0C = 2*vdwtype[jnrC+0];
503 vdwjidx0D = 2*vdwtype[jnrD+0];
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 if (gmx_mm_any_lt(rsq00,rcutoff2))
512 r00 = _mm_mul_ps(rsq00,rinv00);
513 r00 = _mm_andnot_ps(dummy_mask,r00);
515 /* Compute parameters for interactions between i and j atoms */
516 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
517 vdwparam+vdwioffset0+vdwjidx0B,
518 vdwparam+vdwioffset0+vdwjidx0C,
519 vdwparam+vdwioffset0+vdwjidx0D,
522 /* LENNARD-JONES DISPERSION/REPULSION */
524 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
525 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
526 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
527 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
528 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
530 d = _mm_sub_ps(r00,rswitch);
531 d = _mm_max_ps(d,_mm_setzero_ps());
532 d2 = _mm_mul_ps(d,d);
533 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
535 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
537 /* Evaluate switch function */
538 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
539 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
540 vvdw = _mm_mul_ps(vvdw,sw);
541 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
543 /* Update potential sum for this i atom from the interaction with this j atom. */
544 vvdw = _mm_and_ps(vvdw,cutoff_mask);
545 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
546 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
550 fscal = _mm_and_ps(fscal,cutoff_mask);
552 fscal = _mm_andnot_ps(dummy_mask,fscal);
554 /* Calculate temporary vectorial force */
555 tx = _mm_mul_ps(fscal,dx00);
556 ty = _mm_mul_ps(fscal,dy00);
557 tz = _mm_mul_ps(fscal,dz00);
559 /* Update vectorial force */
560 fix0 = _mm_add_ps(fix0,tx);
561 fiy0 = _mm_add_ps(fiy0,ty);
562 fiz0 = _mm_add_ps(fiz0,tz);
564 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
565 f+j_coord_offsetC,f+j_coord_offsetD,
570 /**************************
571 * CALCULATE INTERACTIONS *
572 **************************/
574 if (gmx_mm_any_lt(rsq10,rcutoff2))
577 /* Compute parameters for interactions between i and j atoms */
578 qq10 = _mm_mul_ps(iq1,jq0);
580 /* REACTION-FIELD ELECTROSTATICS */
581 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
582 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
584 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velec = _mm_and_ps(velec,cutoff_mask);
588 velec = _mm_andnot_ps(dummy_mask,velec);
589 velecsum = _mm_add_ps(velecsum,velec);
593 fscal = _mm_and_ps(fscal,cutoff_mask);
595 fscal = _mm_andnot_ps(dummy_mask,fscal);
597 /* Calculate temporary vectorial force */
598 tx = _mm_mul_ps(fscal,dx10);
599 ty = _mm_mul_ps(fscal,dy10);
600 tz = _mm_mul_ps(fscal,dz10);
602 /* Update vectorial force */
603 fix1 = _mm_add_ps(fix1,tx);
604 fiy1 = _mm_add_ps(fiy1,ty);
605 fiz1 = _mm_add_ps(fiz1,tz);
607 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
608 f+j_coord_offsetC,f+j_coord_offsetD,
613 /**************************
614 * CALCULATE INTERACTIONS *
615 **************************/
617 if (gmx_mm_any_lt(rsq20,rcutoff2))
620 /* Compute parameters for interactions between i and j atoms */
621 qq20 = _mm_mul_ps(iq2,jq0);
623 /* REACTION-FIELD ELECTROSTATICS */
624 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
625 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
627 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
629 /* Update potential sum for this i atom from the interaction with this j atom. */
630 velec = _mm_and_ps(velec,cutoff_mask);
631 velec = _mm_andnot_ps(dummy_mask,velec);
632 velecsum = _mm_add_ps(velecsum,velec);
636 fscal = _mm_and_ps(fscal,cutoff_mask);
638 fscal = _mm_andnot_ps(dummy_mask,fscal);
640 /* Calculate temporary vectorial force */
641 tx = _mm_mul_ps(fscal,dx20);
642 ty = _mm_mul_ps(fscal,dy20);
643 tz = _mm_mul_ps(fscal,dz20);
645 /* Update vectorial force */
646 fix2 = _mm_add_ps(fix2,tx);
647 fiy2 = _mm_add_ps(fiy2,ty);
648 fiz2 = _mm_add_ps(fiz2,tz);
650 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
651 f+j_coord_offsetC,f+j_coord_offsetD,
656 /**************************
657 * CALCULATE INTERACTIONS *
658 **************************/
660 if (gmx_mm_any_lt(rsq30,rcutoff2))
663 /* Compute parameters for interactions between i and j atoms */
664 qq30 = _mm_mul_ps(iq3,jq0);
666 /* REACTION-FIELD ELECTROSTATICS */
667 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
668 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
670 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec = _mm_and_ps(velec,cutoff_mask);
674 velec = _mm_andnot_ps(dummy_mask,velec);
675 velecsum = _mm_add_ps(velecsum,velec);
679 fscal = _mm_and_ps(fscal,cutoff_mask);
681 fscal = _mm_andnot_ps(dummy_mask,fscal);
683 /* Calculate temporary vectorial force */
684 tx = _mm_mul_ps(fscal,dx30);
685 ty = _mm_mul_ps(fscal,dy30);
686 tz = _mm_mul_ps(fscal,dz30);
688 /* Update vectorial force */
689 fix3 = _mm_add_ps(fix3,tx);
690 fiy3 = _mm_add_ps(fiy3,ty);
691 fiz3 = _mm_add_ps(fiz3,tz);
693 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
694 f+j_coord_offsetC,f+j_coord_offsetD,
699 /* Inner loop uses 168 flops */
702 /* End of innermost loop */
704 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
705 f+i_coord_offset,fshift+i_shift_offset);
708 /* Update potential energies */
709 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
710 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
712 /* Increment number of inner iterations */
713 inneriter += j_index_end - j_index_start;
715 /* Outer loop uses 38 flops */
718 /* Increment number of outer iterations */
721 /* Update outer/inner flops */
723 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*168);
726 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
727 * Electrostatics interaction: ReactionField
728 * VdW interaction: LennardJones
729 * Geometry: Water4-Particle
730 * Calculate force/pot: Force
733 nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
734 (t_nblist * gmx_restrict nlist,
735 rvec * gmx_restrict xx,
736 rvec * gmx_restrict ff,
737 t_forcerec * gmx_restrict fr,
738 t_mdatoms * gmx_restrict mdatoms,
739 nb_kernel_data_t * gmx_restrict kernel_data,
740 t_nrnb * gmx_restrict nrnb)
742 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
743 * just 0 for non-waters.
744 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
745 * jnr indices corresponding to data put in the four positions in the SIMD register.
747 int i_shift_offset,i_coord_offset,outeriter,inneriter;
748 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
749 int jnrA,jnrB,jnrC,jnrD;
750 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
751 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
752 real shX,shY,shZ,rcutoff_scalar;
753 real *shiftvec,*fshift,*x,*f;
754 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
756 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
758 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
760 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
762 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
763 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
764 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
765 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
766 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
767 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
768 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
769 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
772 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
775 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
776 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
777 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
778 real rswitch_scalar,d_scalar;
779 __m128 dummy_mask,cutoff_mask;
780 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
781 __m128 one = _mm_set1_ps(1.0);
782 __m128 two = _mm_set1_ps(2.0);
788 jindex = nlist->jindex;
790 shiftidx = nlist->shift;
792 shiftvec = fr->shift_vec[0];
793 fshift = fr->fshift[0];
794 facel = _mm_set1_ps(fr->epsfac);
795 charge = mdatoms->chargeA;
796 krf = _mm_set1_ps(fr->ic->k_rf);
797 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
798 crf = _mm_set1_ps(fr->ic->c_rf);
799 nvdwtype = fr->ntype;
801 vdwtype = mdatoms->typeA;
803 /* Setup water-specific parameters */
804 inr = nlist->iinr[0];
805 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
806 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
807 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
808 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
810 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
811 rcutoff_scalar = fr->rcoulomb;
812 rcutoff = _mm_set1_ps(rcutoff_scalar);
813 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
815 rswitch_scalar = fr->rvdw_switch;
816 rswitch = _mm_set1_ps(rswitch_scalar);
817 /* Setup switch parameters */
818 d_scalar = rcutoff_scalar-rswitch_scalar;
819 d = _mm_set1_ps(d_scalar);
820 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
821 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
822 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
823 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
824 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
825 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
827 /* Avoid stupid compiler warnings */
828 jnrA = jnrB = jnrC = jnrD = 0;
837 /* Start outer loop over neighborlists */
838 for(iidx=0; iidx<nri; iidx++)
840 /* Load shift vector for this list */
841 i_shift_offset = DIM*shiftidx[iidx];
842 shX = shiftvec[i_shift_offset+XX];
843 shY = shiftvec[i_shift_offset+YY];
844 shZ = shiftvec[i_shift_offset+ZZ];
846 /* Load limits for loop over neighbors */
847 j_index_start = jindex[iidx];
848 j_index_end = jindex[iidx+1];
850 /* Get outer coordinate index */
852 i_coord_offset = DIM*inr;
854 /* Load i particle coords and add shift vector */
855 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
856 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
857 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
858 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
859 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
860 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
861 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
862 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
863 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
864 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
865 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
866 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
868 fix0 = _mm_setzero_ps();
869 fiy0 = _mm_setzero_ps();
870 fiz0 = _mm_setzero_ps();
871 fix1 = _mm_setzero_ps();
872 fiy1 = _mm_setzero_ps();
873 fiz1 = _mm_setzero_ps();
874 fix2 = _mm_setzero_ps();
875 fiy2 = _mm_setzero_ps();
876 fiz2 = _mm_setzero_ps();
877 fix3 = _mm_setzero_ps();
878 fiy3 = _mm_setzero_ps();
879 fiz3 = _mm_setzero_ps();
881 /* Start inner kernel loop */
882 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
885 /* Get j neighbor index, and coordinate index */
891 j_coord_offsetA = DIM*jnrA;
892 j_coord_offsetB = DIM*jnrB;
893 j_coord_offsetC = DIM*jnrC;
894 j_coord_offsetD = DIM*jnrD;
896 /* load j atom coordinates */
897 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
898 x+j_coord_offsetC,x+j_coord_offsetD,
901 /* Calculate displacement vector */
902 dx00 = _mm_sub_ps(ix0,jx0);
903 dy00 = _mm_sub_ps(iy0,jy0);
904 dz00 = _mm_sub_ps(iz0,jz0);
905 dx10 = _mm_sub_ps(ix1,jx0);
906 dy10 = _mm_sub_ps(iy1,jy0);
907 dz10 = _mm_sub_ps(iz1,jz0);
908 dx20 = _mm_sub_ps(ix2,jx0);
909 dy20 = _mm_sub_ps(iy2,jy0);
910 dz20 = _mm_sub_ps(iz2,jz0);
911 dx30 = _mm_sub_ps(ix3,jx0);
912 dy30 = _mm_sub_ps(iy3,jy0);
913 dz30 = _mm_sub_ps(iz3,jz0);
915 /* Calculate squared distance and things based on it */
916 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
917 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
918 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
919 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
921 rinv00 = gmx_mm_invsqrt_ps(rsq00);
922 rinv10 = gmx_mm_invsqrt_ps(rsq10);
923 rinv20 = gmx_mm_invsqrt_ps(rsq20);
924 rinv30 = gmx_mm_invsqrt_ps(rsq30);
926 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
927 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
928 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
929 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
931 /* Load parameters for j particles */
932 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
933 charge+jnrC+0,charge+jnrD+0);
934 vdwjidx0A = 2*vdwtype[jnrA+0];
935 vdwjidx0B = 2*vdwtype[jnrB+0];
936 vdwjidx0C = 2*vdwtype[jnrC+0];
937 vdwjidx0D = 2*vdwtype[jnrD+0];
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 if (gmx_mm_any_lt(rsq00,rcutoff2))
946 r00 = _mm_mul_ps(rsq00,rinv00);
948 /* Compute parameters for interactions between i and j atoms */
949 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
950 vdwparam+vdwioffset0+vdwjidx0B,
951 vdwparam+vdwioffset0+vdwjidx0C,
952 vdwparam+vdwioffset0+vdwjidx0D,
955 /* LENNARD-JONES DISPERSION/REPULSION */
957 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
958 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
959 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
960 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
961 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
963 d = _mm_sub_ps(r00,rswitch);
964 d = _mm_max_ps(d,_mm_setzero_ps());
965 d2 = _mm_mul_ps(d,d);
966 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
968 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
970 /* Evaluate switch function */
971 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
972 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
973 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
977 fscal = _mm_and_ps(fscal,cutoff_mask);
979 /* Calculate temporary vectorial force */
980 tx = _mm_mul_ps(fscal,dx00);
981 ty = _mm_mul_ps(fscal,dy00);
982 tz = _mm_mul_ps(fscal,dz00);
984 /* Update vectorial force */
985 fix0 = _mm_add_ps(fix0,tx);
986 fiy0 = _mm_add_ps(fiy0,ty);
987 fiz0 = _mm_add_ps(fiz0,tz);
989 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
990 f+j_coord_offsetC,f+j_coord_offsetD,
995 /**************************
996 * CALCULATE INTERACTIONS *
997 **************************/
999 if (gmx_mm_any_lt(rsq10,rcutoff2))
1002 /* Compute parameters for interactions between i and j atoms */
1003 qq10 = _mm_mul_ps(iq1,jq0);
1005 /* REACTION-FIELD ELECTROSTATICS */
1006 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1008 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1012 fscal = _mm_and_ps(fscal,cutoff_mask);
1014 /* Calculate temporary vectorial force */
1015 tx = _mm_mul_ps(fscal,dx10);
1016 ty = _mm_mul_ps(fscal,dy10);
1017 tz = _mm_mul_ps(fscal,dz10);
1019 /* Update vectorial force */
1020 fix1 = _mm_add_ps(fix1,tx);
1021 fiy1 = _mm_add_ps(fiy1,ty);
1022 fiz1 = _mm_add_ps(fiz1,tz);
1024 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1025 f+j_coord_offsetC,f+j_coord_offsetD,
1030 /**************************
1031 * CALCULATE INTERACTIONS *
1032 **************************/
1034 if (gmx_mm_any_lt(rsq20,rcutoff2))
1037 /* Compute parameters for interactions between i and j atoms */
1038 qq20 = _mm_mul_ps(iq2,jq0);
1040 /* REACTION-FIELD ELECTROSTATICS */
1041 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1043 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1047 fscal = _mm_and_ps(fscal,cutoff_mask);
1049 /* Calculate temporary vectorial force */
1050 tx = _mm_mul_ps(fscal,dx20);
1051 ty = _mm_mul_ps(fscal,dy20);
1052 tz = _mm_mul_ps(fscal,dz20);
1054 /* Update vectorial force */
1055 fix2 = _mm_add_ps(fix2,tx);
1056 fiy2 = _mm_add_ps(fiy2,ty);
1057 fiz2 = _mm_add_ps(fiz2,tz);
1059 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1060 f+j_coord_offsetC,f+j_coord_offsetD,
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 if (gmx_mm_any_lt(rsq30,rcutoff2))
1072 /* Compute parameters for interactions between i and j atoms */
1073 qq30 = _mm_mul_ps(iq3,jq0);
1075 /* REACTION-FIELD ELECTROSTATICS */
1076 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1078 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1082 fscal = _mm_and_ps(fscal,cutoff_mask);
1084 /* Calculate temporary vectorial force */
1085 tx = _mm_mul_ps(fscal,dx30);
1086 ty = _mm_mul_ps(fscal,dy30);
1087 tz = _mm_mul_ps(fscal,dz30);
1089 /* Update vectorial force */
1090 fix3 = _mm_add_ps(fix3,tx);
1091 fiy3 = _mm_add_ps(fiy3,ty);
1092 fiz3 = _mm_add_ps(fiz3,tz);
1094 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1095 f+j_coord_offsetC,f+j_coord_offsetD,
1100 /* Inner loop uses 146 flops */
1103 if(jidx<j_index_end)
1106 /* Get j neighbor index, and coordinate index */
1108 jnrB = jjnr[jidx+1];
1109 jnrC = jjnr[jidx+2];
1110 jnrD = jjnr[jidx+3];
1112 /* Sign of each element will be negative for non-real atoms.
1113 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1114 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1116 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1117 jnrA = (jnrA>=0) ? jnrA : 0;
1118 jnrB = (jnrB>=0) ? jnrB : 0;
1119 jnrC = (jnrC>=0) ? jnrC : 0;
1120 jnrD = (jnrD>=0) ? jnrD : 0;
1122 j_coord_offsetA = DIM*jnrA;
1123 j_coord_offsetB = DIM*jnrB;
1124 j_coord_offsetC = DIM*jnrC;
1125 j_coord_offsetD = DIM*jnrD;
1127 /* load j atom coordinates */
1128 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1129 x+j_coord_offsetC,x+j_coord_offsetD,
1132 /* Calculate displacement vector */
1133 dx00 = _mm_sub_ps(ix0,jx0);
1134 dy00 = _mm_sub_ps(iy0,jy0);
1135 dz00 = _mm_sub_ps(iz0,jz0);
1136 dx10 = _mm_sub_ps(ix1,jx0);
1137 dy10 = _mm_sub_ps(iy1,jy0);
1138 dz10 = _mm_sub_ps(iz1,jz0);
1139 dx20 = _mm_sub_ps(ix2,jx0);
1140 dy20 = _mm_sub_ps(iy2,jy0);
1141 dz20 = _mm_sub_ps(iz2,jz0);
1142 dx30 = _mm_sub_ps(ix3,jx0);
1143 dy30 = _mm_sub_ps(iy3,jy0);
1144 dz30 = _mm_sub_ps(iz3,jz0);
1146 /* Calculate squared distance and things based on it */
1147 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1148 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1149 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1150 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1152 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1153 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1154 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1155 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1157 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1158 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1159 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1160 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1162 /* Load parameters for j particles */
1163 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1164 charge+jnrC+0,charge+jnrD+0);
1165 vdwjidx0A = 2*vdwtype[jnrA+0];
1166 vdwjidx0B = 2*vdwtype[jnrB+0];
1167 vdwjidx0C = 2*vdwtype[jnrC+0];
1168 vdwjidx0D = 2*vdwtype[jnrD+0];
1170 /**************************
1171 * CALCULATE INTERACTIONS *
1172 **************************/
1174 if (gmx_mm_any_lt(rsq00,rcutoff2))
1177 r00 = _mm_mul_ps(rsq00,rinv00);
1178 r00 = _mm_andnot_ps(dummy_mask,r00);
1180 /* Compute parameters for interactions between i and j atoms */
1181 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1182 vdwparam+vdwioffset0+vdwjidx0B,
1183 vdwparam+vdwioffset0+vdwjidx0C,
1184 vdwparam+vdwioffset0+vdwjidx0D,
1187 /* LENNARD-JONES DISPERSION/REPULSION */
1189 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1190 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1191 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1192 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1193 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1195 d = _mm_sub_ps(r00,rswitch);
1196 d = _mm_max_ps(d,_mm_setzero_ps());
1197 d2 = _mm_mul_ps(d,d);
1198 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1200 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1202 /* Evaluate switch function */
1203 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1204 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1205 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1209 fscal = _mm_and_ps(fscal,cutoff_mask);
1211 fscal = _mm_andnot_ps(dummy_mask,fscal);
1213 /* Calculate temporary vectorial force */
1214 tx = _mm_mul_ps(fscal,dx00);
1215 ty = _mm_mul_ps(fscal,dy00);
1216 tz = _mm_mul_ps(fscal,dz00);
1218 /* Update vectorial force */
1219 fix0 = _mm_add_ps(fix0,tx);
1220 fiy0 = _mm_add_ps(fiy0,ty);
1221 fiz0 = _mm_add_ps(fiz0,tz);
1223 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1224 f+j_coord_offsetC,f+j_coord_offsetD,
1229 /**************************
1230 * CALCULATE INTERACTIONS *
1231 **************************/
1233 if (gmx_mm_any_lt(rsq10,rcutoff2))
1236 /* Compute parameters for interactions between i and j atoms */
1237 qq10 = _mm_mul_ps(iq1,jq0);
1239 /* REACTION-FIELD ELECTROSTATICS */
1240 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1242 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1246 fscal = _mm_and_ps(fscal,cutoff_mask);
1248 fscal = _mm_andnot_ps(dummy_mask,fscal);
1250 /* Calculate temporary vectorial force */
1251 tx = _mm_mul_ps(fscal,dx10);
1252 ty = _mm_mul_ps(fscal,dy10);
1253 tz = _mm_mul_ps(fscal,dz10);
1255 /* Update vectorial force */
1256 fix1 = _mm_add_ps(fix1,tx);
1257 fiy1 = _mm_add_ps(fiy1,ty);
1258 fiz1 = _mm_add_ps(fiz1,tz);
1260 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1261 f+j_coord_offsetC,f+j_coord_offsetD,
1266 /**************************
1267 * CALCULATE INTERACTIONS *
1268 **************************/
1270 if (gmx_mm_any_lt(rsq20,rcutoff2))
1273 /* Compute parameters for interactions between i and j atoms */
1274 qq20 = _mm_mul_ps(iq2,jq0);
1276 /* REACTION-FIELD ELECTROSTATICS */
1277 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1279 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1283 fscal = _mm_and_ps(fscal,cutoff_mask);
1285 fscal = _mm_andnot_ps(dummy_mask,fscal);
1287 /* Calculate temporary vectorial force */
1288 tx = _mm_mul_ps(fscal,dx20);
1289 ty = _mm_mul_ps(fscal,dy20);
1290 tz = _mm_mul_ps(fscal,dz20);
1292 /* Update vectorial force */
1293 fix2 = _mm_add_ps(fix2,tx);
1294 fiy2 = _mm_add_ps(fiy2,ty);
1295 fiz2 = _mm_add_ps(fiz2,tz);
1297 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1298 f+j_coord_offsetC,f+j_coord_offsetD,
1303 /**************************
1304 * CALCULATE INTERACTIONS *
1305 **************************/
1307 if (gmx_mm_any_lt(rsq30,rcutoff2))
1310 /* Compute parameters for interactions between i and j atoms */
1311 qq30 = _mm_mul_ps(iq3,jq0);
1313 /* REACTION-FIELD ELECTROSTATICS */
1314 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1316 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1320 fscal = _mm_and_ps(fscal,cutoff_mask);
1322 fscal = _mm_andnot_ps(dummy_mask,fscal);
1324 /* Calculate temporary vectorial force */
1325 tx = _mm_mul_ps(fscal,dx30);
1326 ty = _mm_mul_ps(fscal,dy30);
1327 tz = _mm_mul_ps(fscal,dz30);
1329 /* Update vectorial force */
1330 fix3 = _mm_add_ps(fix3,tx);
1331 fiy3 = _mm_add_ps(fiy3,ty);
1332 fiz3 = _mm_add_ps(fiz3,tz);
1334 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1335 f+j_coord_offsetC,f+j_coord_offsetD,
1340 /* Inner loop uses 147 flops */
1343 /* End of innermost loop */
1345 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1346 f+i_coord_offset,fshift+i_shift_offset);
1348 /* Increment number of inner iterations */
1349 inneriter += j_index_end - j_index_start;
1351 /* Outer loop uses 36 flops */
1354 /* Increment number of outer iterations */
1357 /* Update outer/inner flops */
1359 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);