2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
94 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
95 __m128 dummy_mask,cutoff_mask;
96 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
97 __m128 one = _mm_set1_ps(1.0);
98 __m128 two = _mm_set1_ps(2.0);
104 jindex = nlist->jindex;
106 shiftidx = nlist->shift;
108 shiftvec = fr->shift_vec[0];
109 fshift = fr->fshift[0];
110 facel = _mm_set1_ps(fr->epsfac);
111 charge = mdatoms->chargeA;
112 krf = _mm_set1_ps(fr->ic->k_rf);
113 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
114 crf = _mm_set1_ps(fr->ic->c_rf);
115 nvdwtype = fr->ntype;
117 vdwtype = mdatoms->typeA;
119 /* Setup water-specific parameters */
120 inr = nlist->iinr[0];
121 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
122 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
123 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
124 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
126 jq0 = _mm_set1_ps(charge[inr+0]);
127 jq1 = _mm_set1_ps(charge[inr+1]);
128 jq2 = _mm_set1_ps(charge[inr+2]);
129 vdwjidx0A = 2*vdwtype[inr+0];
130 qq00 = _mm_mul_ps(iq0,jq0);
131 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
132 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
133 qq01 = _mm_mul_ps(iq0,jq1);
134 qq02 = _mm_mul_ps(iq0,jq2);
135 qq10 = _mm_mul_ps(iq1,jq0);
136 qq11 = _mm_mul_ps(iq1,jq1);
137 qq12 = _mm_mul_ps(iq1,jq2);
138 qq20 = _mm_mul_ps(iq2,jq0);
139 qq21 = _mm_mul_ps(iq2,jq1);
140 qq22 = _mm_mul_ps(iq2,jq2);
142 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
143 rcutoff_scalar = fr->rcoulomb;
144 rcutoff = _mm_set1_ps(rcutoff_scalar);
145 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
147 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
148 rvdw = _mm_set1_ps(fr->rvdw);
150 /* Avoid stupid compiler warnings */
151 jnrA = jnrB = jnrC = jnrD = 0;
160 /* Start outer loop over neighborlists */
161 for(iidx=0; iidx<nri; iidx++)
163 /* Load shift vector for this list */
164 i_shift_offset = DIM*shiftidx[iidx];
165 shX = shiftvec[i_shift_offset+XX];
166 shY = shiftvec[i_shift_offset+YY];
167 shZ = shiftvec[i_shift_offset+ZZ];
169 /* Load limits for loop over neighbors */
170 j_index_start = jindex[iidx];
171 j_index_end = jindex[iidx+1];
173 /* Get outer coordinate index */
175 i_coord_offset = DIM*inr;
177 /* Load i particle coords and add shift vector */
178 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
179 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
180 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
181 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
182 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
183 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
184 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
185 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
186 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
188 fix0 = _mm_setzero_ps();
189 fiy0 = _mm_setzero_ps();
190 fiz0 = _mm_setzero_ps();
191 fix1 = _mm_setzero_ps();
192 fiy1 = _mm_setzero_ps();
193 fiz1 = _mm_setzero_ps();
194 fix2 = _mm_setzero_ps();
195 fiy2 = _mm_setzero_ps();
196 fiz2 = _mm_setzero_ps();
198 /* Reset potential sums */
199 velecsum = _mm_setzero_ps();
200 vvdwsum = _mm_setzero_ps();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
206 /* Get j neighbor index, and coordinate index */
212 j_coord_offsetA = DIM*jnrA;
213 j_coord_offsetB = DIM*jnrB;
214 j_coord_offsetC = DIM*jnrC;
215 j_coord_offsetD = DIM*jnrD;
217 /* load j atom coordinates */
218 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
219 x+j_coord_offsetC,x+j_coord_offsetD,
220 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
222 /* Calculate displacement vector */
223 dx00 = _mm_sub_ps(ix0,jx0);
224 dy00 = _mm_sub_ps(iy0,jy0);
225 dz00 = _mm_sub_ps(iz0,jz0);
226 dx01 = _mm_sub_ps(ix0,jx1);
227 dy01 = _mm_sub_ps(iy0,jy1);
228 dz01 = _mm_sub_ps(iz0,jz1);
229 dx02 = _mm_sub_ps(ix0,jx2);
230 dy02 = _mm_sub_ps(iy0,jy2);
231 dz02 = _mm_sub_ps(iz0,jz2);
232 dx10 = _mm_sub_ps(ix1,jx0);
233 dy10 = _mm_sub_ps(iy1,jy0);
234 dz10 = _mm_sub_ps(iz1,jz0);
235 dx11 = _mm_sub_ps(ix1,jx1);
236 dy11 = _mm_sub_ps(iy1,jy1);
237 dz11 = _mm_sub_ps(iz1,jz1);
238 dx12 = _mm_sub_ps(ix1,jx2);
239 dy12 = _mm_sub_ps(iy1,jy2);
240 dz12 = _mm_sub_ps(iz1,jz2);
241 dx20 = _mm_sub_ps(ix2,jx0);
242 dy20 = _mm_sub_ps(iy2,jy0);
243 dz20 = _mm_sub_ps(iz2,jz0);
244 dx21 = _mm_sub_ps(ix2,jx1);
245 dy21 = _mm_sub_ps(iy2,jy1);
246 dz21 = _mm_sub_ps(iz2,jz1);
247 dx22 = _mm_sub_ps(ix2,jx2);
248 dy22 = _mm_sub_ps(iy2,jy2);
249 dz22 = _mm_sub_ps(iz2,jz2);
251 /* Calculate squared distance and things based on it */
252 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
253 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
254 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
255 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
256 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
257 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
258 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
259 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
260 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
262 rinv00 = gmx_mm_invsqrt_ps(rsq00);
263 rinv01 = gmx_mm_invsqrt_ps(rsq01);
264 rinv02 = gmx_mm_invsqrt_ps(rsq02);
265 rinv10 = gmx_mm_invsqrt_ps(rsq10);
266 rinv11 = gmx_mm_invsqrt_ps(rsq11);
267 rinv12 = gmx_mm_invsqrt_ps(rsq12);
268 rinv20 = gmx_mm_invsqrt_ps(rsq20);
269 rinv21 = gmx_mm_invsqrt_ps(rsq21);
270 rinv22 = gmx_mm_invsqrt_ps(rsq22);
272 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
273 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
274 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
275 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
276 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
277 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
278 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
279 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
280 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
282 fjx0 = _mm_setzero_ps();
283 fjy0 = _mm_setzero_ps();
284 fjz0 = _mm_setzero_ps();
285 fjx1 = _mm_setzero_ps();
286 fjy1 = _mm_setzero_ps();
287 fjz1 = _mm_setzero_ps();
288 fjx2 = _mm_setzero_ps();
289 fjy2 = _mm_setzero_ps();
290 fjz2 = _mm_setzero_ps();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 if (gmx_mm_any_lt(rsq00,rcutoff2))
299 /* REACTION-FIELD ELECTROSTATICS */
300 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
301 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
303 /* LENNARD-JONES DISPERSION/REPULSION */
305 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
306 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
307 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
308 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
309 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
310 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
312 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
314 /* Update potential sum for this i atom from the interaction with this j atom. */
315 velec = _mm_and_ps(velec,cutoff_mask);
316 velecsum = _mm_add_ps(velecsum,velec);
317 vvdw = _mm_and_ps(vvdw,cutoff_mask);
318 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
320 fscal = _mm_add_ps(felec,fvdw);
322 fscal = _mm_and_ps(fscal,cutoff_mask);
324 /* Calculate temporary vectorial force */
325 tx = _mm_mul_ps(fscal,dx00);
326 ty = _mm_mul_ps(fscal,dy00);
327 tz = _mm_mul_ps(fscal,dz00);
329 /* Update vectorial force */
330 fix0 = _mm_add_ps(fix0,tx);
331 fiy0 = _mm_add_ps(fiy0,ty);
332 fiz0 = _mm_add_ps(fiz0,tz);
334 fjx0 = _mm_add_ps(fjx0,tx);
335 fjy0 = _mm_add_ps(fjy0,ty);
336 fjz0 = _mm_add_ps(fjz0,tz);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 if (gmx_mm_any_lt(rsq01,rcutoff2))
347 /* REACTION-FIELD ELECTROSTATICS */
348 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
349 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
351 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velec = _mm_and_ps(velec,cutoff_mask);
355 velecsum = _mm_add_ps(velecsum,velec);
359 fscal = _mm_and_ps(fscal,cutoff_mask);
361 /* Calculate temporary vectorial force */
362 tx = _mm_mul_ps(fscal,dx01);
363 ty = _mm_mul_ps(fscal,dy01);
364 tz = _mm_mul_ps(fscal,dz01);
366 /* Update vectorial force */
367 fix0 = _mm_add_ps(fix0,tx);
368 fiy0 = _mm_add_ps(fiy0,ty);
369 fiz0 = _mm_add_ps(fiz0,tz);
371 fjx1 = _mm_add_ps(fjx1,tx);
372 fjy1 = _mm_add_ps(fjy1,ty);
373 fjz1 = _mm_add_ps(fjz1,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 if (gmx_mm_any_lt(rsq02,rcutoff2))
384 /* REACTION-FIELD ELECTROSTATICS */
385 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
386 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
388 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velec = _mm_and_ps(velec,cutoff_mask);
392 velecsum = _mm_add_ps(velecsum,velec);
396 fscal = _mm_and_ps(fscal,cutoff_mask);
398 /* Calculate temporary vectorial force */
399 tx = _mm_mul_ps(fscal,dx02);
400 ty = _mm_mul_ps(fscal,dy02);
401 tz = _mm_mul_ps(fscal,dz02);
403 /* Update vectorial force */
404 fix0 = _mm_add_ps(fix0,tx);
405 fiy0 = _mm_add_ps(fiy0,ty);
406 fiz0 = _mm_add_ps(fiz0,tz);
408 fjx2 = _mm_add_ps(fjx2,tx);
409 fjy2 = _mm_add_ps(fjy2,ty);
410 fjz2 = _mm_add_ps(fjz2,tz);
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 if (gmx_mm_any_lt(rsq10,rcutoff2))
421 /* REACTION-FIELD ELECTROSTATICS */
422 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
423 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
425 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velec = _mm_and_ps(velec,cutoff_mask);
429 velecsum = _mm_add_ps(velecsum,velec);
433 fscal = _mm_and_ps(fscal,cutoff_mask);
435 /* Calculate temporary vectorial force */
436 tx = _mm_mul_ps(fscal,dx10);
437 ty = _mm_mul_ps(fscal,dy10);
438 tz = _mm_mul_ps(fscal,dz10);
440 /* Update vectorial force */
441 fix1 = _mm_add_ps(fix1,tx);
442 fiy1 = _mm_add_ps(fiy1,ty);
443 fiz1 = _mm_add_ps(fiz1,tz);
445 fjx0 = _mm_add_ps(fjx0,tx);
446 fjy0 = _mm_add_ps(fjy0,ty);
447 fjz0 = _mm_add_ps(fjz0,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 if (gmx_mm_any_lt(rsq11,rcutoff2))
458 /* REACTION-FIELD ELECTROSTATICS */
459 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
460 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
462 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm_and_ps(velec,cutoff_mask);
466 velecsum = _mm_add_ps(velecsum,velec);
470 fscal = _mm_and_ps(fscal,cutoff_mask);
472 /* Calculate temporary vectorial force */
473 tx = _mm_mul_ps(fscal,dx11);
474 ty = _mm_mul_ps(fscal,dy11);
475 tz = _mm_mul_ps(fscal,dz11);
477 /* Update vectorial force */
478 fix1 = _mm_add_ps(fix1,tx);
479 fiy1 = _mm_add_ps(fiy1,ty);
480 fiz1 = _mm_add_ps(fiz1,tz);
482 fjx1 = _mm_add_ps(fjx1,tx);
483 fjy1 = _mm_add_ps(fjy1,ty);
484 fjz1 = _mm_add_ps(fjz1,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 if (gmx_mm_any_lt(rsq12,rcutoff2))
495 /* REACTION-FIELD ELECTROSTATICS */
496 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
497 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
499 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velec = _mm_and_ps(velec,cutoff_mask);
503 velecsum = _mm_add_ps(velecsum,velec);
507 fscal = _mm_and_ps(fscal,cutoff_mask);
509 /* Calculate temporary vectorial force */
510 tx = _mm_mul_ps(fscal,dx12);
511 ty = _mm_mul_ps(fscal,dy12);
512 tz = _mm_mul_ps(fscal,dz12);
514 /* Update vectorial force */
515 fix1 = _mm_add_ps(fix1,tx);
516 fiy1 = _mm_add_ps(fiy1,ty);
517 fiz1 = _mm_add_ps(fiz1,tz);
519 fjx2 = _mm_add_ps(fjx2,tx);
520 fjy2 = _mm_add_ps(fjy2,ty);
521 fjz2 = _mm_add_ps(fjz2,tz);
525 /**************************
526 * CALCULATE INTERACTIONS *
527 **************************/
529 if (gmx_mm_any_lt(rsq20,rcutoff2))
532 /* REACTION-FIELD ELECTROSTATICS */
533 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
534 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
536 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velec = _mm_and_ps(velec,cutoff_mask);
540 velecsum = _mm_add_ps(velecsum,velec);
544 fscal = _mm_and_ps(fscal,cutoff_mask);
546 /* Calculate temporary vectorial force */
547 tx = _mm_mul_ps(fscal,dx20);
548 ty = _mm_mul_ps(fscal,dy20);
549 tz = _mm_mul_ps(fscal,dz20);
551 /* Update vectorial force */
552 fix2 = _mm_add_ps(fix2,tx);
553 fiy2 = _mm_add_ps(fiy2,ty);
554 fiz2 = _mm_add_ps(fiz2,tz);
556 fjx0 = _mm_add_ps(fjx0,tx);
557 fjy0 = _mm_add_ps(fjy0,ty);
558 fjz0 = _mm_add_ps(fjz0,tz);
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 if (gmx_mm_any_lt(rsq21,rcutoff2))
569 /* REACTION-FIELD ELECTROSTATICS */
570 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
571 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
573 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velec = _mm_and_ps(velec,cutoff_mask);
577 velecsum = _mm_add_ps(velecsum,velec);
581 fscal = _mm_and_ps(fscal,cutoff_mask);
583 /* Calculate temporary vectorial force */
584 tx = _mm_mul_ps(fscal,dx21);
585 ty = _mm_mul_ps(fscal,dy21);
586 tz = _mm_mul_ps(fscal,dz21);
588 /* Update vectorial force */
589 fix2 = _mm_add_ps(fix2,tx);
590 fiy2 = _mm_add_ps(fiy2,ty);
591 fiz2 = _mm_add_ps(fiz2,tz);
593 fjx1 = _mm_add_ps(fjx1,tx);
594 fjy1 = _mm_add_ps(fjy1,ty);
595 fjz1 = _mm_add_ps(fjz1,tz);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 if (gmx_mm_any_lt(rsq22,rcutoff2))
606 /* REACTION-FIELD ELECTROSTATICS */
607 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
608 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
610 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velec = _mm_and_ps(velec,cutoff_mask);
614 velecsum = _mm_add_ps(velecsum,velec);
618 fscal = _mm_and_ps(fscal,cutoff_mask);
620 /* Calculate temporary vectorial force */
621 tx = _mm_mul_ps(fscal,dx22);
622 ty = _mm_mul_ps(fscal,dy22);
623 tz = _mm_mul_ps(fscal,dz22);
625 /* Update vectorial force */
626 fix2 = _mm_add_ps(fix2,tx);
627 fiy2 = _mm_add_ps(fiy2,ty);
628 fiz2 = _mm_add_ps(fiz2,tz);
630 fjx2 = _mm_add_ps(fjx2,tx);
631 fjy2 = _mm_add_ps(fjy2,ty);
632 fjz2 = _mm_add_ps(fjz2,tz);
636 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
637 f+j_coord_offsetC,f+j_coord_offsetD,
638 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
640 /* Inner loop uses 342 flops */
646 /* Get j neighbor index, and coordinate index */
652 /* Sign of each element will be negative for non-real atoms.
653 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
654 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
656 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
657 jnrA = (jnrA>=0) ? jnrA : 0;
658 jnrB = (jnrB>=0) ? jnrB : 0;
659 jnrC = (jnrC>=0) ? jnrC : 0;
660 jnrD = (jnrD>=0) ? jnrD : 0;
662 j_coord_offsetA = DIM*jnrA;
663 j_coord_offsetB = DIM*jnrB;
664 j_coord_offsetC = DIM*jnrC;
665 j_coord_offsetD = DIM*jnrD;
667 /* load j atom coordinates */
668 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
669 x+j_coord_offsetC,x+j_coord_offsetD,
670 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
672 /* Calculate displacement vector */
673 dx00 = _mm_sub_ps(ix0,jx0);
674 dy00 = _mm_sub_ps(iy0,jy0);
675 dz00 = _mm_sub_ps(iz0,jz0);
676 dx01 = _mm_sub_ps(ix0,jx1);
677 dy01 = _mm_sub_ps(iy0,jy1);
678 dz01 = _mm_sub_ps(iz0,jz1);
679 dx02 = _mm_sub_ps(ix0,jx2);
680 dy02 = _mm_sub_ps(iy0,jy2);
681 dz02 = _mm_sub_ps(iz0,jz2);
682 dx10 = _mm_sub_ps(ix1,jx0);
683 dy10 = _mm_sub_ps(iy1,jy0);
684 dz10 = _mm_sub_ps(iz1,jz0);
685 dx11 = _mm_sub_ps(ix1,jx1);
686 dy11 = _mm_sub_ps(iy1,jy1);
687 dz11 = _mm_sub_ps(iz1,jz1);
688 dx12 = _mm_sub_ps(ix1,jx2);
689 dy12 = _mm_sub_ps(iy1,jy2);
690 dz12 = _mm_sub_ps(iz1,jz2);
691 dx20 = _mm_sub_ps(ix2,jx0);
692 dy20 = _mm_sub_ps(iy2,jy0);
693 dz20 = _mm_sub_ps(iz2,jz0);
694 dx21 = _mm_sub_ps(ix2,jx1);
695 dy21 = _mm_sub_ps(iy2,jy1);
696 dz21 = _mm_sub_ps(iz2,jz1);
697 dx22 = _mm_sub_ps(ix2,jx2);
698 dy22 = _mm_sub_ps(iy2,jy2);
699 dz22 = _mm_sub_ps(iz2,jz2);
701 /* Calculate squared distance and things based on it */
702 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
703 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
704 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
705 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
706 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
707 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
708 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
709 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
710 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
712 rinv00 = gmx_mm_invsqrt_ps(rsq00);
713 rinv01 = gmx_mm_invsqrt_ps(rsq01);
714 rinv02 = gmx_mm_invsqrt_ps(rsq02);
715 rinv10 = gmx_mm_invsqrt_ps(rsq10);
716 rinv11 = gmx_mm_invsqrt_ps(rsq11);
717 rinv12 = gmx_mm_invsqrt_ps(rsq12);
718 rinv20 = gmx_mm_invsqrt_ps(rsq20);
719 rinv21 = gmx_mm_invsqrt_ps(rsq21);
720 rinv22 = gmx_mm_invsqrt_ps(rsq22);
722 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
723 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
724 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
725 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
726 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
727 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
728 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
729 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
730 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
732 fjx0 = _mm_setzero_ps();
733 fjy0 = _mm_setzero_ps();
734 fjz0 = _mm_setzero_ps();
735 fjx1 = _mm_setzero_ps();
736 fjy1 = _mm_setzero_ps();
737 fjz1 = _mm_setzero_ps();
738 fjx2 = _mm_setzero_ps();
739 fjy2 = _mm_setzero_ps();
740 fjz2 = _mm_setzero_ps();
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 if (gmx_mm_any_lt(rsq00,rcutoff2))
749 /* REACTION-FIELD ELECTROSTATICS */
750 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
751 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
753 /* LENNARD-JONES DISPERSION/REPULSION */
755 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
756 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
757 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
758 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
759 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
760 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
762 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
764 /* Update potential sum for this i atom from the interaction with this j atom. */
765 velec = _mm_and_ps(velec,cutoff_mask);
766 velec = _mm_andnot_ps(dummy_mask,velec);
767 velecsum = _mm_add_ps(velecsum,velec);
768 vvdw = _mm_and_ps(vvdw,cutoff_mask);
769 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
770 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
772 fscal = _mm_add_ps(felec,fvdw);
774 fscal = _mm_and_ps(fscal,cutoff_mask);
776 fscal = _mm_andnot_ps(dummy_mask,fscal);
778 /* Calculate temporary vectorial force */
779 tx = _mm_mul_ps(fscal,dx00);
780 ty = _mm_mul_ps(fscal,dy00);
781 tz = _mm_mul_ps(fscal,dz00);
783 /* Update vectorial force */
784 fix0 = _mm_add_ps(fix0,tx);
785 fiy0 = _mm_add_ps(fiy0,ty);
786 fiz0 = _mm_add_ps(fiz0,tz);
788 fjx0 = _mm_add_ps(fjx0,tx);
789 fjy0 = _mm_add_ps(fjy0,ty);
790 fjz0 = _mm_add_ps(fjz0,tz);
794 /**************************
795 * CALCULATE INTERACTIONS *
796 **************************/
798 if (gmx_mm_any_lt(rsq01,rcutoff2))
801 /* REACTION-FIELD ELECTROSTATICS */
802 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
803 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
805 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm_and_ps(velec,cutoff_mask);
809 velec = _mm_andnot_ps(dummy_mask,velec);
810 velecsum = _mm_add_ps(velecsum,velec);
814 fscal = _mm_and_ps(fscal,cutoff_mask);
816 fscal = _mm_andnot_ps(dummy_mask,fscal);
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_ps(fscal,dx01);
820 ty = _mm_mul_ps(fscal,dy01);
821 tz = _mm_mul_ps(fscal,dz01);
823 /* Update vectorial force */
824 fix0 = _mm_add_ps(fix0,tx);
825 fiy0 = _mm_add_ps(fiy0,ty);
826 fiz0 = _mm_add_ps(fiz0,tz);
828 fjx1 = _mm_add_ps(fjx1,tx);
829 fjy1 = _mm_add_ps(fjy1,ty);
830 fjz1 = _mm_add_ps(fjz1,tz);
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 if (gmx_mm_any_lt(rsq02,rcutoff2))
841 /* REACTION-FIELD ELECTROSTATICS */
842 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
843 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
845 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
847 /* Update potential sum for this i atom from the interaction with this j atom. */
848 velec = _mm_and_ps(velec,cutoff_mask);
849 velec = _mm_andnot_ps(dummy_mask,velec);
850 velecsum = _mm_add_ps(velecsum,velec);
854 fscal = _mm_and_ps(fscal,cutoff_mask);
856 fscal = _mm_andnot_ps(dummy_mask,fscal);
858 /* Calculate temporary vectorial force */
859 tx = _mm_mul_ps(fscal,dx02);
860 ty = _mm_mul_ps(fscal,dy02);
861 tz = _mm_mul_ps(fscal,dz02);
863 /* Update vectorial force */
864 fix0 = _mm_add_ps(fix0,tx);
865 fiy0 = _mm_add_ps(fiy0,ty);
866 fiz0 = _mm_add_ps(fiz0,tz);
868 fjx2 = _mm_add_ps(fjx2,tx);
869 fjy2 = _mm_add_ps(fjy2,ty);
870 fjz2 = _mm_add_ps(fjz2,tz);
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 if (gmx_mm_any_lt(rsq10,rcutoff2))
881 /* REACTION-FIELD ELECTROSTATICS */
882 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
883 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
885 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm_and_ps(velec,cutoff_mask);
889 velec = _mm_andnot_ps(dummy_mask,velec);
890 velecsum = _mm_add_ps(velecsum,velec);
894 fscal = _mm_and_ps(fscal,cutoff_mask);
896 fscal = _mm_andnot_ps(dummy_mask,fscal);
898 /* Calculate temporary vectorial force */
899 tx = _mm_mul_ps(fscal,dx10);
900 ty = _mm_mul_ps(fscal,dy10);
901 tz = _mm_mul_ps(fscal,dz10);
903 /* Update vectorial force */
904 fix1 = _mm_add_ps(fix1,tx);
905 fiy1 = _mm_add_ps(fiy1,ty);
906 fiz1 = _mm_add_ps(fiz1,tz);
908 fjx0 = _mm_add_ps(fjx0,tx);
909 fjy0 = _mm_add_ps(fjy0,ty);
910 fjz0 = _mm_add_ps(fjz0,tz);
914 /**************************
915 * CALCULATE INTERACTIONS *
916 **************************/
918 if (gmx_mm_any_lt(rsq11,rcutoff2))
921 /* REACTION-FIELD ELECTROSTATICS */
922 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
923 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
925 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
927 /* Update potential sum for this i atom from the interaction with this j atom. */
928 velec = _mm_and_ps(velec,cutoff_mask);
929 velec = _mm_andnot_ps(dummy_mask,velec);
930 velecsum = _mm_add_ps(velecsum,velec);
934 fscal = _mm_and_ps(fscal,cutoff_mask);
936 fscal = _mm_andnot_ps(dummy_mask,fscal);
938 /* Calculate temporary vectorial force */
939 tx = _mm_mul_ps(fscal,dx11);
940 ty = _mm_mul_ps(fscal,dy11);
941 tz = _mm_mul_ps(fscal,dz11);
943 /* Update vectorial force */
944 fix1 = _mm_add_ps(fix1,tx);
945 fiy1 = _mm_add_ps(fiy1,ty);
946 fiz1 = _mm_add_ps(fiz1,tz);
948 fjx1 = _mm_add_ps(fjx1,tx);
949 fjy1 = _mm_add_ps(fjy1,ty);
950 fjz1 = _mm_add_ps(fjz1,tz);
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 if (gmx_mm_any_lt(rsq12,rcutoff2))
961 /* REACTION-FIELD ELECTROSTATICS */
962 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
963 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
965 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
967 /* Update potential sum for this i atom from the interaction with this j atom. */
968 velec = _mm_and_ps(velec,cutoff_mask);
969 velec = _mm_andnot_ps(dummy_mask,velec);
970 velecsum = _mm_add_ps(velecsum,velec);
974 fscal = _mm_and_ps(fscal,cutoff_mask);
976 fscal = _mm_andnot_ps(dummy_mask,fscal);
978 /* Calculate temporary vectorial force */
979 tx = _mm_mul_ps(fscal,dx12);
980 ty = _mm_mul_ps(fscal,dy12);
981 tz = _mm_mul_ps(fscal,dz12);
983 /* Update vectorial force */
984 fix1 = _mm_add_ps(fix1,tx);
985 fiy1 = _mm_add_ps(fiy1,ty);
986 fiz1 = _mm_add_ps(fiz1,tz);
988 fjx2 = _mm_add_ps(fjx2,tx);
989 fjy2 = _mm_add_ps(fjy2,ty);
990 fjz2 = _mm_add_ps(fjz2,tz);
994 /**************************
995 * CALCULATE INTERACTIONS *
996 **************************/
998 if (gmx_mm_any_lt(rsq20,rcutoff2))
1001 /* REACTION-FIELD ELECTROSTATICS */
1002 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1003 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1005 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1007 /* Update potential sum for this i atom from the interaction with this j atom. */
1008 velec = _mm_and_ps(velec,cutoff_mask);
1009 velec = _mm_andnot_ps(dummy_mask,velec);
1010 velecsum = _mm_add_ps(velecsum,velec);
1014 fscal = _mm_and_ps(fscal,cutoff_mask);
1016 fscal = _mm_andnot_ps(dummy_mask,fscal);
1018 /* Calculate temporary vectorial force */
1019 tx = _mm_mul_ps(fscal,dx20);
1020 ty = _mm_mul_ps(fscal,dy20);
1021 tz = _mm_mul_ps(fscal,dz20);
1023 /* Update vectorial force */
1024 fix2 = _mm_add_ps(fix2,tx);
1025 fiy2 = _mm_add_ps(fiy2,ty);
1026 fiz2 = _mm_add_ps(fiz2,tz);
1028 fjx0 = _mm_add_ps(fjx0,tx);
1029 fjy0 = _mm_add_ps(fjy0,ty);
1030 fjz0 = _mm_add_ps(fjz0,tz);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 if (gmx_mm_any_lt(rsq21,rcutoff2))
1041 /* REACTION-FIELD ELECTROSTATICS */
1042 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1043 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1045 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1047 /* Update potential sum for this i atom from the interaction with this j atom. */
1048 velec = _mm_and_ps(velec,cutoff_mask);
1049 velec = _mm_andnot_ps(dummy_mask,velec);
1050 velecsum = _mm_add_ps(velecsum,velec);
1054 fscal = _mm_and_ps(fscal,cutoff_mask);
1056 fscal = _mm_andnot_ps(dummy_mask,fscal);
1058 /* Calculate temporary vectorial force */
1059 tx = _mm_mul_ps(fscal,dx21);
1060 ty = _mm_mul_ps(fscal,dy21);
1061 tz = _mm_mul_ps(fscal,dz21);
1063 /* Update vectorial force */
1064 fix2 = _mm_add_ps(fix2,tx);
1065 fiy2 = _mm_add_ps(fiy2,ty);
1066 fiz2 = _mm_add_ps(fiz2,tz);
1068 fjx1 = _mm_add_ps(fjx1,tx);
1069 fjy1 = _mm_add_ps(fjy1,ty);
1070 fjz1 = _mm_add_ps(fjz1,tz);
1074 /**************************
1075 * CALCULATE INTERACTIONS *
1076 **************************/
1078 if (gmx_mm_any_lt(rsq22,rcutoff2))
1081 /* REACTION-FIELD ELECTROSTATICS */
1082 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1083 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1085 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1087 /* Update potential sum for this i atom from the interaction with this j atom. */
1088 velec = _mm_and_ps(velec,cutoff_mask);
1089 velec = _mm_andnot_ps(dummy_mask,velec);
1090 velecsum = _mm_add_ps(velecsum,velec);
1094 fscal = _mm_and_ps(fscal,cutoff_mask);
1096 fscal = _mm_andnot_ps(dummy_mask,fscal);
1098 /* Calculate temporary vectorial force */
1099 tx = _mm_mul_ps(fscal,dx22);
1100 ty = _mm_mul_ps(fscal,dy22);
1101 tz = _mm_mul_ps(fscal,dz22);
1103 /* Update vectorial force */
1104 fix2 = _mm_add_ps(fix2,tx);
1105 fiy2 = _mm_add_ps(fiy2,ty);
1106 fiz2 = _mm_add_ps(fiz2,tz);
1108 fjx2 = _mm_add_ps(fjx2,tx);
1109 fjy2 = _mm_add_ps(fjy2,ty);
1110 fjz2 = _mm_add_ps(fjz2,tz);
1114 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1115 f+j_coord_offsetC,f+j_coord_offsetD,
1116 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1118 /* Inner loop uses 342 flops */
1121 /* End of innermost loop */
1123 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1124 f+i_coord_offset,fshift+i_shift_offset);
1127 /* Update potential energies */
1128 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1129 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1131 /* Increment number of inner iterations */
1132 inneriter += j_index_end - j_index_start;
1134 /* Outer loop uses 29 flops */
1137 /* Increment number of outer iterations */
1140 /* Update outer/inner flops */
1142 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*342);
1145 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
1146 * Electrostatics interaction: ReactionField
1147 * VdW interaction: LennardJones
1148 * Geometry: Water3-Water3
1149 * Calculate force/pot: Force
1152 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
1153 (t_nblist * gmx_restrict nlist,
1154 rvec * gmx_restrict xx,
1155 rvec * gmx_restrict ff,
1156 t_forcerec * gmx_restrict fr,
1157 t_mdatoms * gmx_restrict mdatoms,
1158 nb_kernel_data_t * gmx_restrict kernel_data,
1159 t_nrnb * gmx_restrict nrnb)
1161 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1162 * just 0 for non-waters.
1163 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1164 * jnr indices corresponding to data put in the four positions in the SIMD register.
1166 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1167 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1168 int jnrA,jnrB,jnrC,jnrD;
1169 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1170 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1171 real shX,shY,shZ,rcutoff_scalar;
1172 real *shiftvec,*fshift,*x,*f;
1173 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1175 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1177 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1179 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1180 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1181 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1182 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1183 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1184 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1185 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1186 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1187 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1188 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1189 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1190 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1191 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1192 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1193 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1194 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1195 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1198 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1201 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1202 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1203 __m128 dummy_mask,cutoff_mask;
1204 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1205 __m128 one = _mm_set1_ps(1.0);
1206 __m128 two = _mm_set1_ps(2.0);
1212 jindex = nlist->jindex;
1214 shiftidx = nlist->shift;
1216 shiftvec = fr->shift_vec[0];
1217 fshift = fr->fshift[0];
1218 facel = _mm_set1_ps(fr->epsfac);
1219 charge = mdatoms->chargeA;
1220 krf = _mm_set1_ps(fr->ic->k_rf);
1221 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1222 crf = _mm_set1_ps(fr->ic->c_rf);
1223 nvdwtype = fr->ntype;
1224 vdwparam = fr->nbfp;
1225 vdwtype = mdatoms->typeA;
1227 /* Setup water-specific parameters */
1228 inr = nlist->iinr[0];
1229 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1230 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1231 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1232 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1234 jq0 = _mm_set1_ps(charge[inr+0]);
1235 jq1 = _mm_set1_ps(charge[inr+1]);
1236 jq2 = _mm_set1_ps(charge[inr+2]);
1237 vdwjidx0A = 2*vdwtype[inr+0];
1238 qq00 = _mm_mul_ps(iq0,jq0);
1239 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1240 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1241 qq01 = _mm_mul_ps(iq0,jq1);
1242 qq02 = _mm_mul_ps(iq0,jq2);
1243 qq10 = _mm_mul_ps(iq1,jq0);
1244 qq11 = _mm_mul_ps(iq1,jq1);
1245 qq12 = _mm_mul_ps(iq1,jq2);
1246 qq20 = _mm_mul_ps(iq2,jq0);
1247 qq21 = _mm_mul_ps(iq2,jq1);
1248 qq22 = _mm_mul_ps(iq2,jq2);
1250 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1251 rcutoff_scalar = fr->rcoulomb;
1252 rcutoff = _mm_set1_ps(rcutoff_scalar);
1253 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1255 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1256 rvdw = _mm_set1_ps(fr->rvdw);
1258 /* Avoid stupid compiler warnings */
1259 jnrA = jnrB = jnrC = jnrD = 0;
1260 j_coord_offsetA = 0;
1261 j_coord_offsetB = 0;
1262 j_coord_offsetC = 0;
1263 j_coord_offsetD = 0;
1268 /* Start outer loop over neighborlists */
1269 for(iidx=0; iidx<nri; iidx++)
1271 /* Load shift vector for this list */
1272 i_shift_offset = DIM*shiftidx[iidx];
1273 shX = shiftvec[i_shift_offset+XX];
1274 shY = shiftvec[i_shift_offset+YY];
1275 shZ = shiftvec[i_shift_offset+ZZ];
1277 /* Load limits for loop over neighbors */
1278 j_index_start = jindex[iidx];
1279 j_index_end = jindex[iidx+1];
1281 /* Get outer coordinate index */
1283 i_coord_offset = DIM*inr;
1285 /* Load i particle coords and add shift vector */
1286 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1287 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1288 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1289 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1290 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1291 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1292 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1293 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1294 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1296 fix0 = _mm_setzero_ps();
1297 fiy0 = _mm_setzero_ps();
1298 fiz0 = _mm_setzero_ps();
1299 fix1 = _mm_setzero_ps();
1300 fiy1 = _mm_setzero_ps();
1301 fiz1 = _mm_setzero_ps();
1302 fix2 = _mm_setzero_ps();
1303 fiy2 = _mm_setzero_ps();
1304 fiz2 = _mm_setzero_ps();
1306 /* Start inner kernel loop */
1307 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1310 /* Get j neighbor index, and coordinate index */
1312 jnrB = jjnr[jidx+1];
1313 jnrC = jjnr[jidx+2];
1314 jnrD = jjnr[jidx+3];
1316 j_coord_offsetA = DIM*jnrA;
1317 j_coord_offsetB = DIM*jnrB;
1318 j_coord_offsetC = DIM*jnrC;
1319 j_coord_offsetD = DIM*jnrD;
1321 /* load j atom coordinates */
1322 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1323 x+j_coord_offsetC,x+j_coord_offsetD,
1324 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1326 /* Calculate displacement vector */
1327 dx00 = _mm_sub_ps(ix0,jx0);
1328 dy00 = _mm_sub_ps(iy0,jy0);
1329 dz00 = _mm_sub_ps(iz0,jz0);
1330 dx01 = _mm_sub_ps(ix0,jx1);
1331 dy01 = _mm_sub_ps(iy0,jy1);
1332 dz01 = _mm_sub_ps(iz0,jz1);
1333 dx02 = _mm_sub_ps(ix0,jx2);
1334 dy02 = _mm_sub_ps(iy0,jy2);
1335 dz02 = _mm_sub_ps(iz0,jz2);
1336 dx10 = _mm_sub_ps(ix1,jx0);
1337 dy10 = _mm_sub_ps(iy1,jy0);
1338 dz10 = _mm_sub_ps(iz1,jz0);
1339 dx11 = _mm_sub_ps(ix1,jx1);
1340 dy11 = _mm_sub_ps(iy1,jy1);
1341 dz11 = _mm_sub_ps(iz1,jz1);
1342 dx12 = _mm_sub_ps(ix1,jx2);
1343 dy12 = _mm_sub_ps(iy1,jy2);
1344 dz12 = _mm_sub_ps(iz1,jz2);
1345 dx20 = _mm_sub_ps(ix2,jx0);
1346 dy20 = _mm_sub_ps(iy2,jy0);
1347 dz20 = _mm_sub_ps(iz2,jz0);
1348 dx21 = _mm_sub_ps(ix2,jx1);
1349 dy21 = _mm_sub_ps(iy2,jy1);
1350 dz21 = _mm_sub_ps(iz2,jz1);
1351 dx22 = _mm_sub_ps(ix2,jx2);
1352 dy22 = _mm_sub_ps(iy2,jy2);
1353 dz22 = _mm_sub_ps(iz2,jz2);
1355 /* Calculate squared distance and things based on it */
1356 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1357 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1358 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1359 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1360 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1361 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1362 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1363 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1364 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1366 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1367 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1368 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1369 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1370 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1371 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1372 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1373 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1374 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1376 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1377 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1378 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1379 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1380 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1381 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1382 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1383 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1384 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1386 fjx0 = _mm_setzero_ps();
1387 fjy0 = _mm_setzero_ps();
1388 fjz0 = _mm_setzero_ps();
1389 fjx1 = _mm_setzero_ps();
1390 fjy1 = _mm_setzero_ps();
1391 fjz1 = _mm_setzero_ps();
1392 fjx2 = _mm_setzero_ps();
1393 fjy2 = _mm_setzero_ps();
1394 fjz2 = _mm_setzero_ps();
1396 /**************************
1397 * CALCULATE INTERACTIONS *
1398 **************************/
1400 if (gmx_mm_any_lt(rsq00,rcutoff2))
1403 /* REACTION-FIELD ELECTROSTATICS */
1404 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1406 /* LENNARD-JONES DISPERSION/REPULSION */
1408 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1409 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1411 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1413 fscal = _mm_add_ps(felec,fvdw);
1415 fscal = _mm_and_ps(fscal,cutoff_mask);
1417 /* Calculate temporary vectorial force */
1418 tx = _mm_mul_ps(fscal,dx00);
1419 ty = _mm_mul_ps(fscal,dy00);
1420 tz = _mm_mul_ps(fscal,dz00);
1422 /* Update vectorial force */
1423 fix0 = _mm_add_ps(fix0,tx);
1424 fiy0 = _mm_add_ps(fiy0,ty);
1425 fiz0 = _mm_add_ps(fiz0,tz);
1427 fjx0 = _mm_add_ps(fjx0,tx);
1428 fjy0 = _mm_add_ps(fjy0,ty);
1429 fjz0 = _mm_add_ps(fjz0,tz);
1433 /**************************
1434 * CALCULATE INTERACTIONS *
1435 **************************/
1437 if (gmx_mm_any_lt(rsq01,rcutoff2))
1440 /* REACTION-FIELD ELECTROSTATICS */
1441 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1443 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1447 fscal = _mm_and_ps(fscal,cutoff_mask);
1449 /* Calculate temporary vectorial force */
1450 tx = _mm_mul_ps(fscal,dx01);
1451 ty = _mm_mul_ps(fscal,dy01);
1452 tz = _mm_mul_ps(fscal,dz01);
1454 /* Update vectorial force */
1455 fix0 = _mm_add_ps(fix0,tx);
1456 fiy0 = _mm_add_ps(fiy0,ty);
1457 fiz0 = _mm_add_ps(fiz0,tz);
1459 fjx1 = _mm_add_ps(fjx1,tx);
1460 fjy1 = _mm_add_ps(fjy1,ty);
1461 fjz1 = _mm_add_ps(fjz1,tz);
1465 /**************************
1466 * CALCULATE INTERACTIONS *
1467 **************************/
1469 if (gmx_mm_any_lt(rsq02,rcutoff2))
1472 /* REACTION-FIELD ELECTROSTATICS */
1473 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1475 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1479 fscal = _mm_and_ps(fscal,cutoff_mask);
1481 /* Calculate temporary vectorial force */
1482 tx = _mm_mul_ps(fscal,dx02);
1483 ty = _mm_mul_ps(fscal,dy02);
1484 tz = _mm_mul_ps(fscal,dz02);
1486 /* Update vectorial force */
1487 fix0 = _mm_add_ps(fix0,tx);
1488 fiy0 = _mm_add_ps(fiy0,ty);
1489 fiz0 = _mm_add_ps(fiz0,tz);
1491 fjx2 = _mm_add_ps(fjx2,tx);
1492 fjy2 = _mm_add_ps(fjy2,ty);
1493 fjz2 = _mm_add_ps(fjz2,tz);
1497 /**************************
1498 * CALCULATE INTERACTIONS *
1499 **************************/
1501 if (gmx_mm_any_lt(rsq10,rcutoff2))
1504 /* REACTION-FIELD ELECTROSTATICS */
1505 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1507 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1511 fscal = _mm_and_ps(fscal,cutoff_mask);
1513 /* Calculate temporary vectorial force */
1514 tx = _mm_mul_ps(fscal,dx10);
1515 ty = _mm_mul_ps(fscal,dy10);
1516 tz = _mm_mul_ps(fscal,dz10);
1518 /* Update vectorial force */
1519 fix1 = _mm_add_ps(fix1,tx);
1520 fiy1 = _mm_add_ps(fiy1,ty);
1521 fiz1 = _mm_add_ps(fiz1,tz);
1523 fjx0 = _mm_add_ps(fjx0,tx);
1524 fjy0 = _mm_add_ps(fjy0,ty);
1525 fjz0 = _mm_add_ps(fjz0,tz);
1529 /**************************
1530 * CALCULATE INTERACTIONS *
1531 **************************/
1533 if (gmx_mm_any_lt(rsq11,rcutoff2))
1536 /* REACTION-FIELD ELECTROSTATICS */
1537 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1539 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1543 fscal = _mm_and_ps(fscal,cutoff_mask);
1545 /* Calculate temporary vectorial force */
1546 tx = _mm_mul_ps(fscal,dx11);
1547 ty = _mm_mul_ps(fscal,dy11);
1548 tz = _mm_mul_ps(fscal,dz11);
1550 /* Update vectorial force */
1551 fix1 = _mm_add_ps(fix1,tx);
1552 fiy1 = _mm_add_ps(fiy1,ty);
1553 fiz1 = _mm_add_ps(fiz1,tz);
1555 fjx1 = _mm_add_ps(fjx1,tx);
1556 fjy1 = _mm_add_ps(fjy1,ty);
1557 fjz1 = _mm_add_ps(fjz1,tz);
1561 /**************************
1562 * CALCULATE INTERACTIONS *
1563 **************************/
1565 if (gmx_mm_any_lt(rsq12,rcutoff2))
1568 /* REACTION-FIELD ELECTROSTATICS */
1569 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1571 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1575 fscal = _mm_and_ps(fscal,cutoff_mask);
1577 /* Calculate temporary vectorial force */
1578 tx = _mm_mul_ps(fscal,dx12);
1579 ty = _mm_mul_ps(fscal,dy12);
1580 tz = _mm_mul_ps(fscal,dz12);
1582 /* Update vectorial force */
1583 fix1 = _mm_add_ps(fix1,tx);
1584 fiy1 = _mm_add_ps(fiy1,ty);
1585 fiz1 = _mm_add_ps(fiz1,tz);
1587 fjx2 = _mm_add_ps(fjx2,tx);
1588 fjy2 = _mm_add_ps(fjy2,ty);
1589 fjz2 = _mm_add_ps(fjz2,tz);
1593 /**************************
1594 * CALCULATE INTERACTIONS *
1595 **************************/
1597 if (gmx_mm_any_lt(rsq20,rcutoff2))
1600 /* REACTION-FIELD ELECTROSTATICS */
1601 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1603 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1607 fscal = _mm_and_ps(fscal,cutoff_mask);
1609 /* Calculate temporary vectorial force */
1610 tx = _mm_mul_ps(fscal,dx20);
1611 ty = _mm_mul_ps(fscal,dy20);
1612 tz = _mm_mul_ps(fscal,dz20);
1614 /* Update vectorial force */
1615 fix2 = _mm_add_ps(fix2,tx);
1616 fiy2 = _mm_add_ps(fiy2,ty);
1617 fiz2 = _mm_add_ps(fiz2,tz);
1619 fjx0 = _mm_add_ps(fjx0,tx);
1620 fjy0 = _mm_add_ps(fjy0,ty);
1621 fjz0 = _mm_add_ps(fjz0,tz);
1625 /**************************
1626 * CALCULATE INTERACTIONS *
1627 **************************/
1629 if (gmx_mm_any_lt(rsq21,rcutoff2))
1632 /* REACTION-FIELD ELECTROSTATICS */
1633 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1635 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1639 fscal = _mm_and_ps(fscal,cutoff_mask);
1641 /* Calculate temporary vectorial force */
1642 tx = _mm_mul_ps(fscal,dx21);
1643 ty = _mm_mul_ps(fscal,dy21);
1644 tz = _mm_mul_ps(fscal,dz21);
1646 /* Update vectorial force */
1647 fix2 = _mm_add_ps(fix2,tx);
1648 fiy2 = _mm_add_ps(fiy2,ty);
1649 fiz2 = _mm_add_ps(fiz2,tz);
1651 fjx1 = _mm_add_ps(fjx1,tx);
1652 fjy1 = _mm_add_ps(fjy1,ty);
1653 fjz1 = _mm_add_ps(fjz1,tz);
1657 /**************************
1658 * CALCULATE INTERACTIONS *
1659 **************************/
1661 if (gmx_mm_any_lt(rsq22,rcutoff2))
1664 /* REACTION-FIELD ELECTROSTATICS */
1665 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1667 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1671 fscal = _mm_and_ps(fscal,cutoff_mask);
1673 /* Calculate temporary vectorial force */
1674 tx = _mm_mul_ps(fscal,dx22);
1675 ty = _mm_mul_ps(fscal,dy22);
1676 tz = _mm_mul_ps(fscal,dz22);
1678 /* Update vectorial force */
1679 fix2 = _mm_add_ps(fix2,tx);
1680 fiy2 = _mm_add_ps(fiy2,ty);
1681 fiz2 = _mm_add_ps(fiz2,tz);
1683 fjx2 = _mm_add_ps(fjx2,tx);
1684 fjy2 = _mm_add_ps(fjy2,ty);
1685 fjz2 = _mm_add_ps(fjz2,tz);
1689 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1690 f+j_coord_offsetC,f+j_coord_offsetD,
1691 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1693 /* Inner loop uses 277 flops */
1696 if(jidx<j_index_end)
1699 /* Get j neighbor index, and coordinate index */
1701 jnrB = jjnr[jidx+1];
1702 jnrC = jjnr[jidx+2];
1703 jnrD = jjnr[jidx+3];
1705 /* Sign of each element will be negative for non-real atoms.
1706 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1707 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1709 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1710 jnrA = (jnrA>=0) ? jnrA : 0;
1711 jnrB = (jnrB>=0) ? jnrB : 0;
1712 jnrC = (jnrC>=0) ? jnrC : 0;
1713 jnrD = (jnrD>=0) ? jnrD : 0;
1715 j_coord_offsetA = DIM*jnrA;
1716 j_coord_offsetB = DIM*jnrB;
1717 j_coord_offsetC = DIM*jnrC;
1718 j_coord_offsetD = DIM*jnrD;
1720 /* load j atom coordinates */
1721 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1722 x+j_coord_offsetC,x+j_coord_offsetD,
1723 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1725 /* Calculate displacement vector */
1726 dx00 = _mm_sub_ps(ix0,jx0);
1727 dy00 = _mm_sub_ps(iy0,jy0);
1728 dz00 = _mm_sub_ps(iz0,jz0);
1729 dx01 = _mm_sub_ps(ix0,jx1);
1730 dy01 = _mm_sub_ps(iy0,jy1);
1731 dz01 = _mm_sub_ps(iz0,jz1);
1732 dx02 = _mm_sub_ps(ix0,jx2);
1733 dy02 = _mm_sub_ps(iy0,jy2);
1734 dz02 = _mm_sub_ps(iz0,jz2);
1735 dx10 = _mm_sub_ps(ix1,jx0);
1736 dy10 = _mm_sub_ps(iy1,jy0);
1737 dz10 = _mm_sub_ps(iz1,jz0);
1738 dx11 = _mm_sub_ps(ix1,jx1);
1739 dy11 = _mm_sub_ps(iy1,jy1);
1740 dz11 = _mm_sub_ps(iz1,jz1);
1741 dx12 = _mm_sub_ps(ix1,jx2);
1742 dy12 = _mm_sub_ps(iy1,jy2);
1743 dz12 = _mm_sub_ps(iz1,jz2);
1744 dx20 = _mm_sub_ps(ix2,jx0);
1745 dy20 = _mm_sub_ps(iy2,jy0);
1746 dz20 = _mm_sub_ps(iz2,jz0);
1747 dx21 = _mm_sub_ps(ix2,jx1);
1748 dy21 = _mm_sub_ps(iy2,jy1);
1749 dz21 = _mm_sub_ps(iz2,jz1);
1750 dx22 = _mm_sub_ps(ix2,jx2);
1751 dy22 = _mm_sub_ps(iy2,jy2);
1752 dz22 = _mm_sub_ps(iz2,jz2);
1754 /* Calculate squared distance and things based on it */
1755 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1756 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1757 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1758 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1759 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1760 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1761 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1762 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1763 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1765 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1766 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1767 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1768 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1769 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1770 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1771 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1772 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1773 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1775 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1776 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1777 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1778 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1779 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1780 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1781 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1782 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1783 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1785 fjx0 = _mm_setzero_ps();
1786 fjy0 = _mm_setzero_ps();
1787 fjz0 = _mm_setzero_ps();
1788 fjx1 = _mm_setzero_ps();
1789 fjy1 = _mm_setzero_ps();
1790 fjz1 = _mm_setzero_ps();
1791 fjx2 = _mm_setzero_ps();
1792 fjy2 = _mm_setzero_ps();
1793 fjz2 = _mm_setzero_ps();
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 if (gmx_mm_any_lt(rsq00,rcutoff2))
1802 /* REACTION-FIELD ELECTROSTATICS */
1803 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1805 /* LENNARD-JONES DISPERSION/REPULSION */
1807 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1808 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1810 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1812 fscal = _mm_add_ps(felec,fvdw);
1814 fscal = _mm_and_ps(fscal,cutoff_mask);
1816 fscal = _mm_andnot_ps(dummy_mask,fscal);
1818 /* Calculate temporary vectorial force */
1819 tx = _mm_mul_ps(fscal,dx00);
1820 ty = _mm_mul_ps(fscal,dy00);
1821 tz = _mm_mul_ps(fscal,dz00);
1823 /* Update vectorial force */
1824 fix0 = _mm_add_ps(fix0,tx);
1825 fiy0 = _mm_add_ps(fiy0,ty);
1826 fiz0 = _mm_add_ps(fiz0,tz);
1828 fjx0 = _mm_add_ps(fjx0,tx);
1829 fjy0 = _mm_add_ps(fjy0,ty);
1830 fjz0 = _mm_add_ps(fjz0,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 if (gmx_mm_any_lt(rsq01,rcutoff2))
1841 /* REACTION-FIELD ELECTROSTATICS */
1842 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1844 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1848 fscal = _mm_and_ps(fscal,cutoff_mask);
1850 fscal = _mm_andnot_ps(dummy_mask,fscal);
1852 /* Calculate temporary vectorial force */
1853 tx = _mm_mul_ps(fscal,dx01);
1854 ty = _mm_mul_ps(fscal,dy01);
1855 tz = _mm_mul_ps(fscal,dz01);
1857 /* Update vectorial force */
1858 fix0 = _mm_add_ps(fix0,tx);
1859 fiy0 = _mm_add_ps(fiy0,ty);
1860 fiz0 = _mm_add_ps(fiz0,tz);
1862 fjx1 = _mm_add_ps(fjx1,tx);
1863 fjy1 = _mm_add_ps(fjy1,ty);
1864 fjz1 = _mm_add_ps(fjz1,tz);
1868 /**************************
1869 * CALCULATE INTERACTIONS *
1870 **************************/
1872 if (gmx_mm_any_lt(rsq02,rcutoff2))
1875 /* REACTION-FIELD ELECTROSTATICS */
1876 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1878 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1882 fscal = _mm_and_ps(fscal,cutoff_mask);
1884 fscal = _mm_andnot_ps(dummy_mask,fscal);
1886 /* Calculate temporary vectorial force */
1887 tx = _mm_mul_ps(fscal,dx02);
1888 ty = _mm_mul_ps(fscal,dy02);
1889 tz = _mm_mul_ps(fscal,dz02);
1891 /* Update vectorial force */
1892 fix0 = _mm_add_ps(fix0,tx);
1893 fiy0 = _mm_add_ps(fiy0,ty);
1894 fiz0 = _mm_add_ps(fiz0,tz);
1896 fjx2 = _mm_add_ps(fjx2,tx);
1897 fjy2 = _mm_add_ps(fjy2,ty);
1898 fjz2 = _mm_add_ps(fjz2,tz);
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 if (gmx_mm_any_lt(rsq10,rcutoff2))
1909 /* REACTION-FIELD ELECTROSTATICS */
1910 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1912 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1916 fscal = _mm_and_ps(fscal,cutoff_mask);
1918 fscal = _mm_andnot_ps(dummy_mask,fscal);
1920 /* Calculate temporary vectorial force */
1921 tx = _mm_mul_ps(fscal,dx10);
1922 ty = _mm_mul_ps(fscal,dy10);
1923 tz = _mm_mul_ps(fscal,dz10);
1925 /* Update vectorial force */
1926 fix1 = _mm_add_ps(fix1,tx);
1927 fiy1 = _mm_add_ps(fiy1,ty);
1928 fiz1 = _mm_add_ps(fiz1,tz);
1930 fjx0 = _mm_add_ps(fjx0,tx);
1931 fjy0 = _mm_add_ps(fjy0,ty);
1932 fjz0 = _mm_add_ps(fjz0,tz);
1936 /**************************
1937 * CALCULATE INTERACTIONS *
1938 **************************/
1940 if (gmx_mm_any_lt(rsq11,rcutoff2))
1943 /* REACTION-FIELD ELECTROSTATICS */
1944 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1946 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1950 fscal = _mm_and_ps(fscal,cutoff_mask);
1952 fscal = _mm_andnot_ps(dummy_mask,fscal);
1954 /* Calculate temporary vectorial force */
1955 tx = _mm_mul_ps(fscal,dx11);
1956 ty = _mm_mul_ps(fscal,dy11);
1957 tz = _mm_mul_ps(fscal,dz11);
1959 /* Update vectorial force */
1960 fix1 = _mm_add_ps(fix1,tx);
1961 fiy1 = _mm_add_ps(fiy1,ty);
1962 fiz1 = _mm_add_ps(fiz1,tz);
1964 fjx1 = _mm_add_ps(fjx1,tx);
1965 fjy1 = _mm_add_ps(fjy1,ty);
1966 fjz1 = _mm_add_ps(fjz1,tz);
1970 /**************************
1971 * CALCULATE INTERACTIONS *
1972 **************************/
1974 if (gmx_mm_any_lt(rsq12,rcutoff2))
1977 /* REACTION-FIELD ELECTROSTATICS */
1978 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1980 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1984 fscal = _mm_and_ps(fscal,cutoff_mask);
1986 fscal = _mm_andnot_ps(dummy_mask,fscal);
1988 /* Calculate temporary vectorial force */
1989 tx = _mm_mul_ps(fscal,dx12);
1990 ty = _mm_mul_ps(fscal,dy12);
1991 tz = _mm_mul_ps(fscal,dz12);
1993 /* Update vectorial force */
1994 fix1 = _mm_add_ps(fix1,tx);
1995 fiy1 = _mm_add_ps(fiy1,ty);
1996 fiz1 = _mm_add_ps(fiz1,tz);
1998 fjx2 = _mm_add_ps(fjx2,tx);
1999 fjy2 = _mm_add_ps(fjy2,ty);
2000 fjz2 = _mm_add_ps(fjz2,tz);
2004 /**************************
2005 * CALCULATE INTERACTIONS *
2006 **************************/
2008 if (gmx_mm_any_lt(rsq20,rcutoff2))
2011 /* REACTION-FIELD ELECTROSTATICS */
2012 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2014 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2018 fscal = _mm_and_ps(fscal,cutoff_mask);
2020 fscal = _mm_andnot_ps(dummy_mask,fscal);
2022 /* Calculate temporary vectorial force */
2023 tx = _mm_mul_ps(fscal,dx20);
2024 ty = _mm_mul_ps(fscal,dy20);
2025 tz = _mm_mul_ps(fscal,dz20);
2027 /* Update vectorial force */
2028 fix2 = _mm_add_ps(fix2,tx);
2029 fiy2 = _mm_add_ps(fiy2,ty);
2030 fiz2 = _mm_add_ps(fiz2,tz);
2032 fjx0 = _mm_add_ps(fjx0,tx);
2033 fjy0 = _mm_add_ps(fjy0,ty);
2034 fjz0 = _mm_add_ps(fjz0,tz);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 if (gmx_mm_any_lt(rsq21,rcutoff2))
2045 /* REACTION-FIELD ELECTROSTATICS */
2046 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2048 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2052 fscal = _mm_and_ps(fscal,cutoff_mask);
2054 fscal = _mm_andnot_ps(dummy_mask,fscal);
2056 /* Calculate temporary vectorial force */
2057 tx = _mm_mul_ps(fscal,dx21);
2058 ty = _mm_mul_ps(fscal,dy21);
2059 tz = _mm_mul_ps(fscal,dz21);
2061 /* Update vectorial force */
2062 fix2 = _mm_add_ps(fix2,tx);
2063 fiy2 = _mm_add_ps(fiy2,ty);
2064 fiz2 = _mm_add_ps(fiz2,tz);
2066 fjx1 = _mm_add_ps(fjx1,tx);
2067 fjy1 = _mm_add_ps(fjy1,ty);
2068 fjz1 = _mm_add_ps(fjz1,tz);
2072 /**************************
2073 * CALCULATE INTERACTIONS *
2074 **************************/
2076 if (gmx_mm_any_lt(rsq22,rcutoff2))
2079 /* REACTION-FIELD ELECTROSTATICS */
2080 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2082 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2086 fscal = _mm_and_ps(fscal,cutoff_mask);
2088 fscal = _mm_andnot_ps(dummy_mask,fscal);
2090 /* Calculate temporary vectorial force */
2091 tx = _mm_mul_ps(fscal,dx22);
2092 ty = _mm_mul_ps(fscal,dy22);
2093 tz = _mm_mul_ps(fscal,dz22);
2095 /* Update vectorial force */
2096 fix2 = _mm_add_ps(fix2,tx);
2097 fiy2 = _mm_add_ps(fiy2,ty);
2098 fiz2 = _mm_add_ps(fiz2,tz);
2100 fjx2 = _mm_add_ps(fjx2,tx);
2101 fjy2 = _mm_add_ps(fjy2,ty);
2102 fjz2 = _mm_add_ps(fjz2,tz);
2106 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2107 f+j_coord_offsetC,f+j_coord_offsetD,
2108 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2110 /* Inner loop uses 277 flops */
2113 /* End of innermost loop */
2115 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2116 f+i_coord_offset,fshift+i_shift_offset);
2118 /* Increment number of inner iterations */
2119 inneriter += j_index_end - j_index_start;
2121 /* Outer loop uses 27 flops */
2124 /* Increment number of outer iterations */
2127 /* Update outer/inner flops */
2129 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*277);