2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * Signature and local declarations of the potential-and-force ("VF") kernel
 * nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double, followed by the fetch
 * of neighbor-list, force-record and atom data into locals.
 *
 * Parameters as named in the signature below: nlist (source of jindex and
 * shift), xx, ff, fr (source of shift_vec, fshift, epsfac, ntype and the
 * reaction-field constants k_rf and c_rf), mdatoms (source of chargeA and
 * typeA), kernel_data, nrnb.
 *
 * NOTE(review): the text below appears to be missing lines - the embedded
 * line numbers skip (for example 51 to 53, 59 to 61, 104 to 106), and the
 * return type, the opening brace and the declarations and/or assignments of
 * several names used later (jnrA, jnrB, charge, vdwparam, nri, x, f, ...)
 * are not visible. Confirm against the kernel generator output.
 *
 * NOTE(review): the suffix comment below speaks of "four positions in the
 * SIMD register"; the registers declared here are __m128d, which hold two
 * doubles, matching the A,B suffix pair. The wording looks inherited from a
 * single-precision variant - confirm.
 */
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
95 __m128d dummy_mask,cutoff_mask;
96 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
97 __m128d one = _mm_set1_pd(1.0);
98 __m128d two = _mm_set1_pd(2.0);
104 jindex = nlist->jindex;
106 shiftidx = nlist->shift;
108 shiftvec = fr->shift_vec[0];
109 fshift = fr->fshift[0];
110 facel = _mm_set1_pd(fr->epsfac);
111 charge = mdatoms->chargeA;
112 krf = _mm_set1_pd(fr->ic->k_rf);
113 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
114 crf = _mm_set1_pd(fr->ic->c_rf);
115 nvdwtype = fr->ntype;
117 vdwtype = mdatoms->typeA;
119 /* Setup water-specific parameters */
/*
 * One-time parameter setup, done before any loop. inr is taken from the
 * first i entry of the list, and both the i-side charges (iq0..iq2, already
 * multiplied by facel) and the j-side charges (jq0..jq2) are read from
 * charge[inr+0..2]. The nine qqXY products are therefore computed once here
 * rather than per neighbor. c6_00 and c12_00 are likewise read once from
 * vdwparam, using the type of atom inr+0 for both the i offset and the j
 * index.
 * NOTE(review): this presumes every water in the list shares the charges
 * and type of the first one - confirm with the neighbor-list builder.
 * NOTE(review): no assignment to vdwparam is visible in this text.
 */
120 inr = nlist->iinr[0];
121 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
122 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
123 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
124 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
126 jq0 = _mm_set1_pd(charge[inr+0]);
127 jq1 = _mm_set1_pd(charge[inr+1]);
128 jq2 = _mm_set1_pd(charge[inr+2]);
129 vdwjidx0A = 2*vdwtype[inr+0];
130 qq00 = _mm_mul_pd(iq0,jq0);
/* c6 sits at the even slot and c12 at the following odd slot of vdwparam. */
131 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
132 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
133 qq01 = _mm_mul_pd(iq0,jq1);
134 qq02 = _mm_mul_pd(iq0,jq2);
135 qq10 = _mm_mul_pd(iq1,jq0);
136 qq11 = _mm_mul_pd(iq1,jq1);
137 qq12 = _mm_mul_pd(iq1,jq2);
138 qq20 = _mm_mul_pd(iq2,jq0);
139 qq21 = _mm_mul_pd(iq2,jq1);
140 qq22 = _mm_mul_pd(iq2,jq2);
142 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
143 rcutoff_scalar = fr->rcoulomb;
144 rcutoff = _mm_set1_pd(rcutoff_scalar);
/* Squared cutoff, so later tests can compare against rsq without a sqrt. */
145 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
/* Constant subtracted (scaled by c6, and squared and scaled by c12) in the shifted LJ potential. */
147 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
/* NOTE(review): rvdw is set here but not referenced again in the visible part of this function. */
148 rvdw = _mm_set1_pd(fr->rvdw);
/* NOTE(review): the statements this comment introduces are not visible in this text. */
150 /* Avoid stupid compiler warnings */
/*
 * Outer loop prologue: one iteration per i entry (iidx from 0 up to nri).
 * Each iteration computes the shift-vector offset from shiftidx, reads the
 * j index range [j_index_start, j_index_end) from jindex, fills ix0..iz2
 * from shiftvec and x, and zeroes the nine i-force accumulators and the two
 * potential sums.
 * NOTE(review): the per-iteration assignment of inr and the loop braces are
 * not visible in this text; i_coord_offset below depends on inr.
 */
158 /* Start outer loop over neighborlists */
159 for(iidx=0; iidx<nri; iidx++)
161 /* Load shift vector for this list */
162 i_shift_offset = DIM*shiftidx[iidx];
164 /* Load limits for loop over neighbors */
165 j_index_start = jindex[iidx];
166 j_index_end = jindex[iidx+1];
168 /* Get outer coordinate index */
170 i_coord_offset = DIM*inr;
172 /* Load i particle coords and add shift vector */
/* Presumably yields position plus shift, broadcast to both lanes - per the helper's name; verify in kernelutil. */
173 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
174 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* i-force accumulators start from zero for every outer iteration. */
176 fix0 = _mm_setzero_pd();
177 fiy0 = _mm_setzero_pd();
178 fiz0 = _mm_setzero_pd();
179 fix1 = _mm_setzero_pd();
180 fiy1 = _mm_setzero_pd();
181 fiz1 = _mm_setzero_pd();
182 fix2 = _mm_setzero_pd();
183 fiy2 = _mm_setzero_pd();
184 fiz2 = _mm_setzero_pd();
186 /* Reset potential sums */
187 velecsum = _mm_setzero_pd();
188 vvdwsum = _mm_setzero_pd();
/*
 * Unrolled inner loop, geometry part. jidx advances by 2 and the loop runs
 * only while two entries remain (jidx < j_index_end-1), so each pass handles
 * two j entries (A and B). This part loads the j coordinates, forms the
 * displacement for each of the nine atom pairs, and derives rsq, rinv and
 * rinvsq from it. In every XY suffix, X is the i atom and Y is the j atom.
 * NOTE(review): the assignments of jnrA and jnrB and the loop braces are
 * not visible in this text.
 */
190 /* Start inner kernel loop */
191 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
194 /* Get j neighbor index, and coordinate index */
197 j_coord_offsetA = DIM*jnrA;
198 j_coord_offsetB = DIM*jnrB;
200 /* load j atom coordinates */
201 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
202 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
204 /* Calculate displacement vector */
/* Displacements are i minus j for all nine (i atom, j atom) pairs. */
205 dx00 = _mm_sub_pd(ix0,jx0);
206 dy00 = _mm_sub_pd(iy0,jy0);
207 dz00 = _mm_sub_pd(iz0,jz0);
208 dx01 = _mm_sub_pd(ix0,jx1);
209 dy01 = _mm_sub_pd(iy0,jy1);
210 dz01 = _mm_sub_pd(iz0,jz1);
211 dx02 = _mm_sub_pd(ix0,jx2);
212 dy02 = _mm_sub_pd(iy0,jy2);
213 dz02 = _mm_sub_pd(iz0,jz2);
214 dx10 = _mm_sub_pd(ix1,jx0);
215 dy10 = _mm_sub_pd(iy1,jy0);
216 dz10 = _mm_sub_pd(iz1,jz0);
217 dx11 = _mm_sub_pd(ix1,jx1);
218 dy11 = _mm_sub_pd(iy1,jy1);
219 dz11 = _mm_sub_pd(iz1,jz1);
220 dx12 = _mm_sub_pd(ix1,jx2);
221 dy12 = _mm_sub_pd(iy1,jy2);
222 dz12 = _mm_sub_pd(iz1,jz2);
223 dx20 = _mm_sub_pd(ix2,jx0);
224 dy20 = _mm_sub_pd(iy2,jy0);
225 dz20 = _mm_sub_pd(iz2,jz0);
226 dx21 = _mm_sub_pd(ix2,jx1);
227 dy21 = _mm_sub_pd(iy2,jy1);
228 dz21 = _mm_sub_pd(iz2,jz1);
229 dx22 = _mm_sub_pd(ix2,jx2);
230 dy22 = _mm_sub_pd(iy2,jy2);
231 dz22 = _mm_sub_pd(iz2,jz2);
233 /* Calculate squared distance and things based on it */
234 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
235 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
236 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
237 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
238 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
239 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
240 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
241 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
242 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
/* rinvXY is presumably 1/sqrt(rsqXY), per the helper's name; verify in gmx_math_x86_sse2_double.h. */
244 rinv00 = gmx_mm_invsqrt_pd(rsq00);
245 rinv01 = gmx_mm_invsqrt_pd(rsq01);
246 rinv02 = gmx_mm_invsqrt_pd(rsq02);
247 rinv10 = gmx_mm_invsqrt_pd(rsq10);
248 rinv11 = gmx_mm_invsqrt_pd(rsq11);
249 rinv12 = gmx_mm_invsqrt_pd(rsq12);
250 rinv20 = gmx_mm_invsqrt_pd(rsq20);
251 rinv21 = gmx_mm_invsqrt_pd(rsq21);
252 rinv22 = gmx_mm_invsqrt_pd(rsq22);
254 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
255 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
256 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
257 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
258 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
259 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
260 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
261 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
262 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
/* j-force accumulators are reset for every pair of j entries. */
264 fjx0 = _mm_setzero_pd();
265 fjy0 = _mm_setzero_pd();
266 fjz0 = _mm_setzero_pd();
267 fjx1 = _mm_setzero_pd();
268 fjy1 = _mm_setzero_pd();
269 fjz1 = _mm_setzero_pd();
270 fjx2 = _mm_setzero_pd();
271 fjy2 = _mm_setzero_pd();
272 fjz2 = _mm_setzero_pd();
/*
 * Pair 00 (i atom 0 with j atom 0) in the unrolled loop: the only pair in
 * the loop body that evaluates Lennard-Jones in addition to reaction-field
 * electrostatics. The section is entered when gmx_mm_any_lt(rsq00,rcutoff2)
 * holds (presumably: at least one lane is inside the cutoff - per the
 * helper's name); lanes outside the cutoff are then zeroed individually
 * through cutoff_mask.
 * NOTE(review): the braces delimiting the if-body are not visible here.
 */
274 /**************************
275 * CALCULATE INTERACTIONS *
276 **************************/
278 if (gmx_mm_any_lt(rsq00,rcutoff2))
281 /* REACTION-FIELD ELECTROSTATICS */
/* velec = qq00*(rinv00 + krf*rsq00 - crf); felec = qq00*(rinv00*rinvsq00 - krf2). */
282 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
283 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
285 /* LENNARD-JONES DISPERSION/REPULSION */
/*
 * rinvsix = rinvsq00 cubed; vvdw6 = c6*rinvsix; vvdw12 = c12*rinvsix^2.
 * With sh = sh_vdw_invrcut6:
 *   vvdw = (vvdw12 - c12*sh*sh)/12 - (vvdw6 - c6*sh)/6
 *   fvdw = (vvdw12 - vvdw6)*rinvsq00
 */
287 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
288 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
289 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
290 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
291 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
292 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
/* The compare sets a lane to all-ones where rsq00 < rcutoff2 and to zero otherwise; and-ing with it clears out-of-cutoff lanes. */
294 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
296 /* Update potential sum for this i atom from the interaction with this j atom. */
297 velec = _mm_and_pd(velec,cutoff_mask);
298 velecsum = _mm_add_pd(velecsum,velec);
299 vvdw = _mm_and_pd(vvdw,cutoff_mask);
300 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
/* Scalar force factor is the sum of both contributions, masked by the cutoff. */
302 fscal = _mm_add_pd(felec,fvdw);
304 fscal = _mm_and_pd(fscal,cutoff_mask);
306 /* Calculate temporary vectorial force */
307 tx = _mm_mul_pd(fscal,dx00);
308 ty = _mm_mul_pd(fscal,dy00);
309 tz = _mm_mul_pd(fscal,dz00);
311 /* Update vectorial force */
/* The same vector t is added to both the i-atom and the j-atom accumulator. */
312 fix0 = _mm_add_pd(fix0,tx);
313 fiy0 = _mm_add_pd(fiy0,ty);
314 fiz0 = _mm_add_pd(fiz0,tz);
316 fjx0 = _mm_add_pd(fjx0,tx);
317 fjy0 = _mm_add_pd(fjy0,ty);
318 fjz0 = _mm_add_pd(fjz0,tz);
/*
 * Pairs 01, 02, 10, 11, 12, 20, 21 and 22 in the unrolled loop. Each section
 * has the same shape and evaluates reaction-field electrostatics only:
 *   velec = qqXY*(rinvXY + krf*rsqXY - crf)
 *   felec = qqXY*(rinvXY*rinvsqXY - krf2)
 * The potential is masked by the per-lane cutoff compare and added to
 * velecsum; the force vector t = fscal*d is added to the accumulators of
 * i atom X and j atom Y. After the last pair the nine j-force accumulators
 * are handed to gmx_mm_decrement_3rvec_2ptr_swizzle_pd together with the two
 * j force pointers (presumably subtracting them from f - per the helper's
 * name).
 *
 * NOTE(review): in each of these sections fscal is masked without any
 * visible assignment from felec, and the embedded line numbers skip at that
 * spot (for example 337 to 341). As written, fscal would carry over from the
 * previous pair. A line such as "fscal = felec;" is most likely missing from
 * this text - confirm against the generator output before relying on it.
 * NOTE(review): the braces delimiting each if-body are not visible either.
 */
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
326 if (gmx_mm_any_lt(rsq01,rcutoff2))
329 /* REACTION-FIELD ELECTROSTATICS */
330 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
331 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
333 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 velec = _mm_and_pd(velec,cutoff_mask);
337 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): no assignment of fscal from felec is visible before this mask. */
341 fscal = _mm_and_pd(fscal,cutoff_mask);
343 /* Calculate temporary vectorial force */
344 tx = _mm_mul_pd(fscal,dx01);
345 ty = _mm_mul_pd(fscal,dy01);
346 tz = _mm_mul_pd(fscal,dz01);
348 /* Update vectorial force */
349 fix0 = _mm_add_pd(fix0,tx);
350 fiy0 = _mm_add_pd(fiy0,ty);
351 fiz0 = _mm_add_pd(fiz0,tz);
353 fjx1 = _mm_add_pd(fjx1,tx);
354 fjy1 = _mm_add_pd(fjy1,ty);
355 fjz1 = _mm_add_pd(fjz1,tz);
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 if (gmx_mm_any_lt(rsq02,rcutoff2))
366 /* REACTION-FIELD ELECTROSTATICS */
367 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
368 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
370 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velec = _mm_and_pd(velec,cutoff_mask);
374 velecsum = _mm_add_pd(velecsum,velec);
378 fscal = _mm_and_pd(fscal,cutoff_mask);
380 /* Calculate temporary vectorial force */
381 tx = _mm_mul_pd(fscal,dx02);
382 ty = _mm_mul_pd(fscal,dy02);
383 tz = _mm_mul_pd(fscal,dz02);
385 /* Update vectorial force */
386 fix0 = _mm_add_pd(fix0,tx);
387 fiy0 = _mm_add_pd(fiy0,ty);
388 fiz0 = _mm_add_pd(fiz0,tz);
390 fjx2 = _mm_add_pd(fjx2,tx);
391 fjy2 = _mm_add_pd(fjy2,ty);
392 fjz2 = _mm_add_pd(fjz2,tz);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 if (gmx_mm_any_lt(rsq10,rcutoff2))
403 /* REACTION-FIELD ELECTROSTATICS */
404 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
405 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
407 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velec = _mm_and_pd(velec,cutoff_mask);
411 velecsum = _mm_add_pd(velecsum,velec);
415 fscal = _mm_and_pd(fscal,cutoff_mask);
417 /* Calculate temporary vectorial force */
418 tx = _mm_mul_pd(fscal,dx10);
419 ty = _mm_mul_pd(fscal,dy10);
420 tz = _mm_mul_pd(fscal,dz10);
422 /* Update vectorial force */
423 fix1 = _mm_add_pd(fix1,tx);
424 fiy1 = _mm_add_pd(fiy1,ty);
425 fiz1 = _mm_add_pd(fiz1,tz);
427 fjx0 = _mm_add_pd(fjx0,tx);
428 fjy0 = _mm_add_pd(fjy0,ty);
429 fjz0 = _mm_add_pd(fjz0,tz);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 if (gmx_mm_any_lt(rsq11,rcutoff2))
440 /* REACTION-FIELD ELECTROSTATICS */
441 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
442 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
444 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
446 /* Update potential sum for this i atom from the interaction with this j atom. */
447 velec = _mm_and_pd(velec,cutoff_mask);
448 velecsum = _mm_add_pd(velecsum,velec);
452 fscal = _mm_and_pd(fscal,cutoff_mask);
454 /* Calculate temporary vectorial force */
455 tx = _mm_mul_pd(fscal,dx11);
456 ty = _mm_mul_pd(fscal,dy11);
457 tz = _mm_mul_pd(fscal,dz11);
459 /* Update vectorial force */
460 fix1 = _mm_add_pd(fix1,tx);
461 fiy1 = _mm_add_pd(fiy1,ty);
462 fiz1 = _mm_add_pd(fiz1,tz);
464 fjx1 = _mm_add_pd(fjx1,tx);
465 fjy1 = _mm_add_pd(fjy1,ty);
466 fjz1 = _mm_add_pd(fjz1,tz);
470 /**************************
471 * CALCULATE INTERACTIONS *
472 **************************/
474 if (gmx_mm_any_lt(rsq12,rcutoff2))
477 /* REACTION-FIELD ELECTROSTATICS */
478 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
479 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
481 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velec = _mm_and_pd(velec,cutoff_mask);
485 velecsum = _mm_add_pd(velecsum,velec);
489 fscal = _mm_and_pd(fscal,cutoff_mask);
491 /* Calculate temporary vectorial force */
492 tx = _mm_mul_pd(fscal,dx12);
493 ty = _mm_mul_pd(fscal,dy12);
494 tz = _mm_mul_pd(fscal,dz12);
496 /* Update vectorial force */
497 fix1 = _mm_add_pd(fix1,tx);
498 fiy1 = _mm_add_pd(fiy1,ty);
499 fiz1 = _mm_add_pd(fiz1,tz);
501 fjx2 = _mm_add_pd(fjx2,tx);
502 fjy2 = _mm_add_pd(fjy2,ty);
503 fjz2 = _mm_add_pd(fjz2,tz);
507 /**************************
508 * CALCULATE INTERACTIONS *
509 **************************/
511 if (gmx_mm_any_lt(rsq20,rcutoff2))
514 /* REACTION-FIELD ELECTROSTATICS */
515 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
516 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
518 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
520 /* Update potential sum for this i atom from the interaction with this j atom. */
521 velec = _mm_and_pd(velec,cutoff_mask);
522 velecsum = _mm_add_pd(velecsum,velec);
526 fscal = _mm_and_pd(fscal,cutoff_mask);
528 /* Calculate temporary vectorial force */
529 tx = _mm_mul_pd(fscal,dx20);
530 ty = _mm_mul_pd(fscal,dy20);
531 tz = _mm_mul_pd(fscal,dz20);
533 /* Update vectorial force */
534 fix2 = _mm_add_pd(fix2,tx);
535 fiy2 = _mm_add_pd(fiy2,ty);
536 fiz2 = _mm_add_pd(fiz2,tz);
538 fjx0 = _mm_add_pd(fjx0,tx);
539 fjy0 = _mm_add_pd(fjy0,ty);
540 fjz0 = _mm_add_pd(fjz0,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 if (gmx_mm_any_lt(rsq21,rcutoff2))
551 /* REACTION-FIELD ELECTROSTATICS */
552 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
553 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
555 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
557 /* Update potential sum for this i atom from the interaction with this j atom. */
558 velec = _mm_and_pd(velec,cutoff_mask);
559 velecsum = _mm_add_pd(velecsum,velec);
563 fscal = _mm_and_pd(fscal,cutoff_mask);
565 /* Calculate temporary vectorial force */
566 tx = _mm_mul_pd(fscal,dx21);
567 ty = _mm_mul_pd(fscal,dy21);
568 tz = _mm_mul_pd(fscal,dz21);
570 /* Update vectorial force */
571 fix2 = _mm_add_pd(fix2,tx);
572 fiy2 = _mm_add_pd(fiy2,ty);
573 fiz2 = _mm_add_pd(fiz2,tz);
575 fjx1 = _mm_add_pd(fjx1,tx);
576 fjy1 = _mm_add_pd(fjy1,ty);
577 fjz1 = _mm_add_pd(fjz1,tz);
581 /**************************
582 * CALCULATE INTERACTIONS *
583 **************************/
585 if (gmx_mm_any_lt(rsq22,rcutoff2))
588 /* REACTION-FIELD ELECTROSTATICS */
589 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
590 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
592 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
594 /* Update potential sum for this i atom from the interaction with this j atom. */
595 velec = _mm_and_pd(velec,cutoff_mask);
596 velecsum = _mm_add_pd(velecsum,velec);
600 fscal = _mm_and_pd(fscal,cutoff_mask);
602 /* Calculate temporary vectorial force */
603 tx = _mm_mul_pd(fscal,dx22);
604 ty = _mm_mul_pd(fscal,dy22);
605 tz = _mm_mul_pd(fscal,dz22);
607 /* Update vectorial force */
608 fix2 = _mm_add_pd(fix2,tx);
609 fiy2 = _mm_add_pd(fiy2,ty);
610 fiz2 = _mm_add_pd(fiz2,tz);
612 fjx2 = _mm_add_pd(fjx2,tx);
613 fjy2 = _mm_add_pd(fjy2,ty);
614 fjz2 = _mm_add_pd(fjz2,tz);
/* Write the accumulated j forces back through both j pointers (A and B). */
618 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
620 /* Inner loop uses 342 flops */
/*
 * Remainder path, geometry part: handles a single j entry. Only
 * j_coord_offsetA is computed and the coordinates are loaded through the
 * 1ptr helper; the displacement, rsq, rinv and rinvsq computations are the
 * same nine-pair sequence as in the two-entry loop.
 * NOTE(review): the condition guarding this path, the assignment of jnrA
 * and the enclosing braces are not visible in this text. Presumably it runs
 * when one j entry is left after the unrolled loop - confirm.
 */
627 j_coord_offsetA = DIM*jnrA;
629 /* load j atom coordinates */
630 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
631 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
633 /* Calculate displacement vector */
/* Displacements are i minus j for all nine (i atom, j atom) pairs. */
634 dx00 = _mm_sub_pd(ix0,jx0);
635 dy00 = _mm_sub_pd(iy0,jy0);
636 dz00 = _mm_sub_pd(iz0,jz0);
637 dx01 = _mm_sub_pd(ix0,jx1);
638 dy01 = _mm_sub_pd(iy0,jy1);
639 dz01 = _mm_sub_pd(iz0,jz1);
640 dx02 = _mm_sub_pd(ix0,jx2);
641 dy02 = _mm_sub_pd(iy0,jy2);
642 dz02 = _mm_sub_pd(iz0,jz2);
643 dx10 = _mm_sub_pd(ix1,jx0);
644 dy10 = _mm_sub_pd(iy1,jy0);
645 dz10 = _mm_sub_pd(iz1,jz0);
646 dx11 = _mm_sub_pd(ix1,jx1);
647 dy11 = _mm_sub_pd(iy1,jy1);
648 dz11 = _mm_sub_pd(iz1,jz1);
649 dx12 = _mm_sub_pd(ix1,jx2);
650 dy12 = _mm_sub_pd(iy1,jy2);
651 dz12 = _mm_sub_pd(iz1,jz2);
652 dx20 = _mm_sub_pd(ix2,jx0);
653 dy20 = _mm_sub_pd(iy2,jy0);
654 dz20 = _mm_sub_pd(iz2,jz0);
655 dx21 = _mm_sub_pd(ix2,jx1);
656 dy21 = _mm_sub_pd(iy2,jy1);
657 dz21 = _mm_sub_pd(iz2,jz1);
658 dx22 = _mm_sub_pd(ix2,jx2);
659 dy22 = _mm_sub_pd(iy2,jy2);
660 dz22 = _mm_sub_pd(iz2,jz2);
662 /* Calculate squared distance and things based on it */
663 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
664 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
665 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
666 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
667 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
668 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
669 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
670 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
671 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
673 rinv00 = gmx_mm_invsqrt_pd(rsq00);
674 rinv01 = gmx_mm_invsqrt_pd(rsq01);
675 rinv02 = gmx_mm_invsqrt_pd(rsq02);
676 rinv10 = gmx_mm_invsqrt_pd(rsq10);
677 rinv11 = gmx_mm_invsqrt_pd(rsq11);
678 rinv12 = gmx_mm_invsqrt_pd(rsq12);
679 rinv20 = gmx_mm_invsqrt_pd(rsq20);
680 rinv21 = gmx_mm_invsqrt_pd(rsq21);
681 rinv22 = gmx_mm_invsqrt_pd(rsq22);
683 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
684 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
685 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
686 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
687 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
688 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
689 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
690 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
691 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
/* j-force accumulators are reset before the nine pair sections. */
693 fjx0 = _mm_setzero_pd();
694 fjy0 = _mm_setzero_pd();
695 fjz0 = _mm_setzero_pd();
696 fjx1 = _mm_setzero_pd();
697 fjy1 = _mm_setzero_pd();
698 fjz1 = _mm_setzero_pd();
699 fjx2 = _mm_setzero_pd();
700 fjy2 = _mm_setzero_pd();
701 fjz2 = _mm_setzero_pd();
/*
 * Pair 00 (i atom 0 with j atom 0) in the single-entry remainder path:
 * reaction-field electrostatics plus shifted Lennard-Jones, with the same
 * formulas as in the two-entry loop:
 *   velec = qq00*(rinv00 + krf*rsq00 - crf)
 *   felec = qq00*(rinv00*rinvsq00 - krf2)
 *   vvdw  = (vvdw12 - c12*sh*sh)/12 - (vvdw6 - c6*sh)/6, sh = sh_vdw_invrcut6
 *   fvdw  = (vvdw12 - vvdw6)*rinvsq00
 * The difference from the two-entry loop is the extra _mm_unpacklo_pd with a
 * zero vector after each cutoff mask: it keeps the low lane and replaces the
 * high lane with zero, so only one lane contributes to the sums and forces.
 * NOTE(review): the braces delimiting the if-body are not visible here.
 */
703 /**************************
704 * CALCULATE INTERACTIONS *
705 **************************/
707 if (gmx_mm_any_lt(rsq00,rcutoff2))
710 /* REACTION-FIELD ELECTROSTATICS */
711 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
712 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
714 /* LENNARD-JONES DISPERSION/REPULSION */
716 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
717 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
718 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
719 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
720 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
721 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
/* Per-lane mask: all-ones where rsq00 < rcutoff2, zero otherwise. */
723 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
725 /* Update potential sum for this i atom from the interaction with this j atom. */
726 velec = _mm_and_pd(velec,cutoff_mask);
/* Zero the high lane so it does not enter the energy sum. */
727 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
728 velecsum = _mm_add_pd(velecsum,velec);
729 vvdw = _mm_and_pd(vvdw,cutoff_mask);
730 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
731 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
733 fscal = _mm_add_pd(felec,fvdw);
735 fscal = _mm_and_pd(fscal,cutoff_mask);
/* Zero the high lane of the force factor as well. */
737 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
739 /* Calculate temporary vectorial force */
740 tx = _mm_mul_pd(fscal,dx00);
741 ty = _mm_mul_pd(fscal,dy00);
742 tz = _mm_mul_pd(fscal,dz00);
744 /* Update vectorial force */
745 fix0 = _mm_add_pd(fix0,tx);
746 fiy0 = _mm_add_pd(fiy0,ty);
747 fiz0 = _mm_add_pd(fiz0,tz);
749 fjx0 = _mm_add_pd(fjx0,tx);
750 fjy0 = _mm_add_pd(fjy0,ty);
751 fjz0 = _mm_add_pd(fjz0,tz);
/*
 * Pairs 01, 02, 10, 11, 12, 20, 21 and 22 in the single-entry remainder
 * path. Each section evaluates reaction-field electrostatics only:
 *   velec = qqXY*(rinvXY + krf*rsqXY - crf)
 *   felec = qqXY*(rinvXY*rinvsqXY - krf2)
 * After the per-lane cutoff mask, _mm_unpacklo_pd with a zero vector keeps
 * the low lane and zeroes the high lane of velec and fscal. The force vector
 * t = fscal*d is added to the accumulators of i atom X and j atom Y. After
 * the last pair the j-force accumulators are handed to
 * gmx_mm_decrement_3rvec_1ptr_swizzle_pd with the single j force pointer
 * (presumably subtracting them from f - per the helper's name).
 *
 * NOTE(review): in each of these sections fscal is masked without any
 * visible assignment from felec, and the embedded line numbers skip at that
 * spot (for example 771 to 775). As written, fscal would carry over from the
 * previous pair. A line such as "fscal = felec;" is most likely missing from
 * this text - confirm against the generator output before relying on it.
 * NOTE(review): the braces delimiting each if-body are not visible either.
 */
755 /**************************
756 * CALCULATE INTERACTIONS *
757 **************************/
759 if (gmx_mm_any_lt(rsq01,rcutoff2))
762 /* REACTION-FIELD ELECTROSTATICS */
763 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
764 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
766 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
768 /* Update potential sum for this i atom from the interaction with this j atom. */
769 velec = _mm_and_pd(velec,cutoff_mask);
770 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
771 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): no assignment of fscal from felec is visible before this mask. */
775 fscal = _mm_and_pd(fscal,cutoff_mask);
777 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
779 /* Calculate temporary vectorial force */
780 tx = _mm_mul_pd(fscal,dx01);
781 ty = _mm_mul_pd(fscal,dy01);
782 tz = _mm_mul_pd(fscal,dz01);
784 /* Update vectorial force */
785 fix0 = _mm_add_pd(fix0,tx);
786 fiy0 = _mm_add_pd(fiy0,ty);
787 fiz0 = _mm_add_pd(fiz0,tz);
789 fjx1 = _mm_add_pd(fjx1,tx);
790 fjy1 = _mm_add_pd(fjy1,ty);
791 fjz1 = _mm_add_pd(fjz1,tz);
795 /**************************
796 * CALCULATE INTERACTIONS *
797 **************************/
799 if (gmx_mm_any_lt(rsq02,rcutoff2))
802 /* REACTION-FIELD ELECTROSTATICS */
803 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
804 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
806 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
808 /* Update potential sum for this i atom from the interaction with this j atom. */
809 velec = _mm_and_pd(velec,cutoff_mask);
810 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
811 velecsum = _mm_add_pd(velecsum,velec);
815 fscal = _mm_and_pd(fscal,cutoff_mask);
817 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
819 /* Calculate temporary vectorial force */
820 tx = _mm_mul_pd(fscal,dx02);
821 ty = _mm_mul_pd(fscal,dy02);
822 tz = _mm_mul_pd(fscal,dz02);
824 /* Update vectorial force */
825 fix0 = _mm_add_pd(fix0,tx);
826 fiy0 = _mm_add_pd(fiy0,ty);
827 fiz0 = _mm_add_pd(fiz0,tz);
829 fjx2 = _mm_add_pd(fjx2,tx);
830 fjy2 = _mm_add_pd(fjy2,ty);
831 fjz2 = _mm_add_pd(fjz2,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 if (gmx_mm_any_lt(rsq10,rcutoff2))
842 /* REACTION-FIELD ELECTROSTATICS */
843 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
844 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
846 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_and_pd(velec,cutoff_mask);
850 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
851 velecsum = _mm_add_pd(velecsum,velec);
855 fscal = _mm_and_pd(fscal,cutoff_mask);
857 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
859 /* Calculate temporary vectorial force */
860 tx = _mm_mul_pd(fscal,dx10);
861 ty = _mm_mul_pd(fscal,dy10);
862 tz = _mm_mul_pd(fscal,dz10);
864 /* Update vectorial force */
865 fix1 = _mm_add_pd(fix1,tx);
866 fiy1 = _mm_add_pd(fiy1,ty);
867 fiz1 = _mm_add_pd(fiz1,tz);
869 fjx0 = _mm_add_pd(fjx0,tx);
870 fjy0 = _mm_add_pd(fjy0,ty);
871 fjz0 = _mm_add_pd(fjz0,tz);
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
879 if (gmx_mm_any_lt(rsq11,rcutoff2))
882 /* REACTION-FIELD ELECTROSTATICS */
883 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
884 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
886 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
888 /* Update potential sum for this i atom from the interaction with this j atom. */
889 velec = _mm_and_pd(velec,cutoff_mask);
890 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
891 velecsum = _mm_add_pd(velecsum,velec);
895 fscal = _mm_and_pd(fscal,cutoff_mask);
897 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
899 /* Calculate temporary vectorial force */
900 tx = _mm_mul_pd(fscal,dx11);
901 ty = _mm_mul_pd(fscal,dy11);
902 tz = _mm_mul_pd(fscal,dz11);
904 /* Update vectorial force */
905 fix1 = _mm_add_pd(fix1,tx);
906 fiy1 = _mm_add_pd(fiy1,ty);
907 fiz1 = _mm_add_pd(fiz1,tz);
909 fjx1 = _mm_add_pd(fjx1,tx);
910 fjy1 = _mm_add_pd(fjy1,ty);
911 fjz1 = _mm_add_pd(fjz1,tz);
915 /**************************
916 * CALCULATE INTERACTIONS *
917 **************************/
919 if (gmx_mm_any_lt(rsq12,rcutoff2))
922 /* REACTION-FIELD ELECTROSTATICS */
923 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
924 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
926 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
928 /* Update potential sum for this i atom from the interaction with this j atom. */
929 velec = _mm_and_pd(velec,cutoff_mask);
930 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
931 velecsum = _mm_add_pd(velecsum,velec);
935 fscal = _mm_and_pd(fscal,cutoff_mask);
937 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
939 /* Calculate temporary vectorial force */
940 tx = _mm_mul_pd(fscal,dx12);
941 ty = _mm_mul_pd(fscal,dy12);
942 tz = _mm_mul_pd(fscal,dz12);
944 /* Update vectorial force */
945 fix1 = _mm_add_pd(fix1,tx);
946 fiy1 = _mm_add_pd(fiy1,ty);
947 fiz1 = _mm_add_pd(fiz1,tz);
949 fjx2 = _mm_add_pd(fjx2,tx);
950 fjy2 = _mm_add_pd(fjy2,ty);
951 fjz2 = _mm_add_pd(fjz2,tz);
955 /**************************
956 * CALCULATE INTERACTIONS *
957 **************************/
959 if (gmx_mm_any_lt(rsq20,rcutoff2))
962 /* REACTION-FIELD ELECTROSTATICS */
963 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
964 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
966 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec = _mm_and_pd(velec,cutoff_mask);
970 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
971 velecsum = _mm_add_pd(velecsum,velec);
975 fscal = _mm_and_pd(fscal,cutoff_mask);
977 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
979 /* Calculate temporary vectorial force */
980 tx = _mm_mul_pd(fscal,dx20);
981 ty = _mm_mul_pd(fscal,dy20);
982 tz = _mm_mul_pd(fscal,dz20);
984 /* Update vectorial force */
985 fix2 = _mm_add_pd(fix2,tx);
986 fiy2 = _mm_add_pd(fiy2,ty);
987 fiz2 = _mm_add_pd(fiz2,tz);
989 fjx0 = _mm_add_pd(fjx0,tx);
990 fjy0 = _mm_add_pd(fjy0,ty);
991 fjz0 = _mm_add_pd(fjz0,tz);
995 /**************************
996 * CALCULATE INTERACTIONS *
997 **************************/
999 if (gmx_mm_any_lt(rsq21,rcutoff2))
1002 /* REACTION-FIELD ELECTROSTATICS */
1003 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
1004 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1006 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1008 /* Update potential sum for this i atom from the interaction with this j atom. */
1009 velec = _mm_and_pd(velec,cutoff_mask);
1010 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1011 velecsum = _mm_add_pd(velecsum,velec);
1015 fscal = _mm_and_pd(fscal,cutoff_mask);
1017 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1019 /* Calculate temporary vectorial force */
1020 tx = _mm_mul_pd(fscal,dx21);
1021 ty = _mm_mul_pd(fscal,dy21);
1022 tz = _mm_mul_pd(fscal,dz21);
1024 /* Update vectorial force */
1025 fix2 = _mm_add_pd(fix2,tx);
1026 fiy2 = _mm_add_pd(fiy2,ty);
1027 fiz2 = _mm_add_pd(fiz2,tz);
1029 fjx1 = _mm_add_pd(fjx1,tx);
1030 fjy1 = _mm_add_pd(fjy1,ty);
1031 fjz1 = _mm_add_pd(fjz1,tz);
1035 /**************************
1036 * CALCULATE INTERACTIONS *
1037 **************************/
1039 if (gmx_mm_any_lt(rsq22,rcutoff2))
1042 /* REACTION-FIELD ELECTROSTATICS */
1043 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1044 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1046 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec = _mm_and_pd(velec,cutoff_mask);
1050 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1051 velecsum = _mm_add_pd(velecsum,velec);
1055 fscal = _mm_and_pd(fscal,cutoff_mask);
1057 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1059 /* Calculate temporary vectorial force */
1060 tx = _mm_mul_pd(fscal,dx22);
1061 ty = _mm_mul_pd(fscal,dy22);
1062 tz = _mm_mul_pd(fscal,dz22);
1064 /* Update vectorial force */
1065 fix2 = _mm_add_pd(fix2,tx);
1066 fiy2 = _mm_add_pd(fiy2,ty);
1067 fiz2 = _mm_add_pd(fiz2,tz);
1069 fjx2 = _mm_add_pd(fjx2,tx);
1070 fjy2 = _mm_add_pd(fjy2,ty);
1071 fjz2 = _mm_add_pd(fjz2,tz);
/* Write the accumulated j forces back through the single j pointer. */
1075 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1077 /* Inner loop uses 342 flops */
/*
 * Outer loop epilogue and final accounting. The nine i-force accumulators
 * are passed to gmx_mm_update_iforce_3atom_swizzle_pd together with the
 * i-atom force address and the shift-force address. velecsum and vvdwsum
 * are passed to gmx_mm_update_1pot_pd with the energygrp_elec and
 * energygrp_vdw slots at index ggid. inneriter grows by the number of j
 * entries processed for this i entry, and inc_nrnb records
 * outeriter*20 + inneriter*342 under eNR_NBKERNEL_ELEC_VDW_W3W3_VF.
 * NOTE(review): the assignment of ggid, the initialisation of inneriter,
 * the update of outeriter and the closing braces are not visible in this
 * text - confirm against the generator output.
 */
1080 /* End of innermost loop */
1082 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1083 f+i_coord_offset,fshift+i_shift_offset);
1086 /* Update potential energies */
1087 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1088 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1090 /* Increment number of inner iterations */
1091 inneriter += j_index_end - j_index_start;
1093 /* Outer loop uses 20 flops */
1096 /* Increment number of outer iterations */
1099 /* Update outer/inner flops */
1101 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
1104 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_double
1105 * Electrostatics interaction: ReactionField
1106 * VdW interaction: LennardJones
1107 * Geometry: Water3-Water3
1108 * Calculate force/pot: Force
1111 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_double
1112 (t_nblist * gmx_restrict nlist,
1113 rvec * gmx_restrict xx,
1114 rvec * gmx_restrict ff,
1115 t_forcerec * gmx_restrict fr,
1116 t_mdatoms * gmx_restrict mdatoms,
1117 nb_kernel_data_t * gmx_restrict kernel_data,
1118 t_nrnb * gmx_restrict nrnb)
1120 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1121 * just 0 for non-waters.
1122 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1123 * jnr indices corresponding to data put in the four positions in the SIMD register.
1125 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1126 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1128 int j_coord_offsetA,j_coord_offsetB;
1129 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1130 real rcutoff_scalar;
1131 real *shiftvec,*fshift,*x,*f;
1132 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1134 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1136 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1138 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1139 int vdwjidx0A,vdwjidx0B;
1140 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1141 int vdwjidx1A,vdwjidx1B;
1142 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1143 int vdwjidx2A,vdwjidx2B;
1144 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1145 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1146 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1147 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1148 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1149 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1150 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1151 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1152 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1153 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1154 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1157 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1160 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1161 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1162 __m128d dummy_mask,cutoff_mask;
1163 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1164 __m128d one = _mm_set1_pd(1.0);
1165 __m128d two = _mm_set1_pd(2.0);
1171 jindex = nlist->jindex;
1173 shiftidx = nlist->shift;
1175 shiftvec = fr->shift_vec[0];
1176 fshift = fr->fshift[0];
1177 facel = _mm_set1_pd(fr->epsfac);
1178 charge = mdatoms->chargeA;
1179 krf = _mm_set1_pd(fr->ic->k_rf);
1180 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1181 crf = _mm_set1_pd(fr->ic->c_rf);
1182 nvdwtype = fr->ntype;
1183 vdwparam = fr->nbfp;
1184 vdwtype = mdatoms->typeA;
1186 /* Setup water-specific parameters */
1187 inr = nlist->iinr[0];
1188 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1189 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1190 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1191 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1193 jq0 = _mm_set1_pd(charge[inr+0]);
1194 jq1 = _mm_set1_pd(charge[inr+1]);
1195 jq2 = _mm_set1_pd(charge[inr+2]);
1196 vdwjidx0A = 2*vdwtype[inr+0];
1197 qq00 = _mm_mul_pd(iq0,jq0);
1198 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1199 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1200 qq01 = _mm_mul_pd(iq0,jq1);
1201 qq02 = _mm_mul_pd(iq0,jq2);
1202 qq10 = _mm_mul_pd(iq1,jq0);
1203 qq11 = _mm_mul_pd(iq1,jq1);
1204 qq12 = _mm_mul_pd(iq1,jq2);
1205 qq20 = _mm_mul_pd(iq2,jq0);
1206 qq21 = _mm_mul_pd(iq2,jq1);
1207 qq22 = _mm_mul_pd(iq2,jq2);
1209 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1210 rcutoff_scalar = fr->rcoulomb;
1211 rcutoff = _mm_set1_pd(rcutoff_scalar);
1212 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1214 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
1215 rvdw = _mm_set1_pd(fr->rvdw);
1217 /* Avoid stupid compiler warnings */
1219 j_coord_offsetA = 0;
1220 j_coord_offsetB = 0;
1225 /* Start outer loop over neighborlists */
1226 for(iidx=0; iidx<nri; iidx++)
1228 /* Load shift vector for this list */
1229 i_shift_offset = DIM*shiftidx[iidx];
1231 /* Load limits for loop over neighbors */
1232 j_index_start = jindex[iidx];
1233 j_index_end = jindex[iidx+1];
1235 /* Get outer coordinate index */
1237 i_coord_offset = DIM*inr;
1239 /* Load i particle coords and add shift vector */
1240 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1241 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1243 fix0 = _mm_setzero_pd();
1244 fiy0 = _mm_setzero_pd();
1245 fiz0 = _mm_setzero_pd();
1246 fix1 = _mm_setzero_pd();
1247 fiy1 = _mm_setzero_pd();
1248 fiz1 = _mm_setzero_pd();
1249 fix2 = _mm_setzero_pd();
1250 fiy2 = _mm_setzero_pd();
1251 fiz2 = _mm_setzero_pd();
1253 /* Start inner kernel loop */
1254 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1257 /* Get j neighbor index, and coordinate index */
1259 jnrB = jjnr[jidx+1];
1260 j_coord_offsetA = DIM*jnrA;
1261 j_coord_offsetB = DIM*jnrB;
1263 /* load j atom coordinates */
1264 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1265 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1267 /* Calculate displacement vector */
1268 dx00 = _mm_sub_pd(ix0,jx0);
1269 dy00 = _mm_sub_pd(iy0,jy0);
1270 dz00 = _mm_sub_pd(iz0,jz0);
1271 dx01 = _mm_sub_pd(ix0,jx1);
1272 dy01 = _mm_sub_pd(iy0,jy1);
1273 dz01 = _mm_sub_pd(iz0,jz1);
1274 dx02 = _mm_sub_pd(ix0,jx2);
1275 dy02 = _mm_sub_pd(iy0,jy2);
1276 dz02 = _mm_sub_pd(iz0,jz2);
1277 dx10 = _mm_sub_pd(ix1,jx0);
1278 dy10 = _mm_sub_pd(iy1,jy0);
1279 dz10 = _mm_sub_pd(iz1,jz0);
1280 dx11 = _mm_sub_pd(ix1,jx1);
1281 dy11 = _mm_sub_pd(iy1,jy1);
1282 dz11 = _mm_sub_pd(iz1,jz1);
1283 dx12 = _mm_sub_pd(ix1,jx2);
1284 dy12 = _mm_sub_pd(iy1,jy2);
1285 dz12 = _mm_sub_pd(iz1,jz2);
1286 dx20 = _mm_sub_pd(ix2,jx0);
1287 dy20 = _mm_sub_pd(iy2,jy0);
1288 dz20 = _mm_sub_pd(iz2,jz0);
1289 dx21 = _mm_sub_pd(ix2,jx1);
1290 dy21 = _mm_sub_pd(iy2,jy1);
1291 dz21 = _mm_sub_pd(iz2,jz1);
1292 dx22 = _mm_sub_pd(ix2,jx2);
1293 dy22 = _mm_sub_pd(iy2,jy2);
1294 dz22 = _mm_sub_pd(iz2,jz2);
1296 /* Calculate squared distance and things based on it */
1297 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1298 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1299 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1300 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1301 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1302 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1303 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1304 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1305 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1307 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1308 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1309 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1310 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1311 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1312 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1313 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1314 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1315 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1317 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1318 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1319 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1320 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1321 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1322 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1323 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1324 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1325 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1327 fjx0 = _mm_setzero_pd();
1328 fjy0 = _mm_setzero_pd();
1329 fjz0 = _mm_setzero_pd();
1330 fjx1 = _mm_setzero_pd();
1331 fjy1 = _mm_setzero_pd();
1332 fjz1 = _mm_setzero_pd();
1333 fjx2 = _mm_setzero_pd();
1334 fjy2 = _mm_setzero_pd();
1335 fjz2 = _mm_setzero_pd();
1337 /**************************
1338 * CALCULATE INTERACTIONS *
1339 **************************/
1341 if (gmx_mm_any_lt(rsq00,rcutoff2))
1344 /* REACTION-FIELD ELECTROSTATICS */
1345 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1347 /* LENNARD-JONES DISPERSION/REPULSION */
1349 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1350 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1352 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1354 fscal = _mm_add_pd(felec,fvdw);
1356 fscal = _mm_and_pd(fscal,cutoff_mask);
1358 /* Calculate temporary vectorial force */
1359 tx = _mm_mul_pd(fscal,dx00);
1360 ty = _mm_mul_pd(fscal,dy00);
1361 tz = _mm_mul_pd(fscal,dz00);
1363 /* Update vectorial force */
1364 fix0 = _mm_add_pd(fix0,tx);
1365 fiy0 = _mm_add_pd(fiy0,ty);
1366 fiz0 = _mm_add_pd(fiz0,tz);
1368 fjx0 = _mm_add_pd(fjx0,tx);
1369 fjy0 = _mm_add_pd(fjy0,ty);
1370 fjz0 = _mm_add_pd(fjz0,tz);
1374 /**************************
1375 * CALCULATE INTERACTIONS *
1376 **************************/
1378 if (gmx_mm_any_lt(rsq01,rcutoff2))
1381 /* REACTION-FIELD ELECTROSTATICS */
1382 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1384 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1388 fscal = _mm_and_pd(fscal,cutoff_mask);
1390 /* Calculate temporary vectorial force */
1391 tx = _mm_mul_pd(fscal,dx01);
1392 ty = _mm_mul_pd(fscal,dy01);
1393 tz = _mm_mul_pd(fscal,dz01);
1395 /* Update vectorial force */
1396 fix0 = _mm_add_pd(fix0,tx);
1397 fiy0 = _mm_add_pd(fiy0,ty);
1398 fiz0 = _mm_add_pd(fiz0,tz);
1400 fjx1 = _mm_add_pd(fjx1,tx);
1401 fjy1 = _mm_add_pd(fjy1,ty);
1402 fjz1 = _mm_add_pd(fjz1,tz);
1406 /**************************
1407 * CALCULATE INTERACTIONS *
1408 **************************/
1410 if (gmx_mm_any_lt(rsq02,rcutoff2))
1413 /* REACTION-FIELD ELECTROSTATICS */
1414 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1416 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1420 fscal = _mm_and_pd(fscal,cutoff_mask);
1422 /* Calculate temporary vectorial force */
1423 tx = _mm_mul_pd(fscal,dx02);
1424 ty = _mm_mul_pd(fscal,dy02);
1425 tz = _mm_mul_pd(fscal,dz02);
1427 /* Update vectorial force */
1428 fix0 = _mm_add_pd(fix0,tx);
1429 fiy0 = _mm_add_pd(fiy0,ty);
1430 fiz0 = _mm_add_pd(fiz0,tz);
1432 fjx2 = _mm_add_pd(fjx2,tx);
1433 fjy2 = _mm_add_pd(fjy2,ty);
1434 fjz2 = _mm_add_pd(fjz2,tz);
1438 /**************************
1439 * CALCULATE INTERACTIONS *
1440 **************************/
1442 if (gmx_mm_any_lt(rsq10,rcutoff2))
1445 /* REACTION-FIELD ELECTROSTATICS */
1446 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1448 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1452 fscal = _mm_and_pd(fscal,cutoff_mask);
1454 /* Calculate temporary vectorial force */
1455 tx = _mm_mul_pd(fscal,dx10);
1456 ty = _mm_mul_pd(fscal,dy10);
1457 tz = _mm_mul_pd(fscal,dz10);
1459 /* Update vectorial force */
1460 fix1 = _mm_add_pd(fix1,tx);
1461 fiy1 = _mm_add_pd(fiy1,ty);
1462 fiz1 = _mm_add_pd(fiz1,tz);
1464 fjx0 = _mm_add_pd(fjx0,tx);
1465 fjy0 = _mm_add_pd(fjy0,ty);
1466 fjz0 = _mm_add_pd(fjz0,tz);
1470 /**************************
1471 * CALCULATE INTERACTIONS *
1472 **************************/
1474 if (gmx_mm_any_lt(rsq11,rcutoff2))
1477 /* REACTION-FIELD ELECTROSTATICS */
1478 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1480 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1484 fscal = _mm_and_pd(fscal,cutoff_mask);
1486 /* Calculate temporary vectorial force */
1487 tx = _mm_mul_pd(fscal,dx11);
1488 ty = _mm_mul_pd(fscal,dy11);
1489 tz = _mm_mul_pd(fscal,dz11);
1491 /* Update vectorial force */
1492 fix1 = _mm_add_pd(fix1,tx);
1493 fiy1 = _mm_add_pd(fiy1,ty);
1494 fiz1 = _mm_add_pd(fiz1,tz);
1496 fjx1 = _mm_add_pd(fjx1,tx);
1497 fjy1 = _mm_add_pd(fjy1,ty);
1498 fjz1 = _mm_add_pd(fjz1,tz);
1502 /**************************
1503 * CALCULATE INTERACTIONS *
1504 **************************/
1506 if (gmx_mm_any_lt(rsq12,rcutoff2))
1509 /* REACTION-FIELD ELECTROSTATICS */
1510 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1512 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1516 fscal = _mm_and_pd(fscal,cutoff_mask);
1518 /* Calculate temporary vectorial force */
1519 tx = _mm_mul_pd(fscal,dx12);
1520 ty = _mm_mul_pd(fscal,dy12);
1521 tz = _mm_mul_pd(fscal,dz12);
1523 /* Update vectorial force */
1524 fix1 = _mm_add_pd(fix1,tx);
1525 fiy1 = _mm_add_pd(fiy1,ty);
1526 fiz1 = _mm_add_pd(fiz1,tz);
1528 fjx2 = _mm_add_pd(fjx2,tx);
1529 fjy2 = _mm_add_pd(fjy2,ty);
1530 fjz2 = _mm_add_pd(fjz2,tz);
1534 /**************************
1535 * CALCULATE INTERACTIONS *
1536 **************************/
1538 if (gmx_mm_any_lt(rsq20,rcutoff2))
1541 /* REACTION-FIELD ELECTROSTATICS */
1542 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1544 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1548 fscal = _mm_and_pd(fscal,cutoff_mask);
1550 /* Calculate temporary vectorial force */
1551 tx = _mm_mul_pd(fscal,dx20);
1552 ty = _mm_mul_pd(fscal,dy20);
1553 tz = _mm_mul_pd(fscal,dz20);
1555 /* Update vectorial force */
1556 fix2 = _mm_add_pd(fix2,tx);
1557 fiy2 = _mm_add_pd(fiy2,ty);
1558 fiz2 = _mm_add_pd(fiz2,tz);
1560 fjx0 = _mm_add_pd(fjx0,tx);
1561 fjy0 = _mm_add_pd(fjy0,ty);
1562 fjz0 = _mm_add_pd(fjz0,tz);
1566 /**************************
1567 * CALCULATE INTERACTIONS *
1568 **************************/
1570 if (gmx_mm_any_lt(rsq21,rcutoff2))
1573 /* REACTION-FIELD ELECTROSTATICS */
1574 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1576 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1580 fscal = _mm_and_pd(fscal,cutoff_mask);
1582 /* Calculate temporary vectorial force */
1583 tx = _mm_mul_pd(fscal,dx21);
1584 ty = _mm_mul_pd(fscal,dy21);
1585 tz = _mm_mul_pd(fscal,dz21);
1587 /* Update vectorial force */
1588 fix2 = _mm_add_pd(fix2,tx);
1589 fiy2 = _mm_add_pd(fiy2,ty);
1590 fiz2 = _mm_add_pd(fiz2,tz);
1592 fjx1 = _mm_add_pd(fjx1,tx);
1593 fjy1 = _mm_add_pd(fjy1,ty);
1594 fjz1 = _mm_add_pd(fjz1,tz);
1598 /**************************
1599 * CALCULATE INTERACTIONS *
1600 **************************/
1602 if (gmx_mm_any_lt(rsq22,rcutoff2))
1605 /* REACTION-FIELD ELECTROSTATICS */
1606 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1608 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1612 fscal = _mm_and_pd(fscal,cutoff_mask);
1614 /* Calculate temporary vectorial force */
1615 tx = _mm_mul_pd(fscal,dx22);
1616 ty = _mm_mul_pd(fscal,dy22);
1617 tz = _mm_mul_pd(fscal,dz22);
1619 /* Update vectorial force */
1620 fix2 = _mm_add_pd(fix2,tx);
1621 fiy2 = _mm_add_pd(fiy2,ty);
1622 fiz2 = _mm_add_pd(fiz2,tz);
1624 fjx2 = _mm_add_pd(fjx2,tx);
1625 fjy2 = _mm_add_pd(fjy2,ty);
1626 fjz2 = _mm_add_pd(fjz2,tz);
1630 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1632 /* Inner loop uses 277 flops */
1635 if(jidx<j_index_end)
1639 j_coord_offsetA = DIM*jnrA;
1641 /* load j atom coordinates */
1642 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1643 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1645 /* Calculate displacement vector */
1646 dx00 = _mm_sub_pd(ix0,jx0);
1647 dy00 = _mm_sub_pd(iy0,jy0);
1648 dz00 = _mm_sub_pd(iz0,jz0);
1649 dx01 = _mm_sub_pd(ix0,jx1);
1650 dy01 = _mm_sub_pd(iy0,jy1);
1651 dz01 = _mm_sub_pd(iz0,jz1);
1652 dx02 = _mm_sub_pd(ix0,jx2);
1653 dy02 = _mm_sub_pd(iy0,jy2);
1654 dz02 = _mm_sub_pd(iz0,jz2);
1655 dx10 = _mm_sub_pd(ix1,jx0);
1656 dy10 = _mm_sub_pd(iy1,jy0);
1657 dz10 = _mm_sub_pd(iz1,jz0);
1658 dx11 = _mm_sub_pd(ix1,jx1);
1659 dy11 = _mm_sub_pd(iy1,jy1);
1660 dz11 = _mm_sub_pd(iz1,jz1);
1661 dx12 = _mm_sub_pd(ix1,jx2);
1662 dy12 = _mm_sub_pd(iy1,jy2);
1663 dz12 = _mm_sub_pd(iz1,jz2);
1664 dx20 = _mm_sub_pd(ix2,jx0);
1665 dy20 = _mm_sub_pd(iy2,jy0);
1666 dz20 = _mm_sub_pd(iz2,jz0);
1667 dx21 = _mm_sub_pd(ix2,jx1);
1668 dy21 = _mm_sub_pd(iy2,jy1);
1669 dz21 = _mm_sub_pd(iz2,jz1);
1670 dx22 = _mm_sub_pd(ix2,jx2);
1671 dy22 = _mm_sub_pd(iy2,jy2);
1672 dz22 = _mm_sub_pd(iz2,jz2);
1674 /* Calculate squared distance and things based on it */
1675 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1676 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1677 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1678 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1679 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1680 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1681 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1682 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1683 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1685 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1686 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1687 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1688 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1689 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1690 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1691 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1692 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1693 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1695 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1696 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1697 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1698 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1699 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1700 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1701 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1702 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1703 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1705 fjx0 = _mm_setzero_pd();
1706 fjy0 = _mm_setzero_pd();
1707 fjz0 = _mm_setzero_pd();
1708 fjx1 = _mm_setzero_pd();
1709 fjy1 = _mm_setzero_pd();
1710 fjz1 = _mm_setzero_pd();
1711 fjx2 = _mm_setzero_pd();
1712 fjy2 = _mm_setzero_pd();
1713 fjz2 = _mm_setzero_pd();
1715 /**************************
1716 * CALCULATE INTERACTIONS *
1717 **************************/
1719 if (gmx_mm_any_lt(rsq00,rcutoff2))
1722 /* REACTION-FIELD ELECTROSTATICS */
1723 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1725 /* LENNARD-JONES DISPERSION/REPULSION */
1727 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1728 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1730 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1732 fscal = _mm_add_pd(felec,fvdw);
1734 fscal = _mm_and_pd(fscal,cutoff_mask);
1736 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1738 /* Calculate temporary vectorial force */
1739 tx = _mm_mul_pd(fscal,dx00);
1740 ty = _mm_mul_pd(fscal,dy00);
1741 tz = _mm_mul_pd(fscal,dz00);
1743 /* Update vectorial force */
1744 fix0 = _mm_add_pd(fix0,tx);
1745 fiy0 = _mm_add_pd(fiy0,ty);
1746 fiz0 = _mm_add_pd(fiz0,tz);
1748 fjx0 = _mm_add_pd(fjx0,tx);
1749 fjy0 = _mm_add_pd(fjy0,ty);
1750 fjz0 = _mm_add_pd(fjz0,tz);
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1758 if (gmx_mm_any_lt(rsq01,rcutoff2))
1761 /* REACTION-FIELD ELECTROSTATICS */
1762 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1764 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1768 fscal = _mm_and_pd(fscal,cutoff_mask);
1770 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1772 /* Calculate temporary vectorial force */
1773 tx = _mm_mul_pd(fscal,dx01);
1774 ty = _mm_mul_pd(fscal,dy01);
1775 tz = _mm_mul_pd(fscal,dz01);
1777 /* Update vectorial force */
1778 fix0 = _mm_add_pd(fix0,tx);
1779 fiy0 = _mm_add_pd(fiy0,ty);
1780 fiz0 = _mm_add_pd(fiz0,tz);
1782 fjx1 = _mm_add_pd(fjx1,tx);
1783 fjy1 = _mm_add_pd(fjy1,ty);
1784 fjz1 = _mm_add_pd(fjz1,tz);
1788 /**************************
1789 * CALCULATE INTERACTIONS *
1790 **************************/
1792 if (gmx_mm_any_lt(rsq02,rcutoff2))
1795 /* REACTION-FIELD ELECTROSTATICS */
1796 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1798 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1802 fscal = _mm_and_pd(fscal,cutoff_mask);
1804 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1806 /* Calculate temporary vectorial force */
1807 tx = _mm_mul_pd(fscal,dx02);
1808 ty = _mm_mul_pd(fscal,dy02);
1809 tz = _mm_mul_pd(fscal,dz02);
1811 /* Update vectorial force */
1812 fix0 = _mm_add_pd(fix0,tx);
1813 fiy0 = _mm_add_pd(fiy0,ty);
1814 fiz0 = _mm_add_pd(fiz0,tz);
1816 fjx2 = _mm_add_pd(fjx2,tx);
1817 fjy2 = _mm_add_pd(fjy2,ty);
1818 fjz2 = _mm_add_pd(fjz2,tz);
1822 /**************************
1823 * CALCULATE INTERACTIONS *
1824 **************************/
1826 if (gmx_mm_any_lt(rsq10,rcutoff2))
1829 /* REACTION-FIELD ELECTROSTATICS */
1830 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1832 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1836 fscal = _mm_and_pd(fscal,cutoff_mask);
1838 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1840 /* Calculate temporary vectorial force */
1841 tx = _mm_mul_pd(fscal,dx10);
1842 ty = _mm_mul_pd(fscal,dy10);
1843 tz = _mm_mul_pd(fscal,dz10);
1845 /* Update vectorial force */
1846 fix1 = _mm_add_pd(fix1,tx);
1847 fiy1 = _mm_add_pd(fiy1,ty);
1848 fiz1 = _mm_add_pd(fiz1,tz);
1850 fjx0 = _mm_add_pd(fjx0,tx);
1851 fjy0 = _mm_add_pd(fjy0,ty);
1852 fjz0 = _mm_add_pd(fjz0,tz);
1856 /**************************
1857 * CALCULATE INTERACTIONS *
1858 **************************/
1860 if (gmx_mm_any_lt(rsq11,rcutoff2))
1863 /* REACTION-FIELD ELECTROSTATICS */
1864 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1866 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1870 fscal = _mm_and_pd(fscal,cutoff_mask);
1872 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1874 /* Calculate temporary vectorial force */
1875 tx = _mm_mul_pd(fscal,dx11);
1876 ty = _mm_mul_pd(fscal,dy11);
1877 tz = _mm_mul_pd(fscal,dz11);
1879 /* Update vectorial force */
1880 fix1 = _mm_add_pd(fix1,tx);
1881 fiy1 = _mm_add_pd(fiy1,ty);
1882 fiz1 = _mm_add_pd(fiz1,tz);
1884 fjx1 = _mm_add_pd(fjx1,tx);
1885 fjy1 = _mm_add_pd(fjy1,ty);
1886 fjz1 = _mm_add_pd(fjz1,tz);
1890 /**************************
1891 * CALCULATE INTERACTIONS *
1892 **************************/
1894 if (gmx_mm_any_lt(rsq12,rcutoff2))
1897 /* REACTION-FIELD ELECTROSTATICS */
1898 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1900 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1904 fscal = _mm_and_pd(fscal,cutoff_mask);
1906 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1908 /* Calculate temporary vectorial force */
1909 tx = _mm_mul_pd(fscal,dx12);
1910 ty = _mm_mul_pd(fscal,dy12);
1911 tz = _mm_mul_pd(fscal,dz12);
1913 /* Update vectorial force */
1914 fix1 = _mm_add_pd(fix1,tx);
1915 fiy1 = _mm_add_pd(fiy1,ty);
1916 fiz1 = _mm_add_pd(fiz1,tz);
1918 fjx2 = _mm_add_pd(fjx2,tx);
1919 fjy2 = _mm_add_pd(fjy2,ty);
1920 fjz2 = _mm_add_pd(fjz2,tz);
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1928 if (gmx_mm_any_lt(rsq20,rcutoff2))
1931 /* REACTION-FIELD ELECTROSTATICS */
1932 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1934 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1938 fscal = _mm_and_pd(fscal,cutoff_mask);
1940 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1942 /* Calculate temporary vectorial force */
1943 tx = _mm_mul_pd(fscal,dx20);
1944 ty = _mm_mul_pd(fscal,dy20);
1945 tz = _mm_mul_pd(fscal,dz20);
1947 /* Update vectorial force */
1948 fix2 = _mm_add_pd(fix2,tx);
1949 fiy2 = _mm_add_pd(fiy2,ty);
1950 fiz2 = _mm_add_pd(fiz2,tz);
1952 fjx0 = _mm_add_pd(fjx0,tx);
1953 fjy0 = _mm_add_pd(fjy0,ty);
1954 fjz0 = _mm_add_pd(fjz0,tz);
1958 /**************************
1959 * CALCULATE INTERACTIONS *
1960 **************************/
1962 if (gmx_mm_any_lt(rsq21,rcutoff2))
1965 /* REACTION-FIELD ELECTROSTATICS */
1966 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1968 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1972 fscal = _mm_and_pd(fscal,cutoff_mask);
1974 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1976 /* Calculate temporary vectorial force */
1977 tx = _mm_mul_pd(fscal,dx21);
1978 ty = _mm_mul_pd(fscal,dy21);
1979 tz = _mm_mul_pd(fscal,dz21);
1981 /* Update vectorial force */
1982 fix2 = _mm_add_pd(fix2,tx);
1983 fiy2 = _mm_add_pd(fiy2,ty);
1984 fiz2 = _mm_add_pd(fiz2,tz);
1986 fjx1 = _mm_add_pd(fjx1,tx);
1987 fjy1 = _mm_add_pd(fjy1,ty);
1988 fjz1 = _mm_add_pd(fjz1,tz);
1992 /**************************
1993 * CALCULATE INTERACTIONS *
1994 **************************/
1996 if (gmx_mm_any_lt(rsq22,rcutoff2))
1999 /* REACTION-FIELD ELECTROSTATICS */
2000 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2002 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2006 fscal = _mm_and_pd(fscal,cutoff_mask);
2008 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2010 /* Calculate temporary vectorial force */
2011 tx = _mm_mul_pd(fscal,dx22);
2012 ty = _mm_mul_pd(fscal,dy22);
2013 tz = _mm_mul_pd(fscal,dz22);
2015 /* Update vectorial force */
2016 fix2 = _mm_add_pd(fix2,tx);
2017 fiy2 = _mm_add_pd(fiy2,ty);
2018 fiz2 = _mm_add_pd(fiz2,tz);
2020 fjx2 = _mm_add_pd(fjx2,tx);
2021 fjy2 = _mm_add_pd(fjy2,ty);
2022 fjz2 = _mm_add_pd(fjz2,tz);
2026 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2028 /* Inner loop uses 277 flops */
2031 /* End of innermost loop */
2033 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2034 f+i_coord_offset,fshift+i_shift_offset);
2036 /* Increment number of inner iterations */
2037 inneriter += j_index_end - j_index_start;
2039 /* Outer loop uses 18 flops */
2042 /* Increment number of outer iterations */
2045 /* Update outer/inner flops */
2047 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);