2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
/*
 * nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double
 *
 * Potential-and-force kernel for pairs of four-site waters (Water4-Water4
 * geometry, sites numbered 0..3 on both the i and the j side):
 *   - site 0 of i interacts with site 0 of j through Lennard-Jones only;
 *   - sites 1..3 of i interact with sites 1..3 of j through reaction-field
 *     electrostatics only (nine pairs: 11,12,13,21,22,23,31,32,33).
 *
 * Parameters as used in the visible code:
 *   nlist       - jindex, shift and iinr are read from it.
 *   xx, ff      - coordinate and force arrays (the local x and f pointers
 *                 used below are presumably derived from these; that
 *                 assignment is not visible in this excerpt).
 *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf
 *                 and ntype.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - energygrp_elec and energygrp_vdw receive the summed
 *                 potentials through gmx_mm_update_1pot_pd().
 *   nrnb        - flop accounting, updated once through inc_nrnb().
 *
 * NOTE(review): this excerpt is missing a number of statements that the
 * visible code depends on (for example the assignments of fscal, jnrA, jnrB,
 * x, f, nri, ggid, outeriter and inneriter, and the declarations of charge,
 * vdwparam, vdwtype, nvdwtype and vdwioffset0). Confirm against the complete
 * file before relying on any statement ordering shown here.
 */
44 nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
/* NOTE(review): an __m128d holds two doubles, so the "four positions" wording
 * in the comment below looks inaccurate for this double-precision kernel;
 * the A and B suffixes correspond to the two SIMD elements. */
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
100 __m128d dummy_mask,cutoff_mask;
101 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
102 __m128d one = _mm_set1_pd(1.0);
103 __m128d two = _mm_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
117 krf = _mm_set1_pd(fr->ic->k_rf);
118 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
119 crf = _mm_set1_pd(fr->ic->c_rf);
120 nvdwtype = fr->ntype;
122 vdwtype = mdatoms->typeA;
124 /* Setup water-specific parameters */
/* All charge and LJ parameters are read once, from the first i entry of the
 * list (nlist->iinr[0]), and reused unchanged inside both loops. */
125 inr = nlist->iinr[0];
/* i-side charges of sites 1..3, pre-multiplied by facel (fr->epsfac). */
126 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
129 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* j-side charges are read at the same indices inr+1..inr+3 as the i-side
 * charges, without the facel factor. */
131 jq1 = _mm_set1_pd(charge[inr+1]);
132 jq2 = _mm_set1_pd(charge[inr+2]);
133 jq3 = _mm_set1_pd(charge[inr+3]);
/* LJ parameters for the site-0/site-0 pair: two consecutive vdwparam entries
 * are loaded into c6_00 and c12_00. NOTE(review): the assignment of vdwparam
 * itself is not visible in this excerpt. */
134 vdwjidx0A = 2*vdwtype[inr+0];
135 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
/* Charge products for the nine electrostatic site pairs, computed once here. */
137 qq11 = _mm_mul_pd(iq1,jq1);
138 qq12 = _mm_mul_pd(iq1,jq2);
139 qq13 = _mm_mul_pd(iq1,jq3);
140 qq21 = _mm_mul_pd(iq2,jq1);
141 qq22 = _mm_mul_pd(iq2,jq2);
142 qq23 = _mm_mul_pd(iq2,jq3);
143 qq31 = _mm_mul_pd(iq3,jq1);
144 qq32 = _mm_mul_pd(iq3,jq2);
145 qq33 = _mm_mul_pd(iq3,jq3);
147 /* Avoid stupid compiler warnings */
155 /* Start outer loop over neighborlists */
156 for(iidx=0; iidx<nri; iidx++)
158 /* Load shift vector for this list */
159 i_shift_offset = DIM*shiftidx[iidx];
161 /* Load limits for loop over neighbors */
162 j_index_start = jindex[iidx];
163 j_index_end = jindex[iidx+1];
165 /* Get outer coordinate index */
/* NOTE(review): the per-iteration load of inr is not visible in this excerpt. */
167 i_coord_offset = DIM*inr;
169 /* Load i particle coords and add shift vector */
170 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
171 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Per-i force accumulators for the four sites start from zero. */
173 fix0 = _mm_setzero_pd();
174 fiy0 = _mm_setzero_pd();
175 fiz0 = _mm_setzero_pd();
176 fix1 = _mm_setzero_pd();
177 fiy1 = _mm_setzero_pd();
178 fiz1 = _mm_setzero_pd();
179 fix2 = _mm_setzero_pd();
180 fiy2 = _mm_setzero_pd();
181 fiz2 = _mm_setzero_pd();
182 fix3 = _mm_setzero_pd();
183 fiy3 = _mm_setzero_pd();
184 fiz3 = _mm_setzero_pd();
186 /* Reset potential sums */
187 velecsum = _mm_setzero_pd();
188 vvdwsum = _mm_setzero_pd();
190 /* Start inner kernel loop */
/* Two j entries (A and B) are handled per iteration. With the bound
 * j_index_end-1 and a step of 2, an odd final entry is not processed by
 * this loop. */
191 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
194 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the loads of jnrA and jnrB are not visible in this excerpt. */
197 j_coord_offsetA = DIM*jnrA;
198 j_coord_offsetB = DIM*jnrB;
200 /* load j atom coordinates */
201 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
202 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
203 &jy2,&jz2,&jx3,&jy3,&jz3);
205 /* Calculate displacement vector */
/* Displacements are i minus j. Only the pairs that interact are formed:
 * 00 for Lennard-Jones and 11..33 for electrostatics. */
206 dx00 = _mm_sub_pd(ix0,jx0);
207 dy00 = _mm_sub_pd(iy0,jy0);
208 dz00 = _mm_sub_pd(iz0,jz0);
209 dx11 = _mm_sub_pd(ix1,jx1);
210 dy11 = _mm_sub_pd(iy1,jy1);
211 dz11 = _mm_sub_pd(iz1,jz1);
212 dx12 = _mm_sub_pd(ix1,jx2);
213 dy12 = _mm_sub_pd(iy1,jy2);
214 dz12 = _mm_sub_pd(iz1,jz2);
215 dx13 = _mm_sub_pd(ix1,jx3);
216 dy13 = _mm_sub_pd(iy1,jy3);
217 dz13 = _mm_sub_pd(iz1,jz3);
218 dx21 = _mm_sub_pd(ix2,jx1);
219 dy21 = _mm_sub_pd(iy2,jy1);
220 dz21 = _mm_sub_pd(iz2,jz1);
221 dx22 = _mm_sub_pd(ix2,jx2);
222 dy22 = _mm_sub_pd(iy2,jy2);
223 dz22 = _mm_sub_pd(iz2,jz2);
224 dx23 = _mm_sub_pd(ix2,jx3);
225 dy23 = _mm_sub_pd(iy2,jy3);
226 dz23 = _mm_sub_pd(iz2,jz3);
227 dx31 = _mm_sub_pd(ix3,jx1);
228 dy31 = _mm_sub_pd(iy3,jy1);
229 dz31 = _mm_sub_pd(iz3,jz1);
230 dx32 = _mm_sub_pd(ix3,jx2);
231 dy32 = _mm_sub_pd(iy3,jy2);
232 dz32 = _mm_sub_pd(iz3,jz2);
233 dx33 = _mm_sub_pd(ix3,jx3);
234 dy33 = _mm_sub_pd(iy3,jy3);
235 dz33 = _mm_sub_pd(iz3,jz3);
237 /* Calculate squared distance and things based on it */
238 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
239 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
240 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
241 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
242 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
243 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
244 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
245 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
246 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
247 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
/* No rinv00 is computed: the 00 pair only uses rinvsq00, which is obtained
 * directly from rsq00 through gmx_mm_inv_pd() further down. */
249 rinv11 = gmx_mm_invsqrt_pd(rsq11);
250 rinv12 = gmx_mm_invsqrt_pd(rsq12);
251 rinv13 = gmx_mm_invsqrt_pd(rsq13);
252 rinv21 = gmx_mm_invsqrt_pd(rsq21);
253 rinv22 = gmx_mm_invsqrt_pd(rsq22);
254 rinv23 = gmx_mm_invsqrt_pd(rsq23);
255 rinv31 = gmx_mm_invsqrt_pd(rsq31);
256 rinv32 = gmx_mm_invsqrt_pd(rsq32);
257 rinv33 = gmx_mm_invsqrt_pd(rsq33);
259 rinvsq00 = gmx_mm_inv_pd(rsq00);
260 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
261 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
262 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
263 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
264 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
265 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
266 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
267 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
268 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
/* Per-j force accumulators; they are handed to
 * gmx_mm_decrement_4rvec_2ptr_swizzle_pd() at the end of the iteration. */
270 fjx0 = _mm_setzero_pd();
271 fjy0 = _mm_setzero_pd();
272 fjz0 = _mm_setzero_pd();
273 fjx1 = _mm_setzero_pd();
274 fjy1 = _mm_setzero_pd();
275 fjz1 = _mm_setzero_pd();
276 fjx2 = _mm_setzero_pd();
277 fjy2 = _mm_setzero_pd();
278 fjz2 = _mm_setzero_pd();
279 fjx3 = _mm_setzero_pd();
280 fjy3 = _mm_setzero_pd();
281 fjz3 = _mm_setzero_pd();
283 /**************************
284 * CALCULATE INTERACTIONS *
285 **************************/
287 /* LENNARD-JONES DISPERSION/REPULSION */
/* With rinvsix = rinvsq00^3:
 *   vvdw6  = c6_00  * rinvsix
 *   vvdw12 = c12_00 * rinvsix^2
 *   vvdw   = vvdw12 * (1/12) - vvdw6 * (1/6)
 *   fvdw   = (vvdw12 - vvdw6) * rinvsq00 */
289 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
290 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
291 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
292 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
293 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
295 /* Update potential sum for this i atom from the interaction with this j atom. */
296 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
/* NOTE(review): fscal is used below, here and in every later pair section,
 * but its assignment is not visible in this excerpt. Presumably it is set
 * from fvdw in this section and from felec in the electrostatic sections;
 * confirm against the complete file. */
300 /* Calculate temporary vectorial force */
301 tx = _mm_mul_pd(fscal,dx00);
302 ty = _mm_mul_pd(fscal,dy00);
303 tz = _mm_mul_pd(fscal,dz00);
305 /* Update vectorial force */
/* The same tx,ty,tz are added to both the i and the j accumulators. */
306 fix0 = _mm_add_pd(fix0,tx);
307 fiy0 = _mm_add_pd(fiy0,ty);
308 fiz0 = _mm_add_pd(fiz0,tz);
310 fjx0 = _mm_add_pd(fjx0,tx);
311 fjy0 = _mm_add_pd(fjy0,ty);
312 fjz0 = _mm_add_pd(fjz0,tz);
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 /* REACTION-FIELD ELECTROSTATICS */
/* The same two expressions are repeated for each of the nine pairs:
 *   velec = qq * (rinv + krf*rsq - crf)
 *   felec = qq * (rinv*rinvsq - krf2),   where krf2 = 2 * k_rf */
319 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
320 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
322 /* Update potential sum for this i atom from the interaction with this j atom. */
323 velecsum = _mm_add_pd(velecsum,velec);
327 /* Calculate temporary vectorial force */
328 tx = _mm_mul_pd(fscal,dx11);
329 ty = _mm_mul_pd(fscal,dy11);
330 tz = _mm_mul_pd(fscal,dz11);
332 /* Update vectorial force */
333 fix1 = _mm_add_pd(fix1,tx);
334 fiy1 = _mm_add_pd(fiy1,ty);
335 fiz1 = _mm_add_pd(fiz1,tz);
337 fjx1 = _mm_add_pd(fjx1,tx);
338 fjy1 = _mm_add_pd(fjy1,ty);
339 fjz1 = _mm_add_pd(fjz1,tz);
341 /**************************
342 * CALCULATE INTERACTIONS *
343 **************************/
345 /* REACTION-FIELD ELECTROSTATICS */
346 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
347 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velecsum = _mm_add_pd(velecsum,velec);
354 /* Calculate temporary vectorial force */
355 tx = _mm_mul_pd(fscal,dx12);
356 ty = _mm_mul_pd(fscal,dy12);
357 tz = _mm_mul_pd(fscal,dz12);
359 /* Update vectorial force */
360 fix1 = _mm_add_pd(fix1,tx);
361 fiy1 = _mm_add_pd(fiy1,ty);
362 fiz1 = _mm_add_pd(fiz1,tz);
364 fjx2 = _mm_add_pd(fjx2,tx);
365 fjy2 = _mm_add_pd(fjy2,ty);
366 fjz2 = _mm_add_pd(fjz2,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 /* REACTION-FIELD ELECTROSTATICS */
373 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
374 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velecsum = _mm_add_pd(velecsum,velec);
381 /* Calculate temporary vectorial force */
382 tx = _mm_mul_pd(fscal,dx13);
383 ty = _mm_mul_pd(fscal,dy13);
384 tz = _mm_mul_pd(fscal,dz13);
386 /* Update vectorial force */
387 fix1 = _mm_add_pd(fix1,tx);
388 fiy1 = _mm_add_pd(fiy1,ty);
389 fiz1 = _mm_add_pd(fiz1,tz);
391 fjx3 = _mm_add_pd(fjx3,tx);
392 fjy3 = _mm_add_pd(fjy3,ty);
393 fjz3 = _mm_add_pd(fjz3,tz);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
399 /* REACTION-FIELD ELECTROSTATICS */
400 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
401 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum = _mm_add_pd(velecsum,velec);
408 /* Calculate temporary vectorial force */
409 tx = _mm_mul_pd(fscal,dx21);
410 ty = _mm_mul_pd(fscal,dy21);
411 tz = _mm_mul_pd(fscal,dz21);
413 /* Update vectorial force */
414 fix2 = _mm_add_pd(fix2,tx);
415 fiy2 = _mm_add_pd(fiy2,ty);
416 fiz2 = _mm_add_pd(fiz2,tz);
418 fjx1 = _mm_add_pd(fjx1,tx);
419 fjy1 = _mm_add_pd(fjy1,ty);
420 fjz1 = _mm_add_pd(fjz1,tz);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 /* REACTION-FIELD ELECTROSTATICS */
427 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
428 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm_add_pd(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm_mul_pd(fscal,dx22);
437 ty = _mm_mul_pd(fscal,dy22);
438 tz = _mm_mul_pd(fscal,dz22);
440 /* Update vectorial force */
441 fix2 = _mm_add_pd(fix2,tx);
442 fiy2 = _mm_add_pd(fiy2,ty);
443 fiz2 = _mm_add_pd(fiz2,tz);
445 fjx2 = _mm_add_pd(fjx2,tx);
446 fjy2 = _mm_add_pd(fjy2,ty);
447 fjz2 = _mm_add_pd(fjz2,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 /* REACTION-FIELD ELECTROSTATICS */
454 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
455 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velecsum = _mm_add_pd(velecsum,velec);
462 /* Calculate temporary vectorial force */
463 tx = _mm_mul_pd(fscal,dx23);
464 ty = _mm_mul_pd(fscal,dy23);
465 tz = _mm_mul_pd(fscal,dz23);
467 /* Update vectorial force */
468 fix2 = _mm_add_pd(fix2,tx);
469 fiy2 = _mm_add_pd(fiy2,ty);
470 fiz2 = _mm_add_pd(fiz2,tz);
472 fjx3 = _mm_add_pd(fjx3,tx);
473 fjy3 = _mm_add_pd(fjy3,ty);
474 fjz3 = _mm_add_pd(fjz3,tz);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 /* REACTION-FIELD ELECTROSTATICS */
481 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
482 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm_add_pd(velecsum,velec);
489 /* Calculate temporary vectorial force */
490 tx = _mm_mul_pd(fscal,dx31);
491 ty = _mm_mul_pd(fscal,dy31);
492 tz = _mm_mul_pd(fscal,dz31);
494 /* Update vectorial force */
495 fix3 = _mm_add_pd(fix3,tx);
496 fiy3 = _mm_add_pd(fiy3,ty);
497 fiz3 = _mm_add_pd(fiz3,tz);
499 fjx1 = _mm_add_pd(fjx1,tx);
500 fjy1 = _mm_add_pd(fjy1,ty);
501 fjz1 = _mm_add_pd(fjz1,tz);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 /* REACTION-FIELD ELECTROSTATICS */
508 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
509 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velecsum = _mm_add_pd(velecsum,velec);
516 /* Calculate temporary vectorial force */
517 tx = _mm_mul_pd(fscal,dx32);
518 ty = _mm_mul_pd(fscal,dy32);
519 tz = _mm_mul_pd(fscal,dz32);
521 /* Update vectorial force */
522 fix3 = _mm_add_pd(fix3,tx);
523 fiy3 = _mm_add_pd(fiy3,ty);
524 fiz3 = _mm_add_pd(fiz3,tz);
526 fjx2 = _mm_add_pd(fjx2,tx);
527 fjy2 = _mm_add_pd(fjy2,ty);
528 fjz2 = _mm_add_pd(fjz2,tz);
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
534 /* REACTION-FIELD ELECTROSTATICS */
535 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
536 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velecsum = _mm_add_pd(velecsum,velec);
543 /* Calculate temporary vectorial force */
544 tx = _mm_mul_pd(fscal,dx33);
545 ty = _mm_mul_pd(fscal,dy33);
546 tz = _mm_mul_pd(fscal,dz33);
548 /* Update vectorial force */
549 fix3 = _mm_add_pd(fix3,tx);
550 fiy3 = _mm_add_pd(fiy3,ty);
551 fiz3 = _mm_add_pd(fiz3,tz);
553 fjx3 = _mm_add_pd(fjx3,tx);
554 fjy3 = _mm_add_pd(fjy3,ty);
555 fjz3 = _mm_add_pd(fjz3,tz);
/* Hand the accumulated j forces for both j waters (A and B) to the
 * two-pointer decrement helper. */
557 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
559 /* Inner loop uses 323 flops */
/* Single-j variant of the section above: it uses the one-pointer (1ptr) load
 * and decrement helpers and only j_coord_offsetA. NOTE(review): the condition
 * guarding this section and the load of jnrA are not visible in this excerpt;
 * presumably it handles the odd entry left over by the loop above. */
566 j_coord_offsetA = DIM*jnrA;
568 /* load j atom coordinates */
569 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
570 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
571 &jy2,&jz2,&jx3,&jy3,&jz3);
573 /* Calculate displacement vector */
574 dx00 = _mm_sub_pd(ix0,jx0);
575 dy00 = _mm_sub_pd(iy0,jy0);
576 dz00 = _mm_sub_pd(iz0,jz0);
577 dx11 = _mm_sub_pd(ix1,jx1);
578 dy11 = _mm_sub_pd(iy1,jy1);
579 dz11 = _mm_sub_pd(iz1,jz1);
580 dx12 = _mm_sub_pd(ix1,jx2);
581 dy12 = _mm_sub_pd(iy1,jy2);
582 dz12 = _mm_sub_pd(iz1,jz2);
583 dx13 = _mm_sub_pd(ix1,jx3);
584 dy13 = _mm_sub_pd(iy1,jy3);
585 dz13 = _mm_sub_pd(iz1,jz3);
586 dx21 = _mm_sub_pd(ix2,jx1);
587 dy21 = _mm_sub_pd(iy2,jy1);
588 dz21 = _mm_sub_pd(iz2,jz1);
589 dx22 = _mm_sub_pd(ix2,jx2);
590 dy22 = _mm_sub_pd(iy2,jy2);
591 dz22 = _mm_sub_pd(iz2,jz2);
592 dx23 = _mm_sub_pd(ix2,jx3);
593 dy23 = _mm_sub_pd(iy2,jy3);
594 dz23 = _mm_sub_pd(iz2,jz3);
595 dx31 = _mm_sub_pd(ix3,jx1);
596 dy31 = _mm_sub_pd(iy3,jy1);
597 dz31 = _mm_sub_pd(iz3,jz1);
598 dx32 = _mm_sub_pd(ix3,jx2);
599 dy32 = _mm_sub_pd(iy3,jy2);
600 dz32 = _mm_sub_pd(iz3,jz2);
601 dx33 = _mm_sub_pd(ix3,jx3);
602 dy33 = _mm_sub_pd(iy3,jy3);
603 dz33 = _mm_sub_pd(iz3,jz3);
605 /* Calculate squared distance and things based on it */
606 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
607 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
608 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
609 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
610 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
611 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
612 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
613 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
614 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
615 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
617 rinv11 = gmx_mm_invsqrt_pd(rsq11);
618 rinv12 = gmx_mm_invsqrt_pd(rsq12);
619 rinv13 = gmx_mm_invsqrt_pd(rsq13);
620 rinv21 = gmx_mm_invsqrt_pd(rsq21);
621 rinv22 = gmx_mm_invsqrt_pd(rsq22);
622 rinv23 = gmx_mm_invsqrt_pd(rsq23);
623 rinv31 = gmx_mm_invsqrt_pd(rsq31);
624 rinv32 = gmx_mm_invsqrt_pd(rsq32);
625 rinv33 = gmx_mm_invsqrt_pd(rsq33);
627 rinvsq00 = gmx_mm_inv_pd(rsq00);
628 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
629 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
630 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
631 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
632 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
633 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
634 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
635 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
636 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
638 fjx0 = _mm_setzero_pd();
639 fjy0 = _mm_setzero_pd();
640 fjz0 = _mm_setzero_pd();
641 fjx1 = _mm_setzero_pd();
642 fjy1 = _mm_setzero_pd();
643 fjz1 = _mm_setzero_pd();
644 fjx2 = _mm_setzero_pd();
645 fjy2 = _mm_setzero_pd();
646 fjz2 = _mm_setzero_pd();
647 fjx3 = _mm_setzero_pd();
648 fjy3 = _mm_setzero_pd();
649 fjz3 = _mm_setzero_pd();
651 /**************************
652 * CALCULATE INTERACTIONS *
653 **************************/
655 /* LENNARD-JONES DISPERSION/REPULSION */
657 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
658 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
659 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
660 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
661 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
663 /* Update potential sum for this i atom from the interaction with this j atom. */
/* _mm_unpacklo_pd(v, zero) keeps the low element of v and puts zero in the
 * high element, so the unused second SIMD element adds nothing to the sums.
 * The same masking is applied to velec and fscal in every pair section of
 * this single-j part. */
664 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
665 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
669 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
671 /* Calculate temporary vectorial force */
672 tx = _mm_mul_pd(fscal,dx00);
673 ty = _mm_mul_pd(fscal,dy00);
674 tz = _mm_mul_pd(fscal,dz00);
676 /* Update vectorial force */
677 fix0 = _mm_add_pd(fix0,tx);
678 fiy0 = _mm_add_pd(fiy0,ty);
679 fiz0 = _mm_add_pd(fiz0,tz);
681 fjx0 = _mm_add_pd(fjx0,tx);
682 fjy0 = _mm_add_pd(fjy0,ty);
683 fjz0 = _mm_add_pd(fjz0,tz);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 /* REACTION-FIELD ELECTROSTATICS */
690 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
691 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
695 velecsum = _mm_add_pd(velecsum,velec);
699 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
701 /* Calculate temporary vectorial force */
702 tx = _mm_mul_pd(fscal,dx11);
703 ty = _mm_mul_pd(fscal,dy11);
704 tz = _mm_mul_pd(fscal,dz11);
706 /* Update vectorial force */
707 fix1 = _mm_add_pd(fix1,tx);
708 fiy1 = _mm_add_pd(fiy1,ty);
709 fiz1 = _mm_add_pd(fiz1,tz);
711 fjx1 = _mm_add_pd(fjx1,tx);
712 fjy1 = _mm_add_pd(fjy1,ty);
713 fjz1 = _mm_add_pd(fjz1,tz);
715 /**************************
716 * CALCULATE INTERACTIONS *
717 **************************/
719 /* REACTION-FIELD ELECTROSTATICS */
720 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
721 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
723 /* Update potential sum for this i atom from the interaction with this j atom. */
724 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
725 velecsum = _mm_add_pd(velecsum,velec);
729 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
731 /* Calculate temporary vectorial force */
732 tx = _mm_mul_pd(fscal,dx12);
733 ty = _mm_mul_pd(fscal,dy12);
734 tz = _mm_mul_pd(fscal,dz12);
736 /* Update vectorial force */
737 fix1 = _mm_add_pd(fix1,tx);
738 fiy1 = _mm_add_pd(fiy1,ty);
739 fiz1 = _mm_add_pd(fiz1,tz);
741 fjx2 = _mm_add_pd(fjx2,tx);
742 fjy2 = _mm_add_pd(fjy2,ty);
743 fjz2 = _mm_add_pd(fjz2,tz);
745 /**************************
746 * CALCULATE INTERACTIONS *
747 **************************/
749 /* REACTION-FIELD ELECTROSTATICS */
750 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
751 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
753 /* Update potential sum for this i atom from the interaction with this j atom. */
754 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
755 velecsum = _mm_add_pd(velecsum,velec);
759 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
761 /* Calculate temporary vectorial force */
762 tx = _mm_mul_pd(fscal,dx13);
763 ty = _mm_mul_pd(fscal,dy13);
764 tz = _mm_mul_pd(fscal,dz13);
766 /* Update vectorial force */
767 fix1 = _mm_add_pd(fix1,tx);
768 fiy1 = _mm_add_pd(fiy1,ty);
769 fiz1 = _mm_add_pd(fiz1,tz);
771 fjx3 = _mm_add_pd(fjx3,tx);
772 fjy3 = _mm_add_pd(fjy3,ty);
773 fjz3 = _mm_add_pd(fjz3,tz);
775 /**************************
776 * CALCULATE INTERACTIONS *
777 **************************/
779 /* REACTION-FIELD ELECTROSTATICS */
780 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
781 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
783 /* Update potential sum for this i atom from the interaction with this j atom. */
784 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
785 velecsum = _mm_add_pd(velecsum,velec);
789 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
791 /* Calculate temporary vectorial force */
792 tx = _mm_mul_pd(fscal,dx21);
793 ty = _mm_mul_pd(fscal,dy21);
794 tz = _mm_mul_pd(fscal,dz21);
796 /* Update vectorial force */
797 fix2 = _mm_add_pd(fix2,tx);
798 fiy2 = _mm_add_pd(fiy2,ty);
799 fiz2 = _mm_add_pd(fiz2,tz);
801 fjx1 = _mm_add_pd(fjx1,tx);
802 fjy1 = _mm_add_pd(fjy1,ty);
803 fjz1 = _mm_add_pd(fjz1,tz);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 /* REACTION-FIELD ELECTROSTATICS */
810 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
811 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
815 velecsum = _mm_add_pd(velecsum,velec);
819 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
821 /* Calculate temporary vectorial force */
822 tx = _mm_mul_pd(fscal,dx22);
823 ty = _mm_mul_pd(fscal,dy22);
824 tz = _mm_mul_pd(fscal,dz22);
826 /* Update vectorial force */
827 fix2 = _mm_add_pd(fix2,tx);
828 fiy2 = _mm_add_pd(fiy2,ty);
829 fiz2 = _mm_add_pd(fiz2,tz);
831 fjx2 = _mm_add_pd(fjx2,tx);
832 fjy2 = _mm_add_pd(fjy2,ty);
833 fjz2 = _mm_add_pd(fjz2,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 /* REACTION-FIELD ELECTROSTATICS */
840 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
841 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
843 /* Update potential sum for this i atom from the interaction with this j atom. */
844 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
845 velecsum = _mm_add_pd(velecsum,velec);
849 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
851 /* Calculate temporary vectorial force */
852 tx = _mm_mul_pd(fscal,dx23);
853 ty = _mm_mul_pd(fscal,dy23);
854 tz = _mm_mul_pd(fscal,dz23);
856 /* Update vectorial force */
857 fix2 = _mm_add_pd(fix2,tx);
858 fiy2 = _mm_add_pd(fiy2,ty);
859 fiz2 = _mm_add_pd(fiz2,tz);
861 fjx3 = _mm_add_pd(fjx3,tx);
862 fjy3 = _mm_add_pd(fjy3,ty);
863 fjz3 = _mm_add_pd(fjz3,tz);
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 /* REACTION-FIELD ELECTROSTATICS */
870 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
871 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
875 velecsum = _mm_add_pd(velecsum,velec);
879 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_pd(fscal,dx31);
883 ty = _mm_mul_pd(fscal,dy31);
884 tz = _mm_mul_pd(fscal,dz31);
886 /* Update vectorial force */
887 fix3 = _mm_add_pd(fix3,tx);
888 fiy3 = _mm_add_pd(fiy3,ty);
889 fiz3 = _mm_add_pd(fiz3,tz);
891 fjx1 = _mm_add_pd(fjx1,tx);
892 fjy1 = _mm_add_pd(fjy1,ty);
893 fjz1 = _mm_add_pd(fjz1,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 /* REACTION-FIELD ELECTROSTATICS */
900 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
901 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
903 /* Update potential sum for this i atom from the interaction with this j atom. */
904 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
905 velecsum = _mm_add_pd(velecsum,velec);
909 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
911 /* Calculate temporary vectorial force */
912 tx = _mm_mul_pd(fscal,dx32);
913 ty = _mm_mul_pd(fscal,dy32);
914 tz = _mm_mul_pd(fscal,dz32);
916 /* Update vectorial force */
917 fix3 = _mm_add_pd(fix3,tx);
918 fiy3 = _mm_add_pd(fiy3,ty);
919 fiz3 = _mm_add_pd(fiz3,tz);
921 fjx2 = _mm_add_pd(fjx2,tx);
922 fjy2 = _mm_add_pd(fjy2,ty);
923 fjz2 = _mm_add_pd(fjz2,tz);
925 /**************************
926 * CALCULATE INTERACTIONS *
927 **************************/
929 /* REACTION-FIELD ELECTROSTATICS */
930 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
931 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
935 velecsum = _mm_add_pd(velecsum,velec);
939 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
941 /* Calculate temporary vectorial force */
942 tx = _mm_mul_pd(fscal,dx33);
943 ty = _mm_mul_pd(fscal,dy33);
944 tz = _mm_mul_pd(fscal,dz33);
946 /* Update vectorial force */
947 fix3 = _mm_add_pd(fix3,tx);
948 fiy3 = _mm_add_pd(fiy3,ty);
949 fiz3 = _mm_add_pd(fiz3,tz);
951 fjx3 = _mm_add_pd(fjx3,tx);
952 fjy3 = _mm_add_pd(fjy3,ty);
953 fjz3 = _mm_add_pd(fjz3,tz);
955 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
957 /* Inner loop uses 323 flops */
960 /* End of innermost loop */
/* The accumulated i forces go to the force array at the i offset and to
 * fshift at the shift offset, in one helper call. */
962 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
963 f+i_coord_offset,fshift+i_shift_offset);
966 /* Update potential energies */
/* NOTE(review): the assignment of ggid is not visible in this excerpt. */
967 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
968 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
970 /* Increment number of inner iterations */
/* Counts every j entry in the range, whether it was handled in the paired
 * loop or in the single-j section. */
971 inneriter += j_index_end - j_index_start;
973 /* Outer loop uses 26 flops */
976 /* Increment number of outer iterations */
979 /* Update outer/inner flops */
/* The weights 26 and 323 are the per-iteration flop counts quoted in the
 * "Outer loop uses" and "Inner loop uses" comments above. */
981 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323);
984 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_double
985 * Electrostatics interaction: ReactionField
986 * VdW interaction: LennardJones
987 * Geometry: Water4-Water4
988 * Calculate force/pot: Force
991 nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_double
992 (t_nblist * gmx_restrict nlist,
993 rvec * gmx_restrict xx,
994 rvec * gmx_restrict ff,
995 t_forcerec * gmx_restrict fr,
996 t_mdatoms * gmx_restrict mdatoms,
997 nb_kernel_data_t * gmx_restrict kernel_data,
998 t_nrnb * gmx_restrict nrnb)
1000 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1001 * just 0 for non-waters.
1002 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1003 * jnr indices corresponding to data put in the four positions in the SIMD register.
1005 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1006 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1008 int j_coord_offsetA,j_coord_offsetB;
1009 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1010 real rcutoff_scalar;
1011 real *shiftvec,*fshift,*x,*f;
1012 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1014 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1016 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1018 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1020 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1021 int vdwjidx0A,vdwjidx0B;
1022 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1023 int vdwjidx1A,vdwjidx1B;
1024 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1025 int vdwjidx2A,vdwjidx2B;
1026 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1027 int vdwjidx3A,vdwjidx3B;
1028 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1029 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1030 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1031 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1032 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1033 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1034 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1035 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1036 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1037 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1038 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1039 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1042 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1045 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1046 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1047 __m128d dummy_mask,cutoff_mask;
1048 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1049 __m128d one = _mm_set1_pd(1.0);
1050 __m128d two = _mm_set1_pd(2.0);
1056 jindex = nlist->jindex;
1058 shiftidx = nlist->shift;
1060 shiftvec = fr->shift_vec[0];
1061 fshift = fr->fshift[0];
1062 facel = _mm_set1_pd(fr->epsfac);
1063 charge = mdatoms->chargeA;
1064 krf = _mm_set1_pd(fr->ic->k_rf);
1065 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1066 crf = _mm_set1_pd(fr->ic->c_rf);
1067 nvdwtype = fr->ntype;
1068 vdwparam = fr->nbfp;
1069 vdwtype = mdatoms->typeA;
1071 /* Setup water-specific parameters */
1072 inr = nlist->iinr[0];
1073 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1074 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1075 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1076 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1078 jq1 = _mm_set1_pd(charge[inr+1]);
1079 jq2 = _mm_set1_pd(charge[inr+2]);
1080 jq3 = _mm_set1_pd(charge[inr+3]);
1081 vdwjidx0A = 2*vdwtype[inr+0];
1082 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1083 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1084 qq11 = _mm_mul_pd(iq1,jq1);
1085 qq12 = _mm_mul_pd(iq1,jq2);
1086 qq13 = _mm_mul_pd(iq1,jq3);
1087 qq21 = _mm_mul_pd(iq2,jq1);
1088 qq22 = _mm_mul_pd(iq2,jq2);
1089 qq23 = _mm_mul_pd(iq2,jq3);
1090 qq31 = _mm_mul_pd(iq3,jq1);
1091 qq32 = _mm_mul_pd(iq3,jq2);
1092 qq33 = _mm_mul_pd(iq3,jq3);
1094 /* Avoid stupid compiler warnings */
1096 j_coord_offsetA = 0;
1097 j_coord_offsetB = 0;
1102 /* Start outer loop over neighborlists */
1103 for(iidx=0; iidx<nri; iidx++)
1105 /* Load shift vector for this list */
1106 i_shift_offset = DIM*shiftidx[iidx];
1108 /* Load limits for loop over neighbors */
1109 j_index_start = jindex[iidx];
1110 j_index_end = jindex[iidx+1];
1112 /* Get outer coordinate index */
1114 i_coord_offset = DIM*inr;
1116 /* Load i particle coords and add shift vector */
1117 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1118 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1120 fix0 = _mm_setzero_pd();
1121 fiy0 = _mm_setzero_pd();
1122 fiz0 = _mm_setzero_pd();
1123 fix1 = _mm_setzero_pd();
1124 fiy1 = _mm_setzero_pd();
1125 fiz1 = _mm_setzero_pd();
1126 fix2 = _mm_setzero_pd();
1127 fiy2 = _mm_setzero_pd();
1128 fiz2 = _mm_setzero_pd();
1129 fix3 = _mm_setzero_pd();
1130 fiy3 = _mm_setzero_pd();
1131 fiz3 = _mm_setzero_pd();
1133 /* Start inner kernel loop */
1134 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1137 /* Get j neighbor index, and coordinate index */
1139 jnrB = jjnr[jidx+1];
1140 j_coord_offsetA = DIM*jnrA;
1141 j_coord_offsetB = DIM*jnrB;
1143 /* load j atom coordinates */
1144 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1145 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1146 &jy2,&jz2,&jx3,&jy3,&jz3);
1148 /* Calculate displacement vector */
1149 dx00 = _mm_sub_pd(ix0,jx0);
1150 dy00 = _mm_sub_pd(iy0,jy0);
1151 dz00 = _mm_sub_pd(iz0,jz0);
1152 dx11 = _mm_sub_pd(ix1,jx1);
1153 dy11 = _mm_sub_pd(iy1,jy1);
1154 dz11 = _mm_sub_pd(iz1,jz1);
1155 dx12 = _mm_sub_pd(ix1,jx2);
1156 dy12 = _mm_sub_pd(iy1,jy2);
1157 dz12 = _mm_sub_pd(iz1,jz2);
1158 dx13 = _mm_sub_pd(ix1,jx3);
1159 dy13 = _mm_sub_pd(iy1,jy3);
1160 dz13 = _mm_sub_pd(iz1,jz3);
1161 dx21 = _mm_sub_pd(ix2,jx1);
1162 dy21 = _mm_sub_pd(iy2,jy1);
1163 dz21 = _mm_sub_pd(iz2,jz1);
1164 dx22 = _mm_sub_pd(ix2,jx2);
1165 dy22 = _mm_sub_pd(iy2,jy2);
1166 dz22 = _mm_sub_pd(iz2,jz2);
1167 dx23 = _mm_sub_pd(ix2,jx3);
1168 dy23 = _mm_sub_pd(iy2,jy3);
1169 dz23 = _mm_sub_pd(iz2,jz3);
1170 dx31 = _mm_sub_pd(ix3,jx1);
1171 dy31 = _mm_sub_pd(iy3,jy1);
1172 dz31 = _mm_sub_pd(iz3,jz1);
1173 dx32 = _mm_sub_pd(ix3,jx2);
1174 dy32 = _mm_sub_pd(iy3,jy2);
1175 dz32 = _mm_sub_pd(iz3,jz2);
1176 dx33 = _mm_sub_pd(ix3,jx3);
1177 dy33 = _mm_sub_pd(iy3,jy3);
1178 dz33 = _mm_sub_pd(iz3,jz3);
1180 /* Calculate squared distance and things based on it */
1181 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1182 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1183 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1184 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1185 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1186 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1187 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1188 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1189 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1190 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1192 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1193 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1194 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1195 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1196 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1197 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1198 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1199 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1200 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1202 rinvsq00 = gmx_mm_inv_pd(rsq00);
1203 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1204 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1205 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1206 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1207 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1208 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1209 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1210 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1211 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1213 fjx0 = _mm_setzero_pd();
1214 fjy0 = _mm_setzero_pd();
1215 fjz0 = _mm_setzero_pd();
1216 fjx1 = _mm_setzero_pd();
1217 fjy1 = _mm_setzero_pd();
1218 fjz1 = _mm_setzero_pd();
1219 fjx2 = _mm_setzero_pd();
1220 fjy2 = _mm_setzero_pd();
1221 fjz2 = _mm_setzero_pd();
1222 fjx3 = _mm_setzero_pd();
1223 fjy3 = _mm_setzero_pd();
1224 fjz3 = _mm_setzero_pd();
1226 /**************************
1227 * CALCULATE INTERACTIONS *
1228 **************************/
1230 /* LENNARD-JONES DISPERSION/REPULSION */
1232 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1233 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1237 /* Calculate temporary vectorial force */
1238 tx = _mm_mul_pd(fscal,dx00);
1239 ty = _mm_mul_pd(fscal,dy00);
1240 tz = _mm_mul_pd(fscal,dz00);
1242 /* Update vectorial force */
1243 fix0 = _mm_add_pd(fix0,tx);
1244 fiy0 = _mm_add_pd(fiy0,ty);
1245 fiz0 = _mm_add_pd(fiz0,tz);
1247 fjx0 = _mm_add_pd(fjx0,tx);
1248 fjy0 = _mm_add_pd(fjy0,ty);
1249 fjz0 = _mm_add_pd(fjz0,tz);
1251 /**************************
1252 * CALCULATE INTERACTIONS *
1253 **************************/
1255 /* REACTION-FIELD ELECTROSTATICS */
1256 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1260 /* Calculate temporary vectorial force */
1261 tx = _mm_mul_pd(fscal,dx11);
1262 ty = _mm_mul_pd(fscal,dy11);
1263 tz = _mm_mul_pd(fscal,dz11);
1265 /* Update vectorial force */
1266 fix1 = _mm_add_pd(fix1,tx);
1267 fiy1 = _mm_add_pd(fiy1,ty);
1268 fiz1 = _mm_add_pd(fiz1,tz);
1270 fjx1 = _mm_add_pd(fjx1,tx);
1271 fjy1 = _mm_add_pd(fjy1,ty);
1272 fjz1 = _mm_add_pd(fjz1,tz);
1274 /**************************
1275 * CALCULATE INTERACTIONS *
1276 **************************/
1278 /* REACTION-FIELD ELECTROSTATICS */
1279 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1283 /* Calculate temporary vectorial force */
1284 tx = _mm_mul_pd(fscal,dx12);
1285 ty = _mm_mul_pd(fscal,dy12);
1286 tz = _mm_mul_pd(fscal,dz12);
1288 /* Update vectorial force */
1289 fix1 = _mm_add_pd(fix1,tx);
1290 fiy1 = _mm_add_pd(fiy1,ty);
1291 fiz1 = _mm_add_pd(fiz1,tz);
1293 fjx2 = _mm_add_pd(fjx2,tx);
1294 fjy2 = _mm_add_pd(fjy2,ty);
1295 fjz2 = _mm_add_pd(fjz2,tz);
1297 /**************************
1298 * CALCULATE INTERACTIONS *
1299 **************************/
1301 /* REACTION-FIELD ELECTROSTATICS */
1302 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1306 /* Calculate temporary vectorial force */
1307 tx = _mm_mul_pd(fscal,dx13);
1308 ty = _mm_mul_pd(fscal,dy13);
1309 tz = _mm_mul_pd(fscal,dz13);
1311 /* Update vectorial force */
1312 fix1 = _mm_add_pd(fix1,tx);
1313 fiy1 = _mm_add_pd(fiy1,ty);
1314 fiz1 = _mm_add_pd(fiz1,tz);
1316 fjx3 = _mm_add_pd(fjx3,tx);
1317 fjy3 = _mm_add_pd(fjy3,ty);
1318 fjz3 = _mm_add_pd(fjz3,tz);
1320 /**************************
1321 * CALCULATE INTERACTIONS *
1322 **************************/
1324 /* REACTION-FIELD ELECTROSTATICS */
1325 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1329 /* Calculate temporary vectorial force */
1330 tx = _mm_mul_pd(fscal,dx21);
1331 ty = _mm_mul_pd(fscal,dy21);
1332 tz = _mm_mul_pd(fscal,dz21);
1334 /* Update vectorial force */
1335 fix2 = _mm_add_pd(fix2,tx);
1336 fiy2 = _mm_add_pd(fiy2,ty);
1337 fiz2 = _mm_add_pd(fiz2,tz);
1339 fjx1 = _mm_add_pd(fjx1,tx);
1340 fjy1 = _mm_add_pd(fjy1,ty);
1341 fjz1 = _mm_add_pd(fjz1,tz);
1343 /**************************
1344 * CALCULATE INTERACTIONS *
1345 **************************/
1347 /* REACTION-FIELD ELECTROSTATICS */
1348 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1352 /* Calculate temporary vectorial force */
1353 tx = _mm_mul_pd(fscal,dx22);
1354 ty = _mm_mul_pd(fscal,dy22);
1355 tz = _mm_mul_pd(fscal,dz22);
1357 /* Update vectorial force */
1358 fix2 = _mm_add_pd(fix2,tx);
1359 fiy2 = _mm_add_pd(fiy2,ty);
1360 fiz2 = _mm_add_pd(fiz2,tz);
1362 fjx2 = _mm_add_pd(fjx2,tx);
1363 fjy2 = _mm_add_pd(fjy2,ty);
1364 fjz2 = _mm_add_pd(fjz2,tz);
1366 /**************************
1367 * CALCULATE INTERACTIONS *
1368 **************************/
1370 /* REACTION-FIELD ELECTROSTATICS */
1371 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1375 /* Calculate temporary vectorial force */
1376 tx = _mm_mul_pd(fscal,dx23);
1377 ty = _mm_mul_pd(fscal,dy23);
1378 tz = _mm_mul_pd(fscal,dz23);
1380 /* Update vectorial force */
1381 fix2 = _mm_add_pd(fix2,tx);
1382 fiy2 = _mm_add_pd(fiy2,ty);
1383 fiz2 = _mm_add_pd(fiz2,tz);
1385 fjx3 = _mm_add_pd(fjx3,tx);
1386 fjy3 = _mm_add_pd(fjy3,ty);
1387 fjz3 = _mm_add_pd(fjz3,tz);
1389 /**************************
1390 * CALCULATE INTERACTIONS *
1391 **************************/
1393 /* REACTION-FIELD ELECTROSTATICS */
1394 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1398 /* Calculate temporary vectorial force */
1399 tx = _mm_mul_pd(fscal,dx31);
1400 ty = _mm_mul_pd(fscal,dy31);
1401 tz = _mm_mul_pd(fscal,dz31);
1403 /* Update vectorial force */
1404 fix3 = _mm_add_pd(fix3,tx);
1405 fiy3 = _mm_add_pd(fiy3,ty);
1406 fiz3 = _mm_add_pd(fiz3,tz);
1408 fjx1 = _mm_add_pd(fjx1,tx);
1409 fjy1 = _mm_add_pd(fjy1,ty);
1410 fjz1 = _mm_add_pd(fjz1,tz);
1412 /**************************
1413 * CALCULATE INTERACTIONS *
1414 **************************/
1416 /* REACTION-FIELD ELECTROSTATICS */
1417 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1421 /* Calculate temporary vectorial force */
1422 tx = _mm_mul_pd(fscal,dx32);
1423 ty = _mm_mul_pd(fscal,dy32);
1424 tz = _mm_mul_pd(fscal,dz32);
1426 /* Update vectorial force */
1427 fix3 = _mm_add_pd(fix3,tx);
1428 fiy3 = _mm_add_pd(fiy3,ty);
1429 fiz3 = _mm_add_pd(fiz3,tz);
1431 fjx2 = _mm_add_pd(fjx2,tx);
1432 fjy2 = _mm_add_pd(fjy2,ty);
1433 fjz2 = _mm_add_pd(fjz2,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 /* REACTION-FIELD ELECTROSTATICS */
1440 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1444 /* Calculate temporary vectorial force */
1445 tx = _mm_mul_pd(fscal,dx33);
1446 ty = _mm_mul_pd(fscal,dy33);
1447 tz = _mm_mul_pd(fscal,dz33);
1449 /* Update vectorial force */
1450 fix3 = _mm_add_pd(fix3,tx);
1451 fiy3 = _mm_add_pd(fiy3,ty);
1452 fiz3 = _mm_add_pd(fiz3,tz);
1454 fjx3 = _mm_add_pd(fjx3,tx);
1455 fjy3 = _mm_add_pd(fjy3,ty);
1456 fjz3 = _mm_add_pd(fjz3,tz);
1458 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1460 /* Inner loop uses 273 flops */
1463 if(jidx<j_index_end)
1467 j_coord_offsetA = DIM*jnrA;
1469 /* load j atom coordinates */
1470 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1471 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1472 &jy2,&jz2,&jx3,&jy3,&jz3);
1474 /* Calculate displacement vector */
1475 dx00 = _mm_sub_pd(ix0,jx0);
1476 dy00 = _mm_sub_pd(iy0,jy0);
1477 dz00 = _mm_sub_pd(iz0,jz0);
1478 dx11 = _mm_sub_pd(ix1,jx1);
1479 dy11 = _mm_sub_pd(iy1,jy1);
1480 dz11 = _mm_sub_pd(iz1,jz1);
1481 dx12 = _mm_sub_pd(ix1,jx2);
1482 dy12 = _mm_sub_pd(iy1,jy2);
1483 dz12 = _mm_sub_pd(iz1,jz2);
1484 dx13 = _mm_sub_pd(ix1,jx3);
1485 dy13 = _mm_sub_pd(iy1,jy3);
1486 dz13 = _mm_sub_pd(iz1,jz3);
1487 dx21 = _mm_sub_pd(ix2,jx1);
1488 dy21 = _mm_sub_pd(iy2,jy1);
1489 dz21 = _mm_sub_pd(iz2,jz1);
1490 dx22 = _mm_sub_pd(ix2,jx2);
1491 dy22 = _mm_sub_pd(iy2,jy2);
1492 dz22 = _mm_sub_pd(iz2,jz2);
1493 dx23 = _mm_sub_pd(ix2,jx3);
1494 dy23 = _mm_sub_pd(iy2,jy3);
1495 dz23 = _mm_sub_pd(iz2,jz3);
1496 dx31 = _mm_sub_pd(ix3,jx1);
1497 dy31 = _mm_sub_pd(iy3,jy1);
1498 dz31 = _mm_sub_pd(iz3,jz1);
1499 dx32 = _mm_sub_pd(ix3,jx2);
1500 dy32 = _mm_sub_pd(iy3,jy2);
1501 dz32 = _mm_sub_pd(iz3,jz2);
1502 dx33 = _mm_sub_pd(ix3,jx3);
1503 dy33 = _mm_sub_pd(iy3,jy3);
1504 dz33 = _mm_sub_pd(iz3,jz3);
1506 /* Calculate squared distance and things based on it */
1507 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1508 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1509 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1510 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1511 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1512 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1513 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1514 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1515 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1516 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1518 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1519 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1520 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1521 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1522 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1523 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1524 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1525 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1526 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1528 rinvsq00 = gmx_mm_inv_pd(rsq00);
1529 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1530 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1531 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1532 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1533 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1534 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1535 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1536 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1537 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1539 fjx0 = _mm_setzero_pd();
1540 fjy0 = _mm_setzero_pd();
1541 fjz0 = _mm_setzero_pd();
1542 fjx1 = _mm_setzero_pd();
1543 fjy1 = _mm_setzero_pd();
1544 fjz1 = _mm_setzero_pd();
1545 fjx2 = _mm_setzero_pd();
1546 fjy2 = _mm_setzero_pd();
1547 fjz2 = _mm_setzero_pd();
1548 fjx3 = _mm_setzero_pd();
1549 fjy3 = _mm_setzero_pd();
1550 fjz3 = _mm_setzero_pd();
1552 /**************************
1553 * CALCULATE INTERACTIONS *
1554 **************************/
1556 /* LENNARD-JONES DISPERSION/REPULSION */
1558 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1559 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1563 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1565 /* Calculate temporary vectorial force */
1566 tx = _mm_mul_pd(fscal,dx00);
1567 ty = _mm_mul_pd(fscal,dy00);
1568 tz = _mm_mul_pd(fscal,dz00);
1570 /* Update vectorial force */
1571 fix0 = _mm_add_pd(fix0,tx);
1572 fiy0 = _mm_add_pd(fiy0,ty);
1573 fiz0 = _mm_add_pd(fiz0,tz);
1575 fjx0 = _mm_add_pd(fjx0,tx);
1576 fjy0 = _mm_add_pd(fjy0,ty);
1577 fjz0 = _mm_add_pd(fjz0,tz);
1579 /**************************
1580 * CALCULATE INTERACTIONS *
1581 **************************/
1583 /* REACTION-FIELD ELECTROSTATICS */
1584 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1588 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1590 /* Calculate temporary vectorial force */
1591 tx = _mm_mul_pd(fscal,dx11);
1592 ty = _mm_mul_pd(fscal,dy11);
1593 tz = _mm_mul_pd(fscal,dz11);
1595 /* Update vectorial force */
1596 fix1 = _mm_add_pd(fix1,tx);
1597 fiy1 = _mm_add_pd(fiy1,ty);
1598 fiz1 = _mm_add_pd(fiz1,tz);
1600 fjx1 = _mm_add_pd(fjx1,tx);
1601 fjy1 = _mm_add_pd(fjy1,ty);
1602 fjz1 = _mm_add_pd(fjz1,tz);
1604 /**************************
1605 * CALCULATE INTERACTIONS *
1606 **************************/
1608 /* REACTION-FIELD ELECTROSTATICS */
1609 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1613 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1615 /* Calculate temporary vectorial force */
1616 tx = _mm_mul_pd(fscal,dx12);
1617 ty = _mm_mul_pd(fscal,dy12);
1618 tz = _mm_mul_pd(fscal,dz12);
1620 /* Update vectorial force */
1621 fix1 = _mm_add_pd(fix1,tx);
1622 fiy1 = _mm_add_pd(fiy1,ty);
1623 fiz1 = _mm_add_pd(fiz1,tz);
1625 fjx2 = _mm_add_pd(fjx2,tx);
1626 fjy2 = _mm_add_pd(fjy2,ty);
1627 fjz2 = _mm_add_pd(fjz2,tz);
1629 /**************************
1630 * CALCULATE INTERACTIONS *
1631 **************************/
1633 /* REACTION-FIELD ELECTROSTATICS */
1634 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1638 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1640 /* Calculate temporary vectorial force */
1641 tx = _mm_mul_pd(fscal,dx13);
1642 ty = _mm_mul_pd(fscal,dy13);
1643 tz = _mm_mul_pd(fscal,dz13);
1645 /* Update vectorial force */
1646 fix1 = _mm_add_pd(fix1,tx);
1647 fiy1 = _mm_add_pd(fiy1,ty);
1648 fiz1 = _mm_add_pd(fiz1,tz);
1650 fjx3 = _mm_add_pd(fjx3,tx);
1651 fjy3 = _mm_add_pd(fjy3,ty);
1652 fjz3 = _mm_add_pd(fjz3,tz);
1654 /**************************
1655 * CALCULATE INTERACTIONS *
1656 **************************/
1658 /* REACTION-FIELD ELECTROSTATICS */
1659 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1663 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1665 /* Calculate temporary vectorial force */
1666 tx = _mm_mul_pd(fscal,dx21);
1667 ty = _mm_mul_pd(fscal,dy21);
1668 tz = _mm_mul_pd(fscal,dz21);
1670 /* Update vectorial force */
1671 fix2 = _mm_add_pd(fix2,tx);
1672 fiy2 = _mm_add_pd(fiy2,ty);
1673 fiz2 = _mm_add_pd(fiz2,tz);
1675 fjx1 = _mm_add_pd(fjx1,tx);
1676 fjy1 = _mm_add_pd(fjy1,ty);
1677 fjz1 = _mm_add_pd(fjz1,tz);
1679 /**************************
1680 * CALCULATE INTERACTIONS *
1681 **************************/
1683 /* REACTION-FIELD ELECTROSTATICS */
1684 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1688 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1690 /* Calculate temporary vectorial force */
1691 tx = _mm_mul_pd(fscal,dx22);
1692 ty = _mm_mul_pd(fscal,dy22);
1693 tz = _mm_mul_pd(fscal,dz22);
1695 /* Update vectorial force */
1696 fix2 = _mm_add_pd(fix2,tx);
1697 fiy2 = _mm_add_pd(fiy2,ty);
1698 fiz2 = _mm_add_pd(fiz2,tz);
1700 fjx2 = _mm_add_pd(fjx2,tx);
1701 fjy2 = _mm_add_pd(fjy2,ty);
1702 fjz2 = _mm_add_pd(fjz2,tz);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 /* REACTION-FIELD ELECTROSTATICS */
1709 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1713 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1715 /* Calculate temporary vectorial force */
1716 tx = _mm_mul_pd(fscal,dx23);
1717 ty = _mm_mul_pd(fscal,dy23);
1718 tz = _mm_mul_pd(fscal,dz23);
1720 /* Update vectorial force */
1721 fix2 = _mm_add_pd(fix2,tx);
1722 fiy2 = _mm_add_pd(fiy2,ty);
1723 fiz2 = _mm_add_pd(fiz2,tz);
1725 fjx3 = _mm_add_pd(fjx3,tx);
1726 fjy3 = _mm_add_pd(fjy3,ty);
1727 fjz3 = _mm_add_pd(fjz3,tz);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 /* REACTION-FIELD ELECTROSTATICS */
1734 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1738 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1740 /* Calculate temporary vectorial force */
1741 tx = _mm_mul_pd(fscal,dx31);
1742 ty = _mm_mul_pd(fscal,dy31);
1743 tz = _mm_mul_pd(fscal,dz31);
1745 /* Update vectorial force */
1746 fix3 = _mm_add_pd(fix3,tx);
1747 fiy3 = _mm_add_pd(fiy3,ty);
1748 fiz3 = _mm_add_pd(fiz3,tz);
1750 fjx1 = _mm_add_pd(fjx1,tx);
1751 fjy1 = _mm_add_pd(fjy1,ty);
1752 fjz1 = _mm_add_pd(fjz1,tz);
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1758 /* REACTION-FIELD ELECTROSTATICS */
1759 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1763 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1765 /* Calculate temporary vectorial force */
1766 tx = _mm_mul_pd(fscal,dx32);
1767 ty = _mm_mul_pd(fscal,dy32);
1768 tz = _mm_mul_pd(fscal,dz32);
1770 /* Update vectorial force */
1771 fix3 = _mm_add_pd(fix3,tx);
1772 fiy3 = _mm_add_pd(fiy3,ty);
1773 fiz3 = _mm_add_pd(fiz3,tz);
1775 fjx2 = _mm_add_pd(fjx2,tx);
1776 fjy2 = _mm_add_pd(fjy2,ty);
1777 fjz2 = _mm_add_pd(fjz2,tz);
1779 /**************************
1780 * CALCULATE INTERACTIONS *
1781 **************************/
1783 /* REACTION-FIELD ELECTROSTATICS */
1784 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1788 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1790 /* Calculate temporary vectorial force */
1791 tx = _mm_mul_pd(fscal,dx33);
1792 ty = _mm_mul_pd(fscal,dy33);
1793 tz = _mm_mul_pd(fscal,dz33);
1795 /* Update vectorial force */
1796 fix3 = _mm_add_pd(fix3,tx);
1797 fiy3 = _mm_add_pd(fiy3,ty);
1798 fiz3 = _mm_add_pd(fiz3,tz);
1800 fjx3 = _mm_add_pd(fjx3,tx);
1801 fjy3 = _mm_add_pd(fjy3,ty);
1802 fjz3 = _mm_add_pd(fjz3,tz);
1804 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1806 /* Inner loop uses 273 flops */
1809 /* End of innermost loop */
1811 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1812 f+i_coord_offset,fshift+i_shift_offset);
1814 /* Increment number of inner iterations */
1815 inneriter += j_index_end - j_index_start;
1817 /* Outer loop uses 24 flops */
1820 /* Increment number of outer iterations */
1823 /* Update outer/inner flops */
1825 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);