2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
94 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
95 __m128 dummy_mask,cutoff_mask;
96 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
97 __m128 one = _mm_set1_ps(1.0);
98 __m128 two = _mm_set1_ps(2.0);
104 jindex = nlist->jindex;
106 shiftidx = nlist->shift;
108 shiftvec = fr->shift_vec[0];
109 fshift = fr->fshift[0];
110 facel = _mm_set1_ps(fr->epsfac);
111 charge = mdatoms->chargeA;
112 krf = _mm_set1_ps(fr->ic->k_rf);
113 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
114 crf = _mm_set1_ps(fr->ic->c_rf);
115 nvdwtype = fr->ntype;
117 vdwtype = mdatoms->typeA;
119 /* Setup water-specific parameters */
120 inr = nlist->iinr[0];
121 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
122 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
123 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
124 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
126 jq0 = _mm_set1_ps(charge[inr+0]);
127 jq1 = _mm_set1_ps(charge[inr+1]);
128 jq2 = _mm_set1_ps(charge[inr+2]);
129 vdwjidx0A = 2*vdwtype[inr+0];
130 qq00 = _mm_mul_ps(iq0,jq0);
131 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
132 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
133 qq01 = _mm_mul_ps(iq0,jq1);
134 qq02 = _mm_mul_ps(iq0,jq2);
135 qq10 = _mm_mul_ps(iq1,jq0);
136 qq11 = _mm_mul_ps(iq1,jq1);
137 qq12 = _mm_mul_ps(iq1,jq2);
138 qq20 = _mm_mul_ps(iq2,jq0);
139 qq21 = _mm_mul_ps(iq2,jq1);
140 qq22 = _mm_mul_ps(iq2,jq2);
142 /* Avoid stupid compiler warnings */
143 jnrA = jnrB = jnrC = jnrD = 0;
152 /* Start outer loop over neighborlists */
153 for(iidx=0; iidx<nri; iidx++)
155 /* Load shift vector for this list */
156 i_shift_offset = DIM*shiftidx[iidx];
157 shX = shiftvec[i_shift_offset+XX];
158 shY = shiftvec[i_shift_offset+YY];
159 shZ = shiftvec[i_shift_offset+ZZ];
161 /* Load limits for loop over neighbors */
162 j_index_start = jindex[iidx];
163 j_index_end = jindex[iidx+1];
165 /* Get outer coordinate index */
167 i_coord_offset = DIM*inr;
169 /* Load i particle coords and add shift vector */
170 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
171 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
172 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
173 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
174 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
175 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
176 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
177 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
178 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
180 fix0 = _mm_setzero_ps();
181 fiy0 = _mm_setzero_ps();
182 fiz0 = _mm_setzero_ps();
183 fix1 = _mm_setzero_ps();
184 fiy1 = _mm_setzero_ps();
185 fiz1 = _mm_setzero_ps();
186 fix2 = _mm_setzero_ps();
187 fiy2 = _mm_setzero_ps();
188 fiz2 = _mm_setzero_ps();
190 /* Reset potential sums */
191 velecsum = _mm_setzero_ps();
192 vvdwsum = _mm_setzero_ps();
194 /* Start inner kernel loop */
195 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
198 /* Get j neighbor index, and coordinate index */
204 j_coord_offsetA = DIM*jnrA;
205 j_coord_offsetB = DIM*jnrB;
206 j_coord_offsetC = DIM*jnrC;
207 j_coord_offsetD = DIM*jnrD;
209 /* load j atom coordinates */
210 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
211 x+j_coord_offsetC,x+j_coord_offsetD,
212 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
214 /* Calculate displacement vector */
215 dx00 = _mm_sub_ps(ix0,jx0);
216 dy00 = _mm_sub_ps(iy0,jy0);
217 dz00 = _mm_sub_ps(iz0,jz0);
218 dx01 = _mm_sub_ps(ix0,jx1);
219 dy01 = _mm_sub_ps(iy0,jy1);
220 dz01 = _mm_sub_ps(iz0,jz1);
221 dx02 = _mm_sub_ps(ix0,jx2);
222 dy02 = _mm_sub_ps(iy0,jy2);
223 dz02 = _mm_sub_ps(iz0,jz2);
224 dx10 = _mm_sub_ps(ix1,jx0);
225 dy10 = _mm_sub_ps(iy1,jy0);
226 dz10 = _mm_sub_ps(iz1,jz0);
227 dx11 = _mm_sub_ps(ix1,jx1);
228 dy11 = _mm_sub_ps(iy1,jy1);
229 dz11 = _mm_sub_ps(iz1,jz1);
230 dx12 = _mm_sub_ps(ix1,jx2);
231 dy12 = _mm_sub_ps(iy1,jy2);
232 dz12 = _mm_sub_ps(iz1,jz2);
233 dx20 = _mm_sub_ps(ix2,jx0);
234 dy20 = _mm_sub_ps(iy2,jy0);
235 dz20 = _mm_sub_ps(iz2,jz0);
236 dx21 = _mm_sub_ps(ix2,jx1);
237 dy21 = _mm_sub_ps(iy2,jy1);
238 dz21 = _mm_sub_ps(iz2,jz1);
239 dx22 = _mm_sub_ps(ix2,jx2);
240 dy22 = _mm_sub_ps(iy2,jy2);
241 dz22 = _mm_sub_ps(iz2,jz2);
243 /* Calculate squared distance and things based on it */
244 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
245 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
246 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
247 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
248 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
249 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
250 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
251 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
252 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
254 rinv00 = gmx_mm_invsqrt_ps(rsq00);
255 rinv01 = gmx_mm_invsqrt_ps(rsq01);
256 rinv02 = gmx_mm_invsqrt_ps(rsq02);
257 rinv10 = gmx_mm_invsqrt_ps(rsq10);
258 rinv11 = gmx_mm_invsqrt_ps(rsq11);
259 rinv12 = gmx_mm_invsqrt_ps(rsq12);
260 rinv20 = gmx_mm_invsqrt_ps(rsq20);
261 rinv21 = gmx_mm_invsqrt_ps(rsq21);
262 rinv22 = gmx_mm_invsqrt_ps(rsq22);
264 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
265 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
266 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
267 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
268 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
269 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
270 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
271 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
272 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
274 fjx0 = _mm_setzero_ps();
275 fjy0 = _mm_setzero_ps();
276 fjz0 = _mm_setzero_ps();
277 fjx1 = _mm_setzero_ps();
278 fjy1 = _mm_setzero_ps();
279 fjz1 = _mm_setzero_ps();
280 fjx2 = _mm_setzero_ps();
281 fjy2 = _mm_setzero_ps();
282 fjz2 = _mm_setzero_ps();
284 /**************************
285 * CALCULATE INTERACTIONS *
286 **************************/
288 /* REACTION-FIELD ELECTROSTATICS */
289 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
290 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
292 /* LENNARD-JONES DISPERSION/REPULSION */
294 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
295 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
296 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
297 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
298 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
300 /* Update potential sum for this i atom from the interaction with this j atom. */
301 velecsum = _mm_add_ps(velecsum,velec);
302 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
304 fscal = _mm_add_ps(felec,fvdw);
306 /* Calculate temporary vectorial force */
307 tx = _mm_mul_ps(fscal,dx00);
308 ty = _mm_mul_ps(fscal,dy00);
309 tz = _mm_mul_ps(fscal,dz00);
311 /* Update vectorial force */
312 fix0 = _mm_add_ps(fix0,tx);
313 fiy0 = _mm_add_ps(fiy0,ty);
314 fiz0 = _mm_add_ps(fiz0,tz);
316 fjx0 = _mm_add_ps(fjx0,tx);
317 fjy0 = _mm_add_ps(fjy0,ty);
318 fjz0 = _mm_add_ps(fjz0,tz);
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
324 /* REACTION-FIELD ELECTROSTATICS */
325 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
326 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
328 /* Update potential sum for this i atom from the interaction with this j atom. */
329 velecsum = _mm_add_ps(velecsum,velec);
333 /* Calculate temporary vectorial force */
334 tx = _mm_mul_ps(fscal,dx01);
335 ty = _mm_mul_ps(fscal,dy01);
336 tz = _mm_mul_ps(fscal,dz01);
338 /* Update vectorial force */
339 fix0 = _mm_add_ps(fix0,tx);
340 fiy0 = _mm_add_ps(fiy0,ty);
341 fiz0 = _mm_add_ps(fiz0,tz);
343 fjx1 = _mm_add_ps(fjx1,tx);
344 fjy1 = _mm_add_ps(fjy1,ty);
345 fjz1 = _mm_add_ps(fjz1,tz);
347 /**************************
348 * CALCULATE INTERACTIONS *
349 **************************/
351 /* REACTION-FIELD ELECTROSTATICS */
352 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
353 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 velecsum = _mm_add_ps(velecsum,velec);
360 /* Calculate temporary vectorial force */
361 tx = _mm_mul_ps(fscal,dx02);
362 ty = _mm_mul_ps(fscal,dy02);
363 tz = _mm_mul_ps(fscal,dz02);
365 /* Update vectorial force */
366 fix0 = _mm_add_ps(fix0,tx);
367 fiy0 = _mm_add_ps(fiy0,ty);
368 fiz0 = _mm_add_ps(fiz0,tz);
370 fjx2 = _mm_add_ps(fjx2,tx);
371 fjy2 = _mm_add_ps(fjy2,ty);
372 fjz2 = _mm_add_ps(fjz2,tz);
374 /**************************
375 * CALCULATE INTERACTIONS *
376 **************************/
378 /* REACTION-FIELD ELECTROSTATICS */
379 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
380 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
382 /* Update potential sum for this i atom from the interaction with this j atom. */
383 velecsum = _mm_add_ps(velecsum,velec);
387 /* Calculate temporary vectorial force */
388 tx = _mm_mul_ps(fscal,dx10);
389 ty = _mm_mul_ps(fscal,dy10);
390 tz = _mm_mul_ps(fscal,dz10);
392 /* Update vectorial force */
393 fix1 = _mm_add_ps(fix1,tx);
394 fiy1 = _mm_add_ps(fiy1,ty);
395 fiz1 = _mm_add_ps(fiz1,tz);
397 fjx0 = _mm_add_ps(fjx0,tx);
398 fjy0 = _mm_add_ps(fjy0,ty);
399 fjz0 = _mm_add_ps(fjz0,tz);
401 /**************************
402 * CALCULATE INTERACTIONS *
403 **************************/
405 /* REACTION-FIELD ELECTROSTATICS */
406 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
407 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velecsum = _mm_add_ps(velecsum,velec);
414 /* Calculate temporary vectorial force */
415 tx = _mm_mul_ps(fscal,dx11);
416 ty = _mm_mul_ps(fscal,dy11);
417 tz = _mm_mul_ps(fscal,dz11);
419 /* Update vectorial force */
420 fix1 = _mm_add_ps(fix1,tx);
421 fiy1 = _mm_add_ps(fiy1,ty);
422 fiz1 = _mm_add_ps(fiz1,tz);
424 fjx1 = _mm_add_ps(fjx1,tx);
425 fjy1 = _mm_add_ps(fjy1,ty);
426 fjz1 = _mm_add_ps(fjz1,tz);
428 /**************************
429 * CALCULATE INTERACTIONS *
430 **************************/
432 /* REACTION-FIELD ELECTROSTATICS */
433 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
434 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velecsum = _mm_add_ps(velecsum,velec);
441 /* Calculate temporary vectorial force */
442 tx = _mm_mul_ps(fscal,dx12);
443 ty = _mm_mul_ps(fscal,dy12);
444 tz = _mm_mul_ps(fscal,dz12);
446 /* Update vectorial force */
447 fix1 = _mm_add_ps(fix1,tx);
448 fiy1 = _mm_add_ps(fiy1,ty);
449 fiz1 = _mm_add_ps(fiz1,tz);
451 fjx2 = _mm_add_ps(fjx2,tx);
452 fjy2 = _mm_add_ps(fjy2,ty);
453 fjz2 = _mm_add_ps(fjz2,tz);
455 /**************************
456 * CALCULATE INTERACTIONS *
457 **************************/
459 /* REACTION-FIELD ELECTROSTATICS */
460 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
461 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
463 /* Update potential sum for this i atom from the interaction with this j atom. */
464 velecsum = _mm_add_ps(velecsum,velec);
468 /* Calculate temporary vectorial force */
469 tx = _mm_mul_ps(fscal,dx20);
470 ty = _mm_mul_ps(fscal,dy20);
471 tz = _mm_mul_ps(fscal,dz20);
473 /* Update vectorial force */
474 fix2 = _mm_add_ps(fix2,tx);
475 fiy2 = _mm_add_ps(fiy2,ty);
476 fiz2 = _mm_add_ps(fiz2,tz);
478 fjx0 = _mm_add_ps(fjx0,tx);
479 fjy0 = _mm_add_ps(fjy0,ty);
480 fjz0 = _mm_add_ps(fjz0,tz);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 /* REACTION-FIELD ELECTROSTATICS */
487 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
488 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
490 /* Update potential sum for this i atom from the interaction with this j atom. */
491 velecsum = _mm_add_ps(velecsum,velec);
495 /* Calculate temporary vectorial force */
496 tx = _mm_mul_ps(fscal,dx21);
497 ty = _mm_mul_ps(fscal,dy21);
498 tz = _mm_mul_ps(fscal,dz21);
500 /* Update vectorial force */
501 fix2 = _mm_add_ps(fix2,tx);
502 fiy2 = _mm_add_ps(fiy2,ty);
503 fiz2 = _mm_add_ps(fiz2,tz);
505 fjx1 = _mm_add_ps(fjx1,tx);
506 fjy1 = _mm_add_ps(fjy1,ty);
507 fjz1 = _mm_add_ps(fjz1,tz);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 /* REACTION-FIELD ELECTROSTATICS */
514 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
515 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
517 /* Update potential sum for this i atom from the interaction with this j atom. */
518 velecsum = _mm_add_ps(velecsum,velec);
522 /* Calculate temporary vectorial force */
523 tx = _mm_mul_ps(fscal,dx22);
524 ty = _mm_mul_ps(fscal,dy22);
525 tz = _mm_mul_ps(fscal,dz22);
527 /* Update vectorial force */
528 fix2 = _mm_add_ps(fix2,tx);
529 fiy2 = _mm_add_ps(fiy2,ty);
530 fiz2 = _mm_add_ps(fiz2,tz);
532 fjx2 = _mm_add_ps(fjx2,tx);
533 fjy2 = _mm_add_ps(fjy2,ty);
534 fjz2 = _mm_add_ps(fjz2,tz);
536 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
537 f+j_coord_offsetC,f+j_coord_offsetD,
538 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
540 /* Inner loop uses 300 flops */
546 /* Get j neighbor index, and coordinate index */
552 /* Sign of each element will be negative for non-real atoms.
553 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
554 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
556 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
557 jnrA = (jnrA>=0) ? jnrA : 0;
558 jnrB = (jnrB>=0) ? jnrB : 0;
559 jnrC = (jnrC>=0) ? jnrC : 0;
560 jnrD = (jnrD>=0) ? jnrD : 0;
562 j_coord_offsetA = DIM*jnrA;
563 j_coord_offsetB = DIM*jnrB;
564 j_coord_offsetC = DIM*jnrC;
565 j_coord_offsetD = DIM*jnrD;
567 /* load j atom coordinates */
568 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
569 x+j_coord_offsetC,x+j_coord_offsetD,
570 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
572 /* Calculate displacement vector */
573 dx00 = _mm_sub_ps(ix0,jx0);
574 dy00 = _mm_sub_ps(iy0,jy0);
575 dz00 = _mm_sub_ps(iz0,jz0);
576 dx01 = _mm_sub_ps(ix0,jx1);
577 dy01 = _mm_sub_ps(iy0,jy1);
578 dz01 = _mm_sub_ps(iz0,jz1);
579 dx02 = _mm_sub_ps(ix0,jx2);
580 dy02 = _mm_sub_ps(iy0,jy2);
581 dz02 = _mm_sub_ps(iz0,jz2);
582 dx10 = _mm_sub_ps(ix1,jx0);
583 dy10 = _mm_sub_ps(iy1,jy0);
584 dz10 = _mm_sub_ps(iz1,jz0);
585 dx11 = _mm_sub_ps(ix1,jx1);
586 dy11 = _mm_sub_ps(iy1,jy1);
587 dz11 = _mm_sub_ps(iz1,jz1);
588 dx12 = _mm_sub_ps(ix1,jx2);
589 dy12 = _mm_sub_ps(iy1,jy2);
590 dz12 = _mm_sub_ps(iz1,jz2);
591 dx20 = _mm_sub_ps(ix2,jx0);
592 dy20 = _mm_sub_ps(iy2,jy0);
593 dz20 = _mm_sub_ps(iz2,jz0);
594 dx21 = _mm_sub_ps(ix2,jx1);
595 dy21 = _mm_sub_ps(iy2,jy1);
596 dz21 = _mm_sub_ps(iz2,jz1);
597 dx22 = _mm_sub_ps(ix2,jx2);
598 dy22 = _mm_sub_ps(iy2,jy2);
599 dz22 = _mm_sub_ps(iz2,jz2);
601 /* Calculate squared distance and things based on it */
602 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
603 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
604 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
605 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
606 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
607 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
608 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
609 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
610 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
612 rinv00 = gmx_mm_invsqrt_ps(rsq00);
613 rinv01 = gmx_mm_invsqrt_ps(rsq01);
614 rinv02 = gmx_mm_invsqrt_ps(rsq02);
615 rinv10 = gmx_mm_invsqrt_ps(rsq10);
616 rinv11 = gmx_mm_invsqrt_ps(rsq11);
617 rinv12 = gmx_mm_invsqrt_ps(rsq12);
618 rinv20 = gmx_mm_invsqrt_ps(rsq20);
619 rinv21 = gmx_mm_invsqrt_ps(rsq21);
620 rinv22 = gmx_mm_invsqrt_ps(rsq22);
622 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
623 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
624 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
625 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
626 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
627 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
628 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
629 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
630 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
632 fjx0 = _mm_setzero_ps();
633 fjy0 = _mm_setzero_ps();
634 fjz0 = _mm_setzero_ps();
635 fjx1 = _mm_setzero_ps();
636 fjy1 = _mm_setzero_ps();
637 fjz1 = _mm_setzero_ps();
638 fjx2 = _mm_setzero_ps();
639 fjy2 = _mm_setzero_ps();
640 fjz2 = _mm_setzero_ps();
642 /**************************
643 * CALCULATE INTERACTIONS *
644 **************************/
646 /* REACTION-FIELD ELECTROSTATICS */
647 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
648 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
650 /* LENNARD-JONES DISPERSION/REPULSION */
652 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
653 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
654 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
655 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
656 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velec = _mm_andnot_ps(dummy_mask,velec);
660 velecsum = _mm_add_ps(velecsum,velec);
661 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
662 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
664 fscal = _mm_add_ps(felec,fvdw);
666 fscal = _mm_andnot_ps(dummy_mask,fscal);
668 /* Calculate temporary vectorial force */
669 tx = _mm_mul_ps(fscal,dx00);
670 ty = _mm_mul_ps(fscal,dy00);
671 tz = _mm_mul_ps(fscal,dz00);
673 /* Update vectorial force */
674 fix0 = _mm_add_ps(fix0,tx);
675 fiy0 = _mm_add_ps(fiy0,ty);
676 fiz0 = _mm_add_ps(fiz0,tz);
678 fjx0 = _mm_add_ps(fjx0,tx);
679 fjy0 = _mm_add_ps(fjy0,ty);
680 fjz0 = _mm_add_ps(fjz0,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 /* REACTION-FIELD ELECTROSTATICS */
687 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
688 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec = _mm_andnot_ps(dummy_mask,velec);
692 velecsum = _mm_add_ps(velecsum,velec);
696 fscal = _mm_andnot_ps(dummy_mask,fscal);
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_ps(fscal,dx01);
700 ty = _mm_mul_ps(fscal,dy01);
701 tz = _mm_mul_ps(fscal,dz01);
703 /* Update vectorial force */
704 fix0 = _mm_add_ps(fix0,tx);
705 fiy0 = _mm_add_ps(fiy0,ty);
706 fiz0 = _mm_add_ps(fiz0,tz);
708 fjx1 = _mm_add_ps(fjx1,tx);
709 fjy1 = _mm_add_ps(fjy1,ty);
710 fjz1 = _mm_add_ps(fjz1,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 /* REACTION-FIELD ELECTROSTATICS */
717 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
718 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
720 /* Update potential sum for this i atom from the interaction with this j atom. */
721 velec = _mm_andnot_ps(dummy_mask,velec);
722 velecsum = _mm_add_ps(velecsum,velec);
726 fscal = _mm_andnot_ps(dummy_mask,fscal);
728 /* Calculate temporary vectorial force */
729 tx = _mm_mul_ps(fscal,dx02);
730 ty = _mm_mul_ps(fscal,dy02);
731 tz = _mm_mul_ps(fscal,dz02);
733 /* Update vectorial force */
734 fix0 = _mm_add_ps(fix0,tx);
735 fiy0 = _mm_add_ps(fiy0,ty);
736 fiz0 = _mm_add_ps(fiz0,tz);
738 fjx2 = _mm_add_ps(fjx2,tx);
739 fjy2 = _mm_add_ps(fjy2,ty);
740 fjz2 = _mm_add_ps(fjz2,tz);
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 /* REACTION-FIELD ELECTROSTATICS */
747 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
748 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec = _mm_andnot_ps(dummy_mask,velec);
752 velecsum = _mm_add_ps(velecsum,velec);
756 fscal = _mm_andnot_ps(dummy_mask,fscal);
758 /* Calculate temporary vectorial force */
759 tx = _mm_mul_ps(fscal,dx10);
760 ty = _mm_mul_ps(fscal,dy10);
761 tz = _mm_mul_ps(fscal,dz10);
763 /* Update vectorial force */
764 fix1 = _mm_add_ps(fix1,tx);
765 fiy1 = _mm_add_ps(fiy1,ty);
766 fiz1 = _mm_add_ps(fiz1,tz);
768 fjx0 = _mm_add_ps(fjx0,tx);
769 fjy0 = _mm_add_ps(fjy0,ty);
770 fjz0 = _mm_add_ps(fjz0,tz);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 /* REACTION-FIELD ELECTROSTATICS */
777 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
778 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
780 /* Update potential sum for this i atom from the interaction with this j atom. */
781 velec = _mm_andnot_ps(dummy_mask,velec);
782 velecsum = _mm_add_ps(velecsum,velec);
786 fscal = _mm_andnot_ps(dummy_mask,fscal);
788 /* Calculate temporary vectorial force */
789 tx = _mm_mul_ps(fscal,dx11);
790 ty = _mm_mul_ps(fscal,dy11);
791 tz = _mm_mul_ps(fscal,dz11);
793 /* Update vectorial force */
794 fix1 = _mm_add_ps(fix1,tx);
795 fiy1 = _mm_add_ps(fiy1,ty);
796 fiz1 = _mm_add_ps(fiz1,tz);
798 fjx1 = _mm_add_ps(fjx1,tx);
799 fjy1 = _mm_add_ps(fjy1,ty);
800 fjz1 = _mm_add_ps(fjz1,tz);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 /* REACTION-FIELD ELECTROSTATICS */
807 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
808 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm_andnot_ps(dummy_mask,velec);
812 velecsum = _mm_add_ps(velecsum,velec);
816 fscal = _mm_andnot_ps(dummy_mask,fscal);
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_ps(fscal,dx12);
820 ty = _mm_mul_ps(fscal,dy12);
821 tz = _mm_mul_ps(fscal,dz12);
823 /* Update vectorial force */
824 fix1 = _mm_add_ps(fix1,tx);
825 fiy1 = _mm_add_ps(fiy1,ty);
826 fiz1 = _mm_add_ps(fiz1,tz);
828 fjx2 = _mm_add_ps(fjx2,tx);
829 fjy2 = _mm_add_ps(fjy2,ty);
830 fjz2 = _mm_add_ps(fjz2,tz);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 /* REACTION-FIELD ELECTROSTATICS */
837 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
838 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec = _mm_andnot_ps(dummy_mask,velec);
842 velecsum = _mm_add_ps(velecsum,velec);
846 fscal = _mm_andnot_ps(dummy_mask,fscal);
848 /* Calculate temporary vectorial force */
849 tx = _mm_mul_ps(fscal,dx20);
850 ty = _mm_mul_ps(fscal,dy20);
851 tz = _mm_mul_ps(fscal,dz20);
853 /* Update vectorial force */
854 fix2 = _mm_add_ps(fix2,tx);
855 fiy2 = _mm_add_ps(fiy2,ty);
856 fiz2 = _mm_add_ps(fiz2,tz);
858 fjx0 = _mm_add_ps(fjx0,tx);
859 fjy0 = _mm_add_ps(fjy0,ty);
860 fjz0 = _mm_add_ps(fjz0,tz);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 /* REACTION-FIELD ELECTROSTATICS */
867 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
868 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_andnot_ps(dummy_mask,velec);
872 velecsum = _mm_add_ps(velecsum,velec);
876 fscal = _mm_andnot_ps(dummy_mask,fscal);
878 /* Calculate temporary vectorial force */
879 tx = _mm_mul_ps(fscal,dx21);
880 ty = _mm_mul_ps(fscal,dy21);
881 tz = _mm_mul_ps(fscal,dz21);
883 /* Update vectorial force */
884 fix2 = _mm_add_ps(fix2,tx);
885 fiy2 = _mm_add_ps(fiy2,ty);
886 fiz2 = _mm_add_ps(fiz2,tz);
888 fjx1 = _mm_add_ps(fjx1,tx);
889 fjy1 = _mm_add_ps(fjy1,ty);
890 fjz1 = _mm_add_ps(fjz1,tz);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 /* REACTION-FIELD ELECTROSTATICS */
897 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
898 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec = _mm_andnot_ps(dummy_mask,velec);
902 velecsum = _mm_add_ps(velecsum,velec);
906 fscal = _mm_andnot_ps(dummy_mask,fscal);
908 /* Calculate temporary vectorial force */
909 tx = _mm_mul_ps(fscal,dx22);
910 ty = _mm_mul_ps(fscal,dy22);
911 tz = _mm_mul_ps(fscal,dz22);
913 /* Update vectorial force */
914 fix2 = _mm_add_ps(fix2,tx);
915 fiy2 = _mm_add_ps(fiy2,ty);
916 fiz2 = _mm_add_ps(fiz2,tz);
918 fjx2 = _mm_add_ps(fjx2,tx);
919 fjy2 = _mm_add_ps(fjy2,ty);
920 fjz2 = _mm_add_ps(fjz2,tz);
922 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
923 f+j_coord_offsetC,f+j_coord_offsetD,
924 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
926 /* Inner loop uses 300 flops */
929 /* End of innermost loop */
931 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
932 f+i_coord_offset,fshift+i_shift_offset);
935 /* Update potential energies */
936 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
937 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
939 /* Increment number of inner iterations */
940 inneriter += j_index_end - j_index_start;
942 /* Outer loop uses 29 flops */
945 /* Increment number of outer iterations */
948 /* Update outer/inner flops */
950 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*300);
953 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
954 * Electrostatics interaction: ReactionField
955 * VdW interaction: LennardJones
956 * Geometry: Water3-Water3
957 * Calculate force/pot: Force
960 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
961 (t_nblist * gmx_restrict nlist,
962 rvec * gmx_restrict xx,
963 rvec * gmx_restrict ff,
964 t_forcerec * gmx_restrict fr,
965 t_mdatoms * gmx_restrict mdatoms,
966 nb_kernel_data_t * gmx_restrict kernel_data,
967 t_nrnb * gmx_restrict nrnb)
969 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
970 * just 0 for non-waters.
971 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
972 * jnr indices corresponding to data put in the four positions in the SIMD register.
974 int i_shift_offset,i_coord_offset,outeriter,inneriter;
975 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
976 int jnrA,jnrB,jnrC,jnrD;
977 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
978 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
979 real shX,shY,shZ,rcutoff_scalar;
980 real *shiftvec,*fshift,*x,*f;
981 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
983 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
985 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
987 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
988 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
989 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
990 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
991 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
992 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
993 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
994 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
995 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
996 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
997 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
998 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
999 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1000 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1001 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1002 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1003 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1006 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1009 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1010 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1011 __m128 dummy_mask,cutoff_mask;
1012 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1013 __m128 one = _mm_set1_ps(1.0);
1014 __m128 two = _mm_set1_ps(2.0);
1020 jindex = nlist->jindex;
1022 shiftidx = nlist->shift;
1024 shiftvec = fr->shift_vec[0];
1025 fshift = fr->fshift[0];
1026 facel = _mm_set1_ps(fr->epsfac);
1027 charge = mdatoms->chargeA;
1028 krf = _mm_set1_ps(fr->ic->k_rf);
1029 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1030 crf = _mm_set1_ps(fr->ic->c_rf);
1031 nvdwtype = fr->ntype;
1032 vdwparam = fr->nbfp;
1033 vdwtype = mdatoms->typeA;
1035 /* Setup water-specific parameters */
1036 inr = nlist->iinr[0];
1037 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1038 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1039 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1040 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1042 jq0 = _mm_set1_ps(charge[inr+0]);
1043 jq1 = _mm_set1_ps(charge[inr+1]);
1044 jq2 = _mm_set1_ps(charge[inr+2]);
1045 vdwjidx0A = 2*vdwtype[inr+0];
1046 qq00 = _mm_mul_ps(iq0,jq0);
1047 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1048 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1049 qq01 = _mm_mul_ps(iq0,jq1);
1050 qq02 = _mm_mul_ps(iq0,jq2);
1051 qq10 = _mm_mul_ps(iq1,jq0);
1052 qq11 = _mm_mul_ps(iq1,jq1);
1053 qq12 = _mm_mul_ps(iq1,jq2);
1054 qq20 = _mm_mul_ps(iq2,jq0);
1055 qq21 = _mm_mul_ps(iq2,jq1);
1056 qq22 = _mm_mul_ps(iq2,jq2);
1058 /* Avoid stupid compiler warnings */
1059 jnrA = jnrB = jnrC = jnrD = 0;
1060 j_coord_offsetA = 0;
1061 j_coord_offsetB = 0;
1062 j_coord_offsetC = 0;
1063 j_coord_offsetD = 0;
1068 /* Start outer loop over neighborlists */
1069 for(iidx=0; iidx<nri; iidx++)
1071 /* Load shift vector for this list */
1072 i_shift_offset = DIM*shiftidx[iidx];
1073 shX = shiftvec[i_shift_offset+XX];
1074 shY = shiftvec[i_shift_offset+YY];
1075 shZ = shiftvec[i_shift_offset+ZZ];
1077 /* Load limits for loop over neighbors */
1078 j_index_start = jindex[iidx];
1079 j_index_end = jindex[iidx+1];
1081 /* Get outer coordinate index */
1083 i_coord_offset = DIM*inr;
1085 /* Load i particle coords and add shift vector */
1086 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1087 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1088 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1089 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1090 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1091 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1092 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1093 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1094 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1096 fix0 = _mm_setzero_ps();
1097 fiy0 = _mm_setzero_ps();
1098 fiz0 = _mm_setzero_ps();
1099 fix1 = _mm_setzero_ps();
1100 fiy1 = _mm_setzero_ps();
1101 fiz1 = _mm_setzero_ps();
1102 fix2 = _mm_setzero_ps();
1103 fiy2 = _mm_setzero_ps();
1104 fiz2 = _mm_setzero_ps();
1106 /* Start inner kernel loop */
1107 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1110 /* Get j neighbor index, and coordinate index */
1112 jnrB = jjnr[jidx+1];
1113 jnrC = jjnr[jidx+2];
1114 jnrD = jjnr[jidx+3];
1116 j_coord_offsetA = DIM*jnrA;
1117 j_coord_offsetB = DIM*jnrB;
1118 j_coord_offsetC = DIM*jnrC;
1119 j_coord_offsetD = DIM*jnrD;
1121 /* load j atom coordinates */
1122 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1123 x+j_coord_offsetC,x+j_coord_offsetD,
1124 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1126 /* Calculate displacement vector */
1127 dx00 = _mm_sub_ps(ix0,jx0);
1128 dy00 = _mm_sub_ps(iy0,jy0);
1129 dz00 = _mm_sub_ps(iz0,jz0);
1130 dx01 = _mm_sub_ps(ix0,jx1);
1131 dy01 = _mm_sub_ps(iy0,jy1);
1132 dz01 = _mm_sub_ps(iz0,jz1);
1133 dx02 = _mm_sub_ps(ix0,jx2);
1134 dy02 = _mm_sub_ps(iy0,jy2);
1135 dz02 = _mm_sub_ps(iz0,jz2);
1136 dx10 = _mm_sub_ps(ix1,jx0);
1137 dy10 = _mm_sub_ps(iy1,jy0);
1138 dz10 = _mm_sub_ps(iz1,jz0);
1139 dx11 = _mm_sub_ps(ix1,jx1);
1140 dy11 = _mm_sub_ps(iy1,jy1);
1141 dz11 = _mm_sub_ps(iz1,jz1);
1142 dx12 = _mm_sub_ps(ix1,jx2);
1143 dy12 = _mm_sub_ps(iy1,jy2);
1144 dz12 = _mm_sub_ps(iz1,jz2);
1145 dx20 = _mm_sub_ps(ix2,jx0);
1146 dy20 = _mm_sub_ps(iy2,jy0);
1147 dz20 = _mm_sub_ps(iz2,jz0);
1148 dx21 = _mm_sub_ps(ix2,jx1);
1149 dy21 = _mm_sub_ps(iy2,jy1);
1150 dz21 = _mm_sub_ps(iz2,jz1);
1151 dx22 = _mm_sub_ps(ix2,jx2);
1152 dy22 = _mm_sub_ps(iy2,jy2);
1153 dz22 = _mm_sub_ps(iz2,jz2);
1155 /* Calculate squared distance and things based on it */
1156 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1157 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1158 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1159 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1160 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1161 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1162 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1163 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1164 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1166 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1167 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1168 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1169 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1170 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1171 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1172 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1173 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1174 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1176 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1177 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1178 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1179 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1180 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1181 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1182 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1183 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1184 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1186 fjx0 = _mm_setzero_ps();
1187 fjy0 = _mm_setzero_ps();
1188 fjz0 = _mm_setzero_ps();
1189 fjx1 = _mm_setzero_ps();
1190 fjy1 = _mm_setzero_ps();
1191 fjz1 = _mm_setzero_ps();
1192 fjx2 = _mm_setzero_ps();
1193 fjy2 = _mm_setzero_ps();
1194 fjz2 = _mm_setzero_ps();
1196 /**************************
1197 * CALCULATE INTERACTIONS *
1198 **************************/
1200 /* REACTION-FIELD ELECTROSTATICS */
1201 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1203 /* LENNARD-JONES DISPERSION/REPULSION */
1205 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1206 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1208 fscal = _mm_add_ps(felec,fvdw);
1210 /* Calculate temporary vectorial force */
1211 tx = _mm_mul_ps(fscal,dx00);
1212 ty = _mm_mul_ps(fscal,dy00);
1213 tz = _mm_mul_ps(fscal,dz00);
1215 /* Update vectorial force */
1216 fix0 = _mm_add_ps(fix0,tx);
1217 fiy0 = _mm_add_ps(fiy0,ty);
1218 fiz0 = _mm_add_ps(fiz0,tz);
1220 fjx0 = _mm_add_ps(fjx0,tx);
1221 fjy0 = _mm_add_ps(fjy0,ty);
1222 fjz0 = _mm_add_ps(fjz0,tz);
1224 /**************************
1225 * CALCULATE INTERACTIONS *
1226 **************************/
1228 /* REACTION-FIELD ELECTROSTATICS */
1229 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1233 /* Calculate temporary vectorial force */
1234 tx = _mm_mul_ps(fscal,dx01);
1235 ty = _mm_mul_ps(fscal,dy01);
1236 tz = _mm_mul_ps(fscal,dz01);
1238 /* Update vectorial force */
1239 fix0 = _mm_add_ps(fix0,tx);
1240 fiy0 = _mm_add_ps(fiy0,ty);
1241 fiz0 = _mm_add_ps(fiz0,tz);
1243 fjx1 = _mm_add_ps(fjx1,tx);
1244 fjy1 = _mm_add_ps(fjy1,ty);
1245 fjz1 = _mm_add_ps(fjz1,tz);
1247 /**************************
1248 * CALCULATE INTERACTIONS *
1249 **************************/
1251 /* REACTION-FIELD ELECTROSTATICS */
1252 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1256 /* Calculate temporary vectorial force */
1257 tx = _mm_mul_ps(fscal,dx02);
1258 ty = _mm_mul_ps(fscal,dy02);
1259 tz = _mm_mul_ps(fscal,dz02);
1261 /* Update vectorial force */
1262 fix0 = _mm_add_ps(fix0,tx);
1263 fiy0 = _mm_add_ps(fiy0,ty);
1264 fiz0 = _mm_add_ps(fiz0,tz);
1266 fjx2 = _mm_add_ps(fjx2,tx);
1267 fjy2 = _mm_add_ps(fjy2,ty);
1268 fjz2 = _mm_add_ps(fjz2,tz);
1270 /**************************
1271 * CALCULATE INTERACTIONS *
1272 **************************/
1274 /* REACTION-FIELD ELECTROSTATICS */
1275 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1279 /* Calculate temporary vectorial force */
1280 tx = _mm_mul_ps(fscal,dx10);
1281 ty = _mm_mul_ps(fscal,dy10);
1282 tz = _mm_mul_ps(fscal,dz10);
1284 /* Update vectorial force */
1285 fix1 = _mm_add_ps(fix1,tx);
1286 fiy1 = _mm_add_ps(fiy1,ty);
1287 fiz1 = _mm_add_ps(fiz1,tz);
1289 fjx0 = _mm_add_ps(fjx0,tx);
1290 fjy0 = _mm_add_ps(fjy0,ty);
1291 fjz0 = _mm_add_ps(fjz0,tz);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* REACTION-FIELD ELECTROSTATICS */
1298 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1302 /* Calculate temporary vectorial force */
1303 tx = _mm_mul_ps(fscal,dx11);
1304 ty = _mm_mul_ps(fscal,dy11);
1305 tz = _mm_mul_ps(fscal,dz11);
1307 /* Update vectorial force */
1308 fix1 = _mm_add_ps(fix1,tx);
1309 fiy1 = _mm_add_ps(fiy1,ty);
1310 fiz1 = _mm_add_ps(fiz1,tz);
1312 fjx1 = _mm_add_ps(fjx1,tx);
1313 fjy1 = _mm_add_ps(fjy1,ty);
1314 fjz1 = _mm_add_ps(fjz1,tz);
1316 /**************************
1317 * CALCULATE INTERACTIONS *
1318 **************************/
1320 /* REACTION-FIELD ELECTROSTATICS */
1321 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1325 /* Calculate temporary vectorial force */
1326 tx = _mm_mul_ps(fscal,dx12);
1327 ty = _mm_mul_ps(fscal,dy12);
1328 tz = _mm_mul_ps(fscal,dz12);
1330 /* Update vectorial force */
1331 fix1 = _mm_add_ps(fix1,tx);
1332 fiy1 = _mm_add_ps(fiy1,ty);
1333 fiz1 = _mm_add_ps(fiz1,tz);
1335 fjx2 = _mm_add_ps(fjx2,tx);
1336 fjy2 = _mm_add_ps(fjy2,ty);
1337 fjz2 = _mm_add_ps(fjz2,tz);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 /* REACTION-FIELD ELECTROSTATICS */
1344 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1348 /* Calculate temporary vectorial force */
1349 tx = _mm_mul_ps(fscal,dx20);
1350 ty = _mm_mul_ps(fscal,dy20);
1351 tz = _mm_mul_ps(fscal,dz20);
1353 /* Update vectorial force */
1354 fix2 = _mm_add_ps(fix2,tx);
1355 fiy2 = _mm_add_ps(fiy2,ty);
1356 fiz2 = _mm_add_ps(fiz2,tz);
1358 fjx0 = _mm_add_ps(fjx0,tx);
1359 fjy0 = _mm_add_ps(fjy0,ty);
1360 fjz0 = _mm_add_ps(fjz0,tz);
1362 /**************************
1363 * CALCULATE INTERACTIONS *
1364 **************************/
1366 /* REACTION-FIELD ELECTROSTATICS */
1367 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1371 /* Calculate temporary vectorial force */
1372 tx = _mm_mul_ps(fscal,dx21);
1373 ty = _mm_mul_ps(fscal,dy21);
1374 tz = _mm_mul_ps(fscal,dz21);
1376 /* Update vectorial force */
1377 fix2 = _mm_add_ps(fix2,tx);
1378 fiy2 = _mm_add_ps(fiy2,ty);
1379 fiz2 = _mm_add_ps(fiz2,tz);
1381 fjx1 = _mm_add_ps(fjx1,tx);
1382 fjy1 = _mm_add_ps(fjy1,ty);
1383 fjz1 = _mm_add_ps(fjz1,tz);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1389 /* REACTION-FIELD ELECTROSTATICS */
1390 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1394 /* Calculate temporary vectorial force */
1395 tx = _mm_mul_ps(fscal,dx22);
1396 ty = _mm_mul_ps(fscal,dy22);
1397 tz = _mm_mul_ps(fscal,dz22);
1399 /* Update vectorial force */
1400 fix2 = _mm_add_ps(fix2,tx);
1401 fiy2 = _mm_add_ps(fiy2,ty);
1402 fiz2 = _mm_add_ps(fiz2,tz);
1404 fjx2 = _mm_add_ps(fjx2,tx);
1405 fjy2 = _mm_add_ps(fjy2,ty);
1406 fjz2 = _mm_add_ps(fjz2,tz);
1408 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1409 f+j_coord_offsetC,f+j_coord_offsetD,
1410 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1412 /* Inner loop uses 250 flops */
1415 if(jidx<j_index_end)
1418 /* Get j neighbor index, and coordinate index */
1420 jnrB = jjnr[jidx+1];
1421 jnrC = jjnr[jidx+2];
1422 jnrD = jjnr[jidx+3];
1424 /* Sign of each element will be negative for non-real atoms.
1425 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1426 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1428 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1429 jnrA = (jnrA>=0) ? jnrA : 0;
1430 jnrB = (jnrB>=0) ? jnrB : 0;
1431 jnrC = (jnrC>=0) ? jnrC : 0;
1432 jnrD = (jnrD>=0) ? jnrD : 0;
1434 j_coord_offsetA = DIM*jnrA;
1435 j_coord_offsetB = DIM*jnrB;
1436 j_coord_offsetC = DIM*jnrC;
1437 j_coord_offsetD = DIM*jnrD;
1439 /* load j atom coordinates */
1440 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1441 x+j_coord_offsetC,x+j_coord_offsetD,
1442 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1444 /* Calculate displacement vector */
1445 dx00 = _mm_sub_ps(ix0,jx0);
1446 dy00 = _mm_sub_ps(iy0,jy0);
1447 dz00 = _mm_sub_ps(iz0,jz0);
1448 dx01 = _mm_sub_ps(ix0,jx1);
1449 dy01 = _mm_sub_ps(iy0,jy1);
1450 dz01 = _mm_sub_ps(iz0,jz1);
1451 dx02 = _mm_sub_ps(ix0,jx2);
1452 dy02 = _mm_sub_ps(iy0,jy2);
1453 dz02 = _mm_sub_ps(iz0,jz2);
1454 dx10 = _mm_sub_ps(ix1,jx0);
1455 dy10 = _mm_sub_ps(iy1,jy0);
1456 dz10 = _mm_sub_ps(iz1,jz0);
1457 dx11 = _mm_sub_ps(ix1,jx1);
1458 dy11 = _mm_sub_ps(iy1,jy1);
1459 dz11 = _mm_sub_ps(iz1,jz1);
1460 dx12 = _mm_sub_ps(ix1,jx2);
1461 dy12 = _mm_sub_ps(iy1,jy2);
1462 dz12 = _mm_sub_ps(iz1,jz2);
1463 dx20 = _mm_sub_ps(ix2,jx0);
1464 dy20 = _mm_sub_ps(iy2,jy0);
1465 dz20 = _mm_sub_ps(iz2,jz0);
1466 dx21 = _mm_sub_ps(ix2,jx1);
1467 dy21 = _mm_sub_ps(iy2,jy1);
1468 dz21 = _mm_sub_ps(iz2,jz1);
1469 dx22 = _mm_sub_ps(ix2,jx2);
1470 dy22 = _mm_sub_ps(iy2,jy2);
1471 dz22 = _mm_sub_ps(iz2,jz2);
1473 /* Calculate squared distance and things based on it */
1474 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1475 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1476 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1477 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1478 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1479 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1480 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1481 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1482 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1484 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1485 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1486 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1487 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1488 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1489 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1490 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1491 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1492 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1494 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1495 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1496 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1497 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1498 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1499 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1500 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1501 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1502 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1504 fjx0 = _mm_setzero_ps();
1505 fjy0 = _mm_setzero_ps();
1506 fjz0 = _mm_setzero_ps();
1507 fjx1 = _mm_setzero_ps();
1508 fjy1 = _mm_setzero_ps();
1509 fjz1 = _mm_setzero_ps();
1510 fjx2 = _mm_setzero_ps();
1511 fjy2 = _mm_setzero_ps();
1512 fjz2 = _mm_setzero_ps();
1514 /**************************
1515 * CALCULATE INTERACTIONS *
1516 **************************/
1518 /* REACTION-FIELD ELECTROSTATICS */
1519 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1521 /* LENNARD-JONES DISPERSION/REPULSION */
1523 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1524 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1526 fscal = _mm_add_ps(felec,fvdw);
1528 fscal = _mm_andnot_ps(dummy_mask,fscal);
1530 /* Calculate temporary vectorial force */
1531 tx = _mm_mul_ps(fscal,dx00);
1532 ty = _mm_mul_ps(fscal,dy00);
1533 tz = _mm_mul_ps(fscal,dz00);
1535 /* Update vectorial force */
1536 fix0 = _mm_add_ps(fix0,tx);
1537 fiy0 = _mm_add_ps(fiy0,ty);
1538 fiz0 = _mm_add_ps(fiz0,tz);
1540 fjx0 = _mm_add_ps(fjx0,tx);
1541 fjy0 = _mm_add_ps(fjy0,ty);
1542 fjz0 = _mm_add_ps(fjz0,tz);
1544 /**************************
1545 * CALCULATE INTERACTIONS *
1546 **************************/
1548 /* REACTION-FIELD ELECTROSTATICS */
1549 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1553 fscal = _mm_andnot_ps(dummy_mask,fscal);
1555 /* Calculate temporary vectorial force */
1556 tx = _mm_mul_ps(fscal,dx01);
1557 ty = _mm_mul_ps(fscal,dy01);
1558 tz = _mm_mul_ps(fscal,dz01);
1560 /* Update vectorial force */
1561 fix0 = _mm_add_ps(fix0,tx);
1562 fiy0 = _mm_add_ps(fiy0,ty);
1563 fiz0 = _mm_add_ps(fiz0,tz);
1565 fjx1 = _mm_add_ps(fjx1,tx);
1566 fjy1 = _mm_add_ps(fjy1,ty);
1567 fjz1 = _mm_add_ps(fjz1,tz);
1569 /**************************
1570 * CALCULATE INTERACTIONS *
1571 **************************/
1573 /* REACTION-FIELD ELECTROSTATICS */
1574 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1578 fscal = _mm_andnot_ps(dummy_mask,fscal);
1580 /* Calculate temporary vectorial force */
1581 tx = _mm_mul_ps(fscal,dx02);
1582 ty = _mm_mul_ps(fscal,dy02);
1583 tz = _mm_mul_ps(fscal,dz02);
1585 /* Update vectorial force */
1586 fix0 = _mm_add_ps(fix0,tx);
1587 fiy0 = _mm_add_ps(fiy0,ty);
1588 fiz0 = _mm_add_ps(fiz0,tz);
1590 fjx2 = _mm_add_ps(fjx2,tx);
1591 fjy2 = _mm_add_ps(fjy2,ty);
1592 fjz2 = _mm_add_ps(fjz2,tz);
1594 /**************************
1595 * CALCULATE INTERACTIONS *
1596 **************************/
1598 /* REACTION-FIELD ELECTROSTATICS */
1599 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1603 fscal = _mm_andnot_ps(dummy_mask,fscal);
1605 /* Calculate temporary vectorial force */
1606 tx = _mm_mul_ps(fscal,dx10);
1607 ty = _mm_mul_ps(fscal,dy10);
1608 tz = _mm_mul_ps(fscal,dz10);
1610 /* Update vectorial force */
1611 fix1 = _mm_add_ps(fix1,tx);
1612 fiy1 = _mm_add_ps(fiy1,ty);
1613 fiz1 = _mm_add_ps(fiz1,tz);
1615 fjx0 = _mm_add_ps(fjx0,tx);
1616 fjy0 = _mm_add_ps(fjy0,ty);
1617 fjz0 = _mm_add_ps(fjz0,tz);
1619 /**************************
1620 * CALCULATE INTERACTIONS *
1621 **************************/
1623 /* REACTION-FIELD ELECTROSTATICS */
1624 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1628 fscal = _mm_andnot_ps(dummy_mask,fscal);
1630 /* Calculate temporary vectorial force */
1631 tx = _mm_mul_ps(fscal,dx11);
1632 ty = _mm_mul_ps(fscal,dy11);
1633 tz = _mm_mul_ps(fscal,dz11);
1635 /* Update vectorial force */
1636 fix1 = _mm_add_ps(fix1,tx);
1637 fiy1 = _mm_add_ps(fiy1,ty);
1638 fiz1 = _mm_add_ps(fiz1,tz);
1640 fjx1 = _mm_add_ps(fjx1,tx);
1641 fjy1 = _mm_add_ps(fjy1,ty);
1642 fjz1 = _mm_add_ps(fjz1,tz);
1644 /**************************
1645 * CALCULATE INTERACTIONS *
1646 **************************/
1648 /* REACTION-FIELD ELECTROSTATICS */
1649 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1653 fscal = _mm_andnot_ps(dummy_mask,fscal);
1655 /* Calculate temporary vectorial force */
1656 tx = _mm_mul_ps(fscal,dx12);
1657 ty = _mm_mul_ps(fscal,dy12);
1658 tz = _mm_mul_ps(fscal,dz12);
1660 /* Update vectorial force */
1661 fix1 = _mm_add_ps(fix1,tx);
1662 fiy1 = _mm_add_ps(fiy1,ty);
1663 fiz1 = _mm_add_ps(fiz1,tz);
1665 fjx2 = _mm_add_ps(fjx2,tx);
1666 fjy2 = _mm_add_ps(fjy2,ty);
1667 fjz2 = _mm_add_ps(fjz2,tz);
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 /* REACTION-FIELD ELECTROSTATICS */
1674 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1678 fscal = _mm_andnot_ps(dummy_mask,fscal);
1680 /* Calculate temporary vectorial force */
1681 tx = _mm_mul_ps(fscal,dx20);
1682 ty = _mm_mul_ps(fscal,dy20);
1683 tz = _mm_mul_ps(fscal,dz20);
1685 /* Update vectorial force */
1686 fix2 = _mm_add_ps(fix2,tx);
1687 fiy2 = _mm_add_ps(fiy2,ty);
1688 fiz2 = _mm_add_ps(fiz2,tz);
1690 fjx0 = _mm_add_ps(fjx0,tx);
1691 fjy0 = _mm_add_ps(fjy0,ty);
1692 fjz0 = _mm_add_ps(fjz0,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 /* REACTION-FIELD ELECTROSTATICS */
1699 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1703 fscal = _mm_andnot_ps(dummy_mask,fscal);
1705 /* Calculate temporary vectorial force */
1706 tx = _mm_mul_ps(fscal,dx21);
1707 ty = _mm_mul_ps(fscal,dy21);
1708 tz = _mm_mul_ps(fscal,dz21);
1710 /* Update vectorial force */
1711 fix2 = _mm_add_ps(fix2,tx);
1712 fiy2 = _mm_add_ps(fiy2,ty);
1713 fiz2 = _mm_add_ps(fiz2,tz);
1715 fjx1 = _mm_add_ps(fjx1,tx);
1716 fjy1 = _mm_add_ps(fjy1,ty);
1717 fjz1 = _mm_add_ps(fjz1,tz);
1719 /**************************
1720 * CALCULATE INTERACTIONS *
1721 **************************/
1723 /* REACTION-FIELD ELECTROSTATICS */
1724 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1728 fscal = _mm_andnot_ps(dummy_mask,fscal);
1730 /* Calculate temporary vectorial force */
1731 tx = _mm_mul_ps(fscal,dx22);
1732 ty = _mm_mul_ps(fscal,dy22);
1733 tz = _mm_mul_ps(fscal,dz22);
1735 /* Update vectorial force */
1736 fix2 = _mm_add_ps(fix2,tx);
1737 fiy2 = _mm_add_ps(fiy2,ty);
1738 fiz2 = _mm_add_ps(fiz2,tz);
1740 fjx2 = _mm_add_ps(fjx2,tx);
1741 fjy2 = _mm_add_ps(fjy2,ty);
1742 fjz2 = _mm_add_ps(fjz2,tz);
1744 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1745 f+j_coord_offsetC,f+j_coord_offsetD,
1746 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1748 /* Inner loop uses 250 flops */
1751 /* End of innermost loop */
1753 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1754 f+i_coord_offset,fshift+i_shift_offset);
1756 /* Increment number of inner iterations */
1757 inneriter += j_index_end - j_index_start;
1759 /* Outer loop uses 27 flops */
1762 /* Increment number of outer iterations */
1765 /* Update outer/inner flops */
1767 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);