2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
98 __m128 dummy_mask,cutoff_mask;
99 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
100 __m128 one = _mm_set1_ps(1.0);
101 __m128 two = _mm_set1_ps(2.0);
107 jindex = nlist->jindex;
109 shiftidx = nlist->shift;
111 shiftvec = fr->shift_vec[0];
112 fshift = fr->fshift[0];
113 facel = _mm_set1_ps(fr->epsfac);
114 charge = mdatoms->chargeA;
115 krf = _mm_set1_ps(fr->ic->k_rf);
116 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
117 crf = _mm_set1_ps(fr->ic->c_rf);
118 nvdwtype = fr->ntype;
120 vdwtype = mdatoms->typeA;
122 /* Setup water-specific parameters */
123 inr = nlist->iinr[0];
124 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
125 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
126 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
127 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
129 jq0 = _mm_set1_ps(charge[inr+0]);
130 jq1 = _mm_set1_ps(charge[inr+1]);
131 jq2 = _mm_set1_ps(charge[inr+2]);
132 vdwjidx0A = 2*vdwtype[inr+0];
133 qq00 = _mm_mul_ps(iq0,jq0);
134 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
135 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
136 qq01 = _mm_mul_ps(iq0,jq1);
137 qq02 = _mm_mul_ps(iq0,jq2);
138 qq10 = _mm_mul_ps(iq1,jq0);
139 qq11 = _mm_mul_ps(iq1,jq1);
140 qq12 = _mm_mul_ps(iq1,jq2);
141 qq20 = _mm_mul_ps(iq2,jq0);
142 qq21 = _mm_mul_ps(iq2,jq1);
143 qq22 = _mm_mul_ps(iq2,jq2);
145 /* Avoid stupid compiler warnings */
146 jnrA = jnrB = jnrC = jnrD = 0;
155 for(iidx=0;iidx<4*DIM;iidx++)
160 /* Start outer loop over neighborlists */
161 for(iidx=0; iidx<nri; iidx++)
163 /* Load shift vector for this list */
164 i_shift_offset = DIM*shiftidx[iidx];
166 /* Load limits for loop over neighbors */
167 j_index_start = jindex[iidx];
168 j_index_end = jindex[iidx+1];
170 /* Get outer coordinate index */
172 i_coord_offset = DIM*inr;
174 /* Load i particle coords and add shift vector */
175 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
176 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
178 fix0 = _mm_setzero_ps();
179 fiy0 = _mm_setzero_ps();
180 fiz0 = _mm_setzero_ps();
181 fix1 = _mm_setzero_ps();
182 fiy1 = _mm_setzero_ps();
183 fiz1 = _mm_setzero_ps();
184 fix2 = _mm_setzero_ps();
185 fiy2 = _mm_setzero_ps();
186 fiz2 = _mm_setzero_ps();
188 /* Reset potential sums */
189 velecsum = _mm_setzero_ps();
190 vvdwsum = _mm_setzero_ps();
192 /* Start inner kernel loop */
193 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
196 /* Get j neighbor index, and coordinate index */
201 j_coord_offsetA = DIM*jnrA;
202 j_coord_offsetB = DIM*jnrB;
203 j_coord_offsetC = DIM*jnrC;
204 j_coord_offsetD = DIM*jnrD;
206 /* load j atom coordinates */
207 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
208 x+j_coord_offsetC,x+j_coord_offsetD,
209 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
211 /* Calculate displacement vector */
212 dx00 = _mm_sub_ps(ix0,jx0);
213 dy00 = _mm_sub_ps(iy0,jy0);
214 dz00 = _mm_sub_ps(iz0,jz0);
215 dx01 = _mm_sub_ps(ix0,jx1);
216 dy01 = _mm_sub_ps(iy0,jy1);
217 dz01 = _mm_sub_ps(iz0,jz1);
218 dx02 = _mm_sub_ps(ix0,jx2);
219 dy02 = _mm_sub_ps(iy0,jy2);
220 dz02 = _mm_sub_ps(iz0,jz2);
221 dx10 = _mm_sub_ps(ix1,jx0);
222 dy10 = _mm_sub_ps(iy1,jy0);
223 dz10 = _mm_sub_ps(iz1,jz0);
224 dx11 = _mm_sub_ps(ix1,jx1);
225 dy11 = _mm_sub_ps(iy1,jy1);
226 dz11 = _mm_sub_ps(iz1,jz1);
227 dx12 = _mm_sub_ps(ix1,jx2);
228 dy12 = _mm_sub_ps(iy1,jy2);
229 dz12 = _mm_sub_ps(iz1,jz2);
230 dx20 = _mm_sub_ps(ix2,jx0);
231 dy20 = _mm_sub_ps(iy2,jy0);
232 dz20 = _mm_sub_ps(iz2,jz0);
233 dx21 = _mm_sub_ps(ix2,jx1);
234 dy21 = _mm_sub_ps(iy2,jy1);
235 dz21 = _mm_sub_ps(iz2,jz1);
236 dx22 = _mm_sub_ps(ix2,jx2);
237 dy22 = _mm_sub_ps(iy2,jy2);
238 dz22 = _mm_sub_ps(iz2,jz2);
240 /* Calculate squared distance and things based on it */
241 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
242 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
243 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
244 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
245 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
246 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
247 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
248 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
249 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
251 rinv00 = gmx_mm_invsqrt_ps(rsq00);
252 rinv01 = gmx_mm_invsqrt_ps(rsq01);
253 rinv02 = gmx_mm_invsqrt_ps(rsq02);
254 rinv10 = gmx_mm_invsqrt_ps(rsq10);
255 rinv11 = gmx_mm_invsqrt_ps(rsq11);
256 rinv12 = gmx_mm_invsqrt_ps(rsq12);
257 rinv20 = gmx_mm_invsqrt_ps(rsq20);
258 rinv21 = gmx_mm_invsqrt_ps(rsq21);
259 rinv22 = gmx_mm_invsqrt_ps(rsq22);
261 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
262 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
263 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
264 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
265 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
266 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
267 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
268 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
269 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
271 fjx0 = _mm_setzero_ps();
272 fjy0 = _mm_setzero_ps();
273 fjz0 = _mm_setzero_ps();
274 fjx1 = _mm_setzero_ps();
275 fjy1 = _mm_setzero_ps();
276 fjz1 = _mm_setzero_ps();
277 fjx2 = _mm_setzero_ps();
278 fjy2 = _mm_setzero_ps();
279 fjz2 = _mm_setzero_ps();
281 /**************************
282 * CALCULATE INTERACTIONS *
283 **************************/
285 /* REACTION-FIELD ELECTROSTATICS */
286 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
287 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
289 /* LENNARD-JONES DISPERSION/REPULSION */
291 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
292 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
293 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
294 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
295 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
297 /* Update potential sum for this i atom from the interaction with this j atom. */
298 velecsum = _mm_add_ps(velecsum,velec);
299 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
301 fscal = _mm_add_ps(felec,fvdw);
303 /* Calculate temporary vectorial force */
304 tx = _mm_mul_ps(fscal,dx00);
305 ty = _mm_mul_ps(fscal,dy00);
306 tz = _mm_mul_ps(fscal,dz00);
308 /* Update vectorial force */
309 fix0 = _mm_add_ps(fix0,tx);
310 fiy0 = _mm_add_ps(fiy0,ty);
311 fiz0 = _mm_add_ps(fiz0,tz);
313 fjx0 = _mm_add_ps(fjx0,tx);
314 fjy0 = _mm_add_ps(fjy0,ty);
315 fjz0 = _mm_add_ps(fjz0,tz);
317 /**************************
318 * CALCULATE INTERACTIONS *
319 **************************/
321 /* REACTION-FIELD ELECTROSTATICS */
322 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
323 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
325 /* Update potential sum for this i atom from the interaction with this j atom. */
326 velecsum = _mm_add_ps(velecsum,velec);
330 /* Calculate temporary vectorial force */
331 tx = _mm_mul_ps(fscal,dx01);
332 ty = _mm_mul_ps(fscal,dy01);
333 tz = _mm_mul_ps(fscal,dz01);
335 /* Update vectorial force */
336 fix0 = _mm_add_ps(fix0,tx);
337 fiy0 = _mm_add_ps(fiy0,ty);
338 fiz0 = _mm_add_ps(fiz0,tz);
340 fjx1 = _mm_add_ps(fjx1,tx);
341 fjy1 = _mm_add_ps(fjy1,ty);
342 fjz1 = _mm_add_ps(fjz1,tz);
344 /**************************
345 * CALCULATE INTERACTIONS *
346 **************************/
348 /* REACTION-FIELD ELECTROSTATICS */
349 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
350 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velecsum = _mm_add_ps(velecsum,velec);
357 /* Calculate temporary vectorial force */
358 tx = _mm_mul_ps(fscal,dx02);
359 ty = _mm_mul_ps(fscal,dy02);
360 tz = _mm_mul_ps(fscal,dz02);
362 /* Update vectorial force */
363 fix0 = _mm_add_ps(fix0,tx);
364 fiy0 = _mm_add_ps(fiy0,ty);
365 fiz0 = _mm_add_ps(fiz0,tz);
367 fjx2 = _mm_add_ps(fjx2,tx);
368 fjy2 = _mm_add_ps(fjy2,ty);
369 fjz2 = _mm_add_ps(fjz2,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 /* REACTION-FIELD ELECTROSTATICS */
376 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
377 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velecsum = _mm_add_ps(velecsum,velec);
384 /* Calculate temporary vectorial force */
385 tx = _mm_mul_ps(fscal,dx10);
386 ty = _mm_mul_ps(fscal,dy10);
387 tz = _mm_mul_ps(fscal,dz10);
389 /* Update vectorial force */
390 fix1 = _mm_add_ps(fix1,tx);
391 fiy1 = _mm_add_ps(fiy1,ty);
392 fiz1 = _mm_add_ps(fiz1,tz);
394 fjx0 = _mm_add_ps(fjx0,tx);
395 fjy0 = _mm_add_ps(fjy0,ty);
396 fjz0 = _mm_add_ps(fjz0,tz);
398 /**************************
399 * CALCULATE INTERACTIONS *
400 **************************/
402 /* REACTION-FIELD ELECTROSTATICS */
403 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
404 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velecsum = _mm_add_ps(velecsum,velec);
411 /* Calculate temporary vectorial force */
412 tx = _mm_mul_ps(fscal,dx11);
413 ty = _mm_mul_ps(fscal,dy11);
414 tz = _mm_mul_ps(fscal,dz11);
416 /* Update vectorial force */
417 fix1 = _mm_add_ps(fix1,tx);
418 fiy1 = _mm_add_ps(fiy1,ty);
419 fiz1 = _mm_add_ps(fiz1,tz);
421 fjx1 = _mm_add_ps(fjx1,tx);
422 fjy1 = _mm_add_ps(fjy1,ty);
423 fjz1 = _mm_add_ps(fjz1,tz);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 /* REACTION-FIELD ELECTROSTATICS */
430 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
431 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velecsum = _mm_add_ps(velecsum,velec);
438 /* Calculate temporary vectorial force */
439 tx = _mm_mul_ps(fscal,dx12);
440 ty = _mm_mul_ps(fscal,dy12);
441 tz = _mm_mul_ps(fscal,dz12);
443 /* Update vectorial force */
444 fix1 = _mm_add_ps(fix1,tx);
445 fiy1 = _mm_add_ps(fiy1,ty);
446 fiz1 = _mm_add_ps(fiz1,tz);
448 fjx2 = _mm_add_ps(fjx2,tx);
449 fjy2 = _mm_add_ps(fjy2,ty);
450 fjz2 = _mm_add_ps(fjz2,tz);
452 /**************************
453 * CALCULATE INTERACTIONS *
454 **************************/
456 /* REACTION-FIELD ELECTROSTATICS */
457 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
458 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
460 /* Update potential sum for this i atom from the interaction with this j atom. */
461 velecsum = _mm_add_ps(velecsum,velec);
465 /* Calculate temporary vectorial force */
466 tx = _mm_mul_ps(fscal,dx20);
467 ty = _mm_mul_ps(fscal,dy20);
468 tz = _mm_mul_ps(fscal,dz20);
470 /* Update vectorial force */
471 fix2 = _mm_add_ps(fix2,tx);
472 fiy2 = _mm_add_ps(fiy2,ty);
473 fiz2 = _mm_add_ps(fiz2,tz);
475 fjx0 = _mm_add_ps(fjx0,tx);
476 fjy0 = _mm_add_ps(fjy0,ty);
477 fjz0 = _mm_add_ps(fjz0,tz);
479 /**************************
480 * CALCULATE INTERACTIONS *
481 **************************/
483 /* REACTION-FIELD ELECTROSTATICS */
484 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
485 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velecsum = _mm_add_ps(velecsum,velec);
492 /* Calculate temporary vectorial force */
493 tx = _mm_mul_ps(fscal,dx21);
494 ty = _mm_mul_ps(fscal,dy21);
495 tz = _mm_mul_ps(fscal,dz21);
497 /* Update vectorial force */
498 fix2 = _mm_add_ps(fix2,tx);
499 fiy2 = _mm_add_ps(fiy2,ty);
500 fiz2 = _mm_add_ps(fiz2,tz);
502 fjx1 = _mm_add_ps(fjx1,tx);
503 fjy1 = _mm_add_ps(fjy1,ty);
504 fjz1 = _mm_add_ps(fjz1,tz);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 /* REACTION-FIELD ELECTROSTATICS */
511 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
512 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velecsum = _mm_add_ps(velecsum,velec);
519 /* Calculate temporary vectorial force */
520 tx = _mm_mul_ps(fscal,dx22);
521 ty = _mm_mul_ps(fscal,dy22);
522 tz = _mm_mul_ps(fscal,dz22);
524 /* Update vectorial force */
525 fix2 = _mm_add_ps(fix2,tx);
526 fiy2 = _mm_add_ps(fiy2,ty);
527 fiz2 = _mm_add_ps(fiz2,tz);
529 fjx2 = _mm_add_ps(fjx2,tx);
530 fjy2 = _mm_add_ps(fjy2,ty);
531 fjz2 = _mm_add_ps(fjz2,tz);
533 fjptrA = f+j_coord_offsetA;
534 fjptrB = f+j_coord_offsetB;
535 fjptrC = f+j_coord_offsetC;
536 fjptrD = f+j_coord_offsetD;
538 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
539 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
541 /* Inner loop uses 300 flops */
547 /* Get j neighbor index, and coordinate index */
548 jnrlistA = jjnr[jidx];
549 jnrlistB = jjnr[jidx+1];
550 jnrlistC = jjnr[jidx+2];
551 jnrlistD = jjnr[jidx+3];
552 /* Sign of each element will be negative for non-real atoms.
553 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
554 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
556 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
557 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
558 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
559 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
560 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
561 j_coord_offsetA = DIM*jnrA;
562 j_coord_offsetB = DIM*jnrB;
563 j_coord_offsetC = DIM*jnrC;
564 j_coord_offsetD = DIM*jnrD;
566 /* load j atom coordinates */
567 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
568 x+j_coord_offsetC,x+j_coord_offsetD,
569 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
571 /* Calculate displacement vector */
572 dx00 = _mm_sub_ps(ix0,jx0);
573 dy00 = _mm_sub_ps(iy0,jy0);
574 dz00 = _mm_sub_ps(iz0,jz0);
575 dx01 = _mm_sub_ps(ix0,jx1);
576 dy01 = _mm_sub_ps(iy0,jy1);
577 dz01 = _mm_sub_ps(iz0,jz1);
578 dx02 = _mm_sub_ps(ix0,jx2);
579 dy02 = _mm_sub_ps(iy0,jy2);
580 dz02 = _mm_sub_ps(iz0,jz2);
581 dx10 = _mm_sub_ps(ix1,jx0);
582 dy10 = _mm_sub_ps(iy1,jy0);
583 dz10 = _mm_sub_ps(iz1,jz0);
584 dx11 = _mm_sub_ps(ix1,jx1);
585 dy11 = _mm_sub_ps(iy1,jy1);
586 dz11 = _mm_sub_ps(iz1,jz1);
587 dx12 = _mm_sub_ps(ix1,jx2);
588 dy12 = _mm_sub_ps(iy1,jy2);
589 dz12 = _mm_sub_ps(iz1,jz2);
590 dx20 = _mm_sub_ps(ix2,jx0);
591 dy20 = _mm_sub_ps(iy2,jy0);
592 dz20 = _mm_sub_ps(iz2,jz0);
593 dx21 = _mm_sub_ps(ix2,jx1);
594 dy21 = _mm_sub_ps(iy2,jy1);
595 dz21 = _mm_sub_ps(iz2,jz1);
596 dx22 = _mm_sub_ps(ix2,jx2);
597 dy22 = _mm_sub_ps(iy2,jy2);
598 dz22 = _mm_sub_ps(iz2,jz2);
600 /* Calculate squared distance and things based on it */
601 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
602 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
603 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
604 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
605 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
606 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
607 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
608 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
609 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
611 rinv00 = gmx_mm_invsqrt_ps(rsq00);
612 rinv01 = gmx_mm_invsqrt_ps(rsq01);
613 rinv02 = gmx_mm_invsqrt_ps(rsq02);
614 rinv10 = gmx_mm_invsqrt_ps(rsq10);
615 rinv11 = gmx_mm_invsqrt_ps(rsq11);
616 rinv12 = gmx_mm_invsqrt_ps(rsq12);
617 rinv20 = gmx_mm_invsqrt_ps(rsq20);
618 rinv21 = gmx_mm_invsqrt_ps(rsq21);
619 rinv22 = gmx_mm_invsqrt_ps(rsq22);
621 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
622 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
623 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
624 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
625 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
626 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
627 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
628 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
629 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
631 fjx0 = _mm_setzero_ps();
632 fjy0 = _mm_setzero_ps();
633 fjz0 = _mm_setzero_ps();
634 fjx1 = _mm_setzero_ps();
635 fjy1 = _mm_setzero_ps();
636 fjz1 = _mm_setzero_ps();
637 fjx2 = _mm_setzero_ps();
638 fjy2 = _mm_setzero_ps();
639 fjz2 = _mm_setzero_ps();
641 /**************************
642 * CALCULATE INTERACTIONS *
643 **************************/
645 /* REACTION-FIELD ELECTROSTATICS */
646 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
647 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
649 /* LENNARD-JONES DISPERSION/REPULSION */
651 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
652 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
653 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
654 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
655 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velec = _mm_andnot_ps(dummy_mask,velec);
659 velecsum = _mm_add_ps(velecsum,velec);
660 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
661 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
663 fscal = _mm_add_ps(felec,fvdw);
665 fscal = _mm_andnot_ps(dummy_mask,fscal);
667 /* Calculate temporary vectorial force */
668 tx = _mm_mul_ps(fscal,dx00);
669 ty = _mm_mul_ps(fscal,dy00);
670 tz = _mm_mul_ps(fscal,dz00);
672 /* Update vectorial force */
673 fix0 = _mm_add_ps(fix0,tx);
674 fiy0 = _mm_add_ps(fiy0,ty);
675 fiz0 = _mm_add_ps(fiz0,tz);
677 fjx0 = _mm_add_ps(fjx0,tx);
678 fjy0 = _mm_add_ps(fjy0,ty);
679 fjz0 = _mm_add_ps(fjz0,tz);
681 /**************************
682 * CALCULATE INTERACTIONS *
683 **************************/
685 /* REACTION-FIELD ELECTROSTATICS */
686 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
687 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
689 /* Update potential sum for this i atom from the interaction with this j atom. */
690 velec = _mm_andnot_ps(dummy_mask,velec);
691 velecsum = _mm_add_ps(velecsum,velec);
695 fscal = _mm_andnot_ps(dummy_mask,fscal);
697 /* Calculate temporary vectorial force */
698 tx = _mm_mul_ps(fscal,dx01);
699 ty = _mm_mul_ps(fscal,dy01);
700 tz = _mm_mul_ps(fscal,dz01);
702 /* Update vectorial force */
703 fix0 = _mm_add_ps(fix0,tx);
704 fiy0 = _mm_add_ps(fiy0,ty);
705 fiz0 = _mm_add_ps(fiz0,tz);
707 fjx1 = _mm_add_ps(fjx1,tx);
708 fjy1 = _mm_add_ps(fjy1,ty);
709 fjz1 = _mm_add_ps(fjz1,tz);
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
715 /* REACTION-FIELD ELECTROSTATICS */
716 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
717 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
719 /* Update potential sum for this i atom from the interaction with this j atom. */
720 velec = _mm_andnot_ps(dummy_mask,velec);
721 velecsum = _mm_add_ps(velecsum,velec);
725 fscal = _mm_andnot_ps(dummy_mask,fscal);
727 /* Calculate temporary vectorial force */
728 tx = _mm_mul_ps(fscal,dx02);
729 ty = _mm_mul_ps(fscal,dy02);
730 tz = _mm_mul_ps(fscal,dz02);
732 /* Update vectorial force */
733 fix0 = _mm_add_ps(fix0,tx);
734 fiy0 = _mm_add_ps(fiy0,ty);
735 fiz0 = _mm_add_ps(fiz0,tz);
737 fjx2 = _mm_add_ps(fjx2,tx);
738 fjy2 = _mm_add_ps(fjy2,ty);
739 fjz2 = _mm_add_ps(fjz2,tz);
741 /**************************
742 * CALCULATE INTERACTIONS *
743 **************************/
745 /* REACTION-FIELD ELECTROSTATICS */
746 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
747 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
749 /* Update potential sum for this i atom from the interaction with this j atom. */
750 velec = _mm_andnot_ps(dummy_mask,velec);
751 velecsum = _mm_add_ps(velecsum,velec);
755 fscal = _mm_andnot_ps(dummy_mask,fscal);
757 /* Calculate temporary vectorial force */
758 tx = _mm_mul_ps(fscal,dx10);
759 ty = _mm_mul_ps(fscal,dy10);
760 tz = _mm_mul_ps(fscal,dz10);
762 /* Update vectorial force */
763 fix1 = _mm_add_ps(fix1,tx);
764 fiy1 = _mm_add_ps(fiy1,ty);
765 fiz1 = _mm_add_ps(fiz1,tz);
767 fjx0 = _mm_add_ps(fjx0,tx);
768 fjy0 = _mm_add_ps(fjy0,ty);
769 fjz0 = _mm_add_ps(fjz0,tz);
771 /**************************
772 * CALCULATE INTERACTIONS *
773 **************************/
775 /* REACTION-FIELD ELECTROSTATICS */
776 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
777 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
779 /* Update potential sum for this i atom from the interaction with this j atom. */
780 velec = _mm_andnot_ps(dummy_mask,velec);
781 velecsum = _mm_add_ps(velecsum,velec);
785 fscal = _mm_andnot_ps(dummy_mask,fscal);
787 /* Calculate temporary vectorial force */
788 tx = _mm_mul_ps(fscal,dx11);
789 ty = _mm_mul_ps(fscal,dy11);
790 tz = _mm_mul_ps(fscal,dz11);
792 /* Update vectorial force */
793 fix1 = _mm_add_ps(fix1,tx);
794 fiy1 = _mm_add_ps(fiy1,ty);
795 fiz1 = _mm_add_ps(fiz1,tz);
797 fjx1 = _mm_add_ps(fjx1,tx);
798 fjy1 = _mm_add_ps(fjy1,ty);
799 fjz1 = _mm_add_ps(fjz1,tz);
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 /* REACTION-FIELD ELECTROSTATICS */
806 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
807 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
809 /* Update potential sum for this i atom from the interaction with this j atom. */
810 velec = _mm_andnot_ps(dummy_mask,velec);
811 velecsum = _mm_add_ps(velecsum,velec);
815 fscal = _mm_andnot_ps(dummy_mask,fscal);
817 /* Calculate temporary vectorial force */
818 tx = _mm_mul_ps(fscal,dx12);
819 ty = _mm_mul_ps(fscal,dy12);
820 tz = _mm_mul_ps(fscal,dz12);
822 /* Update vectorial force */
823 fix1 = _mm_add_ps(fix1,tx);
824 fiy1 = _mm_add_ps(fiy1,ty);
825 fiz1 = _mm_add_ps(fiz1,tz);
827 fjx2 = _mm_add_ps(fjx2,tx);
828 fjy2 = _mm_add_ps(fjy2,ty);
829 fjz2 = _mm_add_ps(fjz2,tz);
831 /**************************
832 * CALCULATE INTERACTIONS *
833 **************************/
835 /* REACTION-FIELD ELECTROSTATICS */
836 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
837 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
839 /* Update potential sum for this i atom from the interaction with this j atom. */
840 velec = _mm_andnot_ps(dummy_mask,velec);
841 velecsum = _mm_add_ps(velecsum,velec);
845 fscal = _mm_andnot_ps(dummy_mask,fscal);
847 /* Calculate temporary vectorial force */
848 tx = _mm_mul_ps(fscal,dx20);
849 ty = _mm_mul_ps(fscal,dy20);
850 tz = _mm_mul_ps(fscal,dz20);
852 /* Update vectorial force */
853 fix2 = _mm_add_ps(fix2,tx);
854 fiy2 = _mm_add_ps(fiy2,ty);
855 fiz2 = _mm_add_ps(fiz2,tz);
857 fjx0 = _mm_add_ps(fjx0,tx);
858 fjy0 = _mm_add_ps(fjy0,ty);
859 fjz0 = _mm_add_ps(fjz0,tz);
861 /**************************
862 * CALCULATE INTERACTIONS *
863 **************************/
865 /* REACTION-FIELD ELECTROSTATICS */
866 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
867 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
869 /* Update potential sum for this i atom from the interaction with this j atom. */
870 velec = _mm_andnot_ps(dummy_mask,velec);
871 velecsum = _mm_add_ps(velecsum,velec);
875 fscal = _mm_andnot_ps(dummy_mask,fscal);
877 /* Calculate temporary vectorial force */
878 tx = _mm_mul_ps(fscal,dx21);
879 ty = _mm_mul_ps(fscal,dy21);
880 tz = _mm_mul_ps(fscal,dz21);
882 /* Update vectorial force */
883 fix2 = _mm_add_ps(fix2,tx);
884 fiy2 = _mm_add_ps(fiy2,ty);
885 fiz2 = _mm_add_ps(fiz2,tz);
887 fjx1 = _mm_add_ps(fjx1,tx);
888 fjy1 = _mm_add_ps(fjy1,ty);
889 fjz1 = _mm_add_ps(fjz1,tz);
891 /**************************
892 * CALCULATE INTERACTIONS *
893 **************************/
895 /* REACTION-FIELD ELECTROSTATICS */
896 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
897 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec = _mm_andnot_ps(dummy_mask,velec);
901 velecsum = _mm_add_ps(velecsum,velec);
905 fscal = _mm_andnot_ps(dummy_mask,fscal);
907 /* Calculate temporary vectorial force */
908 tx = _mm_mul_ps(fscal,dx22);
909 ty = _mm_mul_ps(fscal,dy22);
910 tz = _mm_mul_ps(fscal,dz22);
912 /* Update vectorial force */
913 fix2 = _mm_add_ps(fix2,tx);
914 fiy2 = _mm_add_ps(fiy2,ty);
915 fiz2 = _mm_add_ps(fiz2,tz);
917 fjx2 = _mm_add_ps(fjx2,tx);
918 fjy2 = _mm_add_ps(fjy2,ty);
919 fjz2 = _mm_add_ps(fjz2,tz);
921 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
922 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
923 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
924 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
926 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
927 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
929 /* Inner loop uses 300 flops */
932 /* End of innermost loop */
934 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
935 f+i_coord_offset,fshift+i_shift_offset);
938 /* Update potential energies */
939 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
940 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
942 /* Increment number of inner iterations */
943 inneriter += j_index_end - j_index_start;
945 /* Outer loop uses 20 flops */
948 /* Increment number of outer iterations */
951 /* Update outer/inner flops */
953 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300);
956 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
957 * Electrostatics interaction: ReactionField
958 * VdW interaction: LennardJones
959 * Geometry: Water3-Water3
960 * Calculate force/pot: Force
963 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
964 (t_nblist * gmx_restrict nlist,
965 rvec * gmx_restrict xx,
966 rvec * gmx_restrict ff,
967 t_forcerec * gmx_restrict fr,
968 t_mdatoms * gmx_restrict mdatoms,
969 nb_kernel_data_t * gmx_restrict kernel_data,
970 t_nrnb * gmx_restrict nrnb)
972 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
973 * just 0 for non-waters.
974 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
975 * jnr indices corresponding to data put in the four positions in the SIMD register.
977 int i_shift_offset,i_coord_offset,outeriter,inneriter;
978 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
979 int jnrA,jnrB,jnrC,jnrD;
980 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
981 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
982 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
984 real *shiftvec,*fshift,*x,*f;
985 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
987 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
989 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
991 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
993 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
994 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
995 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
996 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
997 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
998 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
999 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1000 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1001 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1002 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1003 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1004 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1005 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1006 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1007 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1008 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1009 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1012 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1015 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1016 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1017 __m128 dummy_mask,cutoff_mask;
1018 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1019 __m128 one = _mm_set1_ps(1.0);
1020 __m128 two = _mm_set1_ps(2.0);
1026 jindex = nlist->jindex;
1028 shiftidx = nlist->shift;
1030 shiftvec = fr->shift_vec[0];
1031 fshift = fr->fshift[0];
1032 facel = _mm_set1_ps(fr->epsfac);
1033 charge = mdatoms->chargeA;
1034 krf = _mm_set1_ps(fr->ic->k_rf);
1035 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1036 crf = _mm_set1_ps(fr->ic->c_rf);
1037 nvdwtype = fr->ntype;
1038 vdwparam = fr->nbfp;
1039 vdwtype = mdatoms->typeA;
1041 /* Setup water-specific parameters */
1042 inr = nlist->iinr[0];
1043 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1044 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1045 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1046 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1048 jq0 = _mm_set1_ps(charge[inr+0]);
1049 jq1 = _mm_set1_ps(charge[inr+1]);
1050 jq2 = _mm_set1_ps(charge[inr+2]);
1051 vdwjidx0A = 2*vdwtype[inr+0];
1052 qq00 = _mm_mul_ps(iq0,jq0);
1053 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1054 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1055 qq01 = _mm_mul_ps(iq0,jq1);
1056 qq02 = _mm_mul_ps(iq0,jq2);
1057 qq10 = _mm_mul_ps(iq1,jq0);
1058 qq11 = _mm_mul_ps(iq1,jq1);
1059 qq12 = _mm_mul_ps(iq1,jq2);
1060 qq20 = _mm_mul_ps(iq2,jq0);
1061 qq21 = _mm_mul_ps(iq2,jq1);
1062 qq22 = _mm_mul_ps(iq2,jq2);
1064 /* Avoid stupid compiler warnings */
1065 jnrA = jnrB = jnrC = jnrD = 0;
1066 j_coord_offsetA = 0;
1067 j_coord_offsetB = 0;
1068 j_coord_offsetC = 0;
1069 j_coord_offsetD = 0;
1074 for(iidx=0;iidx<4*DIM;iidx++)
1076 scratch[iidx] = 0.0;
1079 /* Start outer loop over neighborlists */
1080 for(iidx=0; iidx<nri; iidx++)
1082 /* Load shift vector for this list */
1083 i_shift_offset = DIM*shiftidx[iidx];
1085 /* Load limits for loop over neighbors */
1086 j_index_start = jindex[iidx];
1087 j_index_end = jindex[iidx+1];
1089 /* Get outer coordinate index */
1091 i_coord_offset = DIM*inr;
1093 /* Load i particle coords and add shift vector */
1094 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1095 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1097 fix0 = _mm_setzero_ps();
1098 fiy0 = _mm_setzero_ps();
1099 fiz0 = _mm_setzero_ps();
1100 fix1 = _mm_setzero_ps();
1101 fiy1 = _mm_setzero_ps();
1102 fiz1 = _mm_setzero_ps();
1103 fix2 = _mm_setzero_ps();
1104 fiy2 = _mm_setzero_ps();
1105 fiz2 = _mm_setzero_ps();
1107 /* Start inner kernel loop */
1108 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1111 /* Get j neighbor index, and coordinate index */
1113 jnrB = jjnr[jidx+1];
1114 jnrC = jjnr[jidx+2];
1115 jnrD = jjnr[jidx+3];
1116 j_coord_offsetA = DIM*jnrA;
1117 j_coord_offsetB = DIM*jnrB;
1118 j_coord_offsetC = DIM*jnrC;
1119 j_coord_offsetD = DIM*jnrD;
1121 /* load j atom coordinates */
1122 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1123 x+j_coord_offsetC,x+j_coord_offsetD,
1124 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1126 /* Calculate displacement vector */
1127 dx00 = _mm_sub_ps(ix0,jx0);
1128 dy00 = _mm_sub_ps(iy0,jy0);
1129 dz00 = _mm_sub_ps(iz0,jz0);
1130 dx01 = _mm_sub_ps(ix0,jx1);
1131 dy01 = _mm_sub_ps(iy0,jy1);
1132 dz01 = _mm_sub_ps(iz0,jz1);
1133 dx02 = _mm_sub_ps(ix0,jx2);
1134 dy02 = _mm_sub_ps(iy0,jy2);
1135 dz02 = _mm_sub_ps(iz0,jz2);
1136 dx10 = _mm_sub_ps(ix1,jx0);
1137 dy10 = _mm_sub_ps(iy1,jy0);
1138 dz10 = _mm_sub_ps(iz1,jz0);
1139 dx11 = _mm_sub_ps(ix1,jx1);
1140 dy11 = _mm_sub_ps(iy1,jy1);
1141 dz11 = _mm_sub_ps(iz1,jz1);
1142 dx12 = _mm_sub_ps(ix1,jx2);
1143 dy12 = _mm_sub_ps(iy1,jy2);
1144 dz12 = _mm_sub_ps(iz1,jz2);
1145 dx20 = _mm_sub_ps(ix2,jx0);
1146 dy20 = _mm_sub_ps(iy2,jy0);
1147 dz20 = _mm_sub_ps(iz2,jz0);
1148 dx21 = _mm_sub_ps(ix2,jx1);
1149 dy21 = _mm_sub_ps(iy2,jy1);
1150 dz21 = _mm_sub_ps(iz2,jz1);
1151 dx22 = _mm_sub_ps(ix2,jx2);
1152 dy22 = _mm_sub_ps(iy2,jy2);
1153 dz22 = _mm_sub_ps(iz2,jz2);
1155 /* Calculate squared distance and things based on it */
1156 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1157 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1158 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1159 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1160 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1161 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1162 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1163 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1164 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1166 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1167 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1168 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1169 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1170 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1171 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1172 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1173 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1174 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1176 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1177 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1178 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1179 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1180 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1181 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1182 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1183 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1184 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1186 fjx0 = _mm_setzero_ps();
1187 fjy0 = _mm_setzero_ps();
1188 fjz0 = _mm_setzero_ps();
1189 fjx1 = _mm_setzero_ps();
1190 fjy1 = _mm_setzero_ps();
1191 fjz1 = _mm_setzero_ps();
1192 fjx2 = _mm_setzero_ps();
1193 fjy2 = _mm_setzero_ps();
1194 fjz2 = _mm_setzero_ps();
1196 /**************************
1197 * CALCULATE INTERACTIONS *
1198 **************************/
1200 /* REACTION-FIELD ELECTROSTATICS */
1201 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1203 /* LENNARD-JONES DISPERSION/REPULSION */
1205 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1206 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1208 fscal = _mm_add_ps(felec,fvdw);
1210 /* Calculate temporary vectorial force */
1211 tx = _mm_mul_ps(fscal,dx00);
1212 ty = _mm_mul_ps(fscal,dy00);
1213 tz = _mm_mul_ps(fscal,dz00);
1215 /* Update vectorial force */
1216 fix0 = _mm_add_ps(fix0,tx);
1217 fiy0 = _mm_add_ps(fiy0,ty);
1218 fiz0 = _mm_add_ps(fiz0,tz);
1220 fjx0 = _mm_add_ps(fjx0,tx);
1221 fjy0 = _mm_add_ps(fjy0,ty);
1222 fjz0 = _mm_add_ps(fjz0,tz);
1224 /**************************
1225 * CALCULATE INTERACTIONS *
1226 **************************/
1228 /* REACTION-FIELD ELECTROSTATICS */
1229 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1233 /* Calculate temporary vectorial force */
1234 tx = _mm_mul_ps(fscal,dx01);
1235 ty = _mm_mul_ps(fscal,dy01);
1236 tz = _mm_mul_ps(fscal,dz01);
1238 /* Update vectorial force */
1239 fix0 = _mm_add_ps(fix0,tx);
1240 fiy0 = _mm_add_ps(fiy0,ty);
1241 fiz0 = _mm_add_ps(fiz0,tz);
1243 fjx1 = _mm_add_ps(fjx1,tx);
1244 fjy1 = _mm_add_ps(fjy1,ty);
1245 fjz1 = _mm_add_ps(fjz1,tz);
1247 /**************************
1248 * CALCULATE INTERACTIONS *
1249 **************************/
1251 /* REACTION-FIELD ELECTROSTATICS */
1252 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1256 /* Calculate temporary vectorial force */
1257 tx = _mm_mul_ps(fscal,dx02);
1258 ty = _mm_mul_ps(fscal,dy02);
1259 tz = _mm_mul_ps(fscal,dz02);
1261 /* Update vectorial force */
1262 fix0 = _mm_add_ps(fix0,tx);
1263 fiy0 = _mm_add_ps(fiy0,ty);
1264 fiz0 = _mm_add_ps(fiz0,tz);
1266 fjx2 = _mm_add_ps(fjx2,tx);
1267 fjy2 = _mm_add_ps(fjy2,ty);
1268 fjz2 = _mm_add_ps(fjz2,tz);
1270 /**************************
1271 * CALCULATE INTERACTIONS *
1272 **************************/
1274 /* REACTION-FIELD ELECTROSTATICS */
1275 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1279 /* Calculate temporary vectorial force */
1280 tx = _mm_mul_ps(fscal,dx10);
1281 ty = _mm_mul_ps(fscal,dy10);
1282 tz = _mm_mul_ps(fscal,dz10);
1284 /* Update vectorial force */
1285 fix1 = _mm_add_ps(fix1,tx);
1286 fiy1 = _mm_add_ps(fiy1,ty);
1287 fiz1 = _mm_add_ps(fiz1,tz);
1289 fjx0 = _mm_add_ps(fjx0,tx);
1290 fjy0 = _mm_add_ps(fjy0,ty);
1291 fjz0 = _mm_add_ps(fjz0,tz);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* REACTION-FIELD ELECTROSTATICS */
1298 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1302 /* Calculate temporary vectorial force */
1303 tx = _mm_mul_ps(fscal,dx11);
1304 ty = _mm_mul_ps(fscal,dy11);
1305 tz = _mm_mul_ps(fscal,dz11);
1307 /* Update vectorial force */
1308 fix1 = _mm_add_ps(fix1,tx);
1309 fiy1 = _mm_add_ps(fiy1,ty);
1310 fiz1 = _mm_add_ps(fiz1,tz);
1312 fjx1 = _mm_add_ps(fjx1,tx);
1313 fjy1 = _mm_add_ps(fjy1,ty);
1314 fjz1 = _mm_add_ps(fjz1,tz);
1316 /**************************
1317 * CALCULATE INTERACTIONS *
1318 **************************/
1320 /* REACTION-FIELD ELECTROSTATICS */
1321 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1325 /* Calculate temporary vectorial force */
1326 tx = _mm_mul_ps(fscal,dx12);
1327 ty = _mm_mul_ps(fscal,dy12);
1328 tz = _mm_mul_ps(fscal,dz12);
1330 /* Update vectorial force */
1331 fix1 = _mm_add_ps(fix1,tx);
1332 fiy1 = _mm_add_ps(fiy1,ty);
1333 fiz1 = _mm_add_ps(fiz1,tz);
1335 fjx2 = _mm_add_ps(fjx2,tx);
1336 fjy2 = _mm_add_ps(fjy2,ty);
1337 fjz2 = _mm_add_ps(fjz2,tz);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 /* REACTION-FIELD ELECTROSTATICS */
1344 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1348 /* Calculate temporary vectorial force */
1349 tx = _mm_mul_ps(fscal,dx20);
1350 ty = _mm_mul_ps(fscal,dy20);
1351 tz = _mm_mul_ps(fscal,dz20);
1353 /* Update vectorial force */
1354 fix2 = _mm_add_ps(fix2,tx);
1355 fiy2 = _mm_add_ps(fiy2,ty);
1356 fiz2 = _mm_add_ps(fiz2,tz);
1358 fjx0 = _mm_add_ps(fjx0,tx);
1359 fjy0 = _mm_add_ps(fjy0,ty);
1360 fjz0 = _mm_add_ps(fjz0,tz);
1362 /**************************
1363 * CALCULATE INTERACTIONS *
1364 **************************/
1366 /* REACTION-FIELD ELECTROSTATICS */
1367 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1371 /* Calculate temporary vectorial force */
1372 tx = _mm_mul_ps(fscal,dx21);
1373 ty = _mm_mul_ps(fscal,dy21);
1374 tz = _mm_mul_ps(fscal,dz21);
1376 /* Update vectorial force */
1377 fix2 = _mm_add_ps(fix2,tx);
1378 fiy2 = _mm_add_ps(fiy2,ty);
1379 fiz2 = _mm_add_ps(fiz2,tz);
1381 fjx1 = _mm_add_ps(fjx1,tx);
1382 fjy1 = _mm_add_ps(fjy1,ty);
1383 fjz1 = _mm_add_ps(fjz1,tz);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1389 /* REACTION-FIELD ELECTROSTATICS */
1390 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1394 /* Calculate temporary vectorial force */
1395 tx = _mm_mul_ps(fscal,dx22);
1396 ty = _mm_mul_ps(fscal,dy22);
1397 tz = _mm_mul_ps(fscal,dz22);
1399 /* Update vectorial force */
1400 fix2 = _mm_add_ps(fix2,tx);
1401 fiy2 = _mm_add_ps(fiy2,ty);
1402 fiz2 = _mm_add_ps(fiz2,tz);
1404 fjx2 = _mm_add_ps(fjx2,tx);
1405 fjy2 = _mm_add_ps(fjy2,ty);
1406 fjz2 = _mm_add_ps(fjz2,tz);
1408 fjptrA = f+j_coord_offsetA;
1409 fjptrB = f+j_coord_offsetB;
1410 fjptrC = f+j_coord_offsetC;
1411 fjptrD = f+j_coord_offsetD;
1413 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1414 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1416 /* Inner loop uses 250 flops */
1419 if(jidx<j_index_end)
1422 /* Get j neighbor index, and coordinate index */
1423 jnrlistA = jjnr[jidx];
1424 jnrlistB = jjnr[jidx+1];
1425 jnrlistC = jjnr[jidx+2];
1426 jnrlistD = jjnr[jidx+3];
1427 /* Sign of each element will be negative for non-real atoms.
1428 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1429 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1431 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1432 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1433 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1434 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1435 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1436 j_coord_offsetA = DIM*jnrA;
1437 j_coord_offsetB = DIM*jnrB;
1438 j_coord_offsetC = DIM*jnrC;
1439 j_coord_offsetD = DIM*jnrD;
1441 /* load j atom coordinates */
1442 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1443 x+j_coord_offsetC,x+j_coord_offsetD,
1444 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1446 /* Calculate displacement vector */
1447 dx00 = _mm_sub_ps(ix0,jx0);
1448 dy00 = _mm_sub_ps(iy0,jy0);
1449 dz00 = _mm_sub_ps(iz0,jz0);
1450 dx01 = _mm_sub_ps(ix0,jx1);
1451 dy01 = _mm_sub_ps(iy0,jy1);
1452 dz01 = _mm_sub_ps(iz0,jz1);
1453 dx02 = _mm_sub_ps(ix0,jx2);
1454 dy02 = _mm_sub_ps(iy0,jy2);
1455 dz02 = _mm_sub_ps(iz0,jz2);
1456 dx10 = _mm_sub_ps(ix1,jx0);
1457 dy10 = _mm_sub_ps(iy1,jy0);
1458 dz10 = _mm_sub_ps(iz1,jz0);
1459 dx11 = _mm_sub_ps(ix1,jx1);
1460 dy11 = _mm_sub_ps(iy1,jy1);
1461 dz11 = _mm_sub_ps(iz1,jz1);
1462 dx12 = _mm_sub_ps(ix1,jx2);
1463 dy12 = _mm_sub_ps(iy1,jy2);
1464 dz12 = _mm_sub_ps(iz1,jz2);
1465 dx20 = _mm_sub_ps(ix2,jx0);
1466 dy20 = _mm_sub_ps(iy2,jy0);
1467 dz20 = _mm_sub_ps(iz2,jz0);
1468 dx21 = _mm_sub_ps(ix2,jx1);
1469 dy21 = _mm_sub_ps(iy2,jy1);
1470 dz21 = _mm_sub_ps(iz2,jz1);
1471 dx22 = _mm_sub_ps(ix2,jx2);
1472 dy22 = _mm_sub_ps(iy2,jy2);
1473 dz22 = _mm_sub_ps(iz2,jz2);
1475 /* Calculate squared distance and things based on it */
1476 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1477 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1478 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1479 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1480 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1481 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1482 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1483 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1484 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1486 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1487 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1488 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1489 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1490 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1491 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1492 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1493 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1494 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1496 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1497 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1498 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1499 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1500 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1501 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1502 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1503 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1504 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1506 fjx0 = _mm_setzero_ps();
1507 fjy0 = _mm_setzero_ps();
1508 fjz0 = _mm_setzero_ps();
1509 fjx1 = _mm_setzero_ps();
1510 fjy1 = _mm_setzero_ps();
1511 fjz1 = _mm_setzero_ps();
1512 fjx2 = _mm_setzero_ps();
1513 fjy2 = _mm_setzero_ps();
1514 fjz2 = _mm_setzero_ps();
1516 /**************************
1517 * CALCULATE INTERACTIONS *
1518 **************************/
1520 /* REACTION-FIELD ELECTROSTATICS */
1521 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1523 /* LENNARD-JONES DISPERSION/REPULSION */
1525 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1526 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1528 fscal = _mm_add_ps(felec,fvdw);
1530 fscal = _mm_andnot_ps(dummy_mask,fscal);
1532 /* Calculate temporary vectorial force */
1533 tx = _mm_mul_ps(fscal,dx00);
1534 ty = _mm_mul_ps(fscal,dy00);
1535 tz = _mm_mul_ps(fscal,dz00);
1537 /* Update vectorial force */
1538 fix0 = _mm_add_ps(fix0,tx);
1539 fiy0 = _mm_add_ps(fiy0,ty);
1540 fiz0 = _mm_add_ps(fiz0,tz);
1542 fjx0 = _mm_add_ps(fjx0,tx);
1543 fjy0 = _mm_add_ps(fjy0,ty);
1544 fjz0 = _mm_add_ps(fjz0,tz);
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 /* REACTION-FIELD ELECTROSTATICS */
1551 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1555 fscal = _mm_andnot_ps(dummy_mask,fscal);
1557 /* Calculate temporary vectorial force */
1558 tx = _mm_mul_ps(fscal,dx01);
1559 ty = _mm_mul_ps(fscal,dy01);
1560 tz = _mm_mul_ps(fscal,dz01);
1562 /* Update vectorial force */
1563 fix0 = _mm_add_ps(fix0,tx);
1564 fiy0 = _mm_add_ps(fiy0,ty);
1565 fiz0 = _mm_add_ps(fiz0,tz);
1567 fjx1 = _mm_add_ps(fjx1,tx);
1568 fjy1 = _mm_add_ps(fjy1,ty);
1569 fjz1 = _mm_add_ps(fjz1,tz);
1571 /**************************
1572 * CALCULATE INTERACTIONS *
1573 **************************/
1575 /* REACTION-FIELD ELECTROSTATICS */
1576 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1580 fscal = _mm_andnot_ps(dummy_mask,fscal);
1582 /* Calculate temporary vectorial force */
1583 tx = _mm_mul_ps(fscal,dx02);
1584 ty = _mm_mul_ps(fscal,dy02);
1585 tz = _mm_mul_ps(fscal,dz02);
1587 /* Update vectorial force */
1588 fix0 = _mm_add_ps(fix0,tx);
1589 fiy0 = _mm_add_ps(fiy0,ty);
1590 fiz0 = _mm_add_ps(fiz0,tz);
1592 fjx2 = _mm_add_ps(fjx2,tx);
1593 fjy2 = _mm_add_ps(fjy2,ty);
1594 fjz2 = _mm_add_ps(fjz2,tz);
1596 /**************************
1597 * CALCULATE INTERACTIONS *
1598 **************************/
1600 /* REACTION-FIELD ELECTROSTATICS */
1601 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1605 fscal = _mm_andnot_ps(dummy_mask,fscal);
1607 /* Calculate temporary vectorial force */
1608 tx = _mm_mul_ps(fscal,dx10);
1609 ty = _mm_mul_ps(fscal,dy10);
1610 tz = _mm_mul_ps(fscal,dz10);
1612 /* Update vectorial force */
1613 fix1 = _mm_add_ps(fix1,tx);
1614 fiy1 = _mm_add_ps(fiy1,ty);
1615 fiz1 = _mm_add_ps(fiz1,tz);
1617 fjx0 = _mm_add_ps(fjx0,tx);
1618 fjy0 = _mm_add_ps(fjy0,ty);
1619 fjz0 = _mm_add_ps(fjz0,tz);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 /* REACTION-FIELD ELECTROSTATICS */
1626 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1630 fscal = _mm_andnot_ps(dummy_mask,fscal);
1632 /* Calculate temporary vectorial force */
1633 tx = _mm_mul_ps(fscal,dx11);
1634 ty = _mm_mul_ps(fscal,dy11);
1635 tz = _mm_mul_ps(fscal,dz11);
1637 /* Update vectorial force */
1638 fix1 = _mm_add_ps(fix1,tx);
1639 fiy1 = _mm_add_ps(fiy1,ty);
1640 fiz1 = _mm_add_ps(fiz1,tz);
1642 fjx1 = _mm_add_ps(fjx1,tx);
1643 fjy1 = _mm_add_ps(fjy1,ty);
1644 fjz1 = _mm_add_ps(fjz1,tz);
1646 /**************************
1647 * CALCULATE INTERACTIONS *
1648 **************************/
1650 /* REACTION-FIELD ELECTROSTATICS */
1651 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1655 fscal = _mm_andnot_ps(dummy_mask,fscal);
1657 /* Calculate temporary vectorial force */
1658 tx = _mm_mul_ps(fscal,dx12);
1659 ty = _mm_mul_ps(fscal,dy12);
1660 tz = _mm_mul_ps(fscal,dz12);
1662 /* Update vectorial force */
1663 fix1 = _mm_add_ps(fix1,tx);
1664 fiy1 = _mm_add_ps(fiy1,ty);
1665 fiz1 = _mm_add_ps(fiz1,tz);
1667 fjx2 = _mm_add_ps(fjx2,tx);
1668 fjy2 = _mm_add_ps(fjy2,ty);
1669 fjz2 = _mm_add_ps(fjz2,tz);
1671 /**************************
1672 * CALCULATE INTERACTIONS *
1673 **************************/
1675 /* REACTION-FIELD ELECTROSTATICS */
1676 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1680 fscal = _mm_andnot_ps(dummy_mask,fscal);
1682 /* Calculate temporary vectorial force */
1683 tx = _mm_mul_ps(fscal,dx20);
1684 ty = _mm_mul_ps(fscal,dy20);
1685 tz = _mm_mul_ps(fscal,dz20);
1687 /* Update vectorial force */
1688 fix2 = _mm_add_ps(fix2,tx);
1689 fiy2 = _mm_add_ps(fiy2,ty);
1690 fiz2 = _mm_add_ps(fiz2,tz);
1692 fjx0 = _mm_add_ps(fjx0,tx);
1693 fjy0 = _mm_add_ps(fjy0,ty);
1694 fjz0 = _mm_add_ps(fjz0,tz);
1696 /**************************
1697 * CALCULATE INTERACTIONS *
1698 **************************/
1700 /* REACTION-FIELD ELECTROSTATICS */
1701 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1705 fscal = _mm_andnot_ps(dummy_mask,fscal);
1707 /* Calculate temporary vectorial force */
1708 tx = _mm_mul_ps(fscal,dx21);
1709 ty = _mm_mul_ps(fscal,dy21);
1710 tz = _mm_mul_ps(fscal,dz21);
1712 /* Update vectorial force */
1713 fix2 = _mm_add_ps(fix2,tx);
1714 fiy2 = _mm_add_ps(fiy2,ty);
1715 fiz2 = _mm_add_ps(fiz2,tz);
1717 fjx1 = _mm_add_ps(fjx1,tx);
1718 fjy1 = _mm_add_ps(fjy1,ty);
1719 fjz1 = _mm_add_ps(fjz1,tz);
1721 /**************************
1722 * CALCULATE INTERACTIONS *
1723 **************************/
1725 /* REACTION-FIELD ELECTROSTATICS */
1726 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1730 fscal = _mm_andnot_ps(dummy_mask,fscal);
1732 /* Calculate temporary vectorial force */
1733 tx = _mm_mul_ps(fscal,dx22);
1734 ty = _mm_mul_ps(fscal,dy22);
1735 tz = _mm_mul_ps(fscal,dz22);
1737 /* Update vectorial force */
1738 fix2 = _mm_add_ps(fix2,tx);
1739 fiy2 = _mm_add_ps(fiy2,ty);
1740 fiz2 = _mm_add_ps(fiz2,tz);
1742 fjx2 = _mm_add_ps(fjx2,tx);
1743 fjy2 = _mm_add_ps(fjy2,ty);
1744 fjz2 = _mm_add_ps(fjz2,tz);
1746 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1747 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1748 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1749 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1751 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1752 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1754 /* Inner loop uses 250 flops */
1757 /* End of innermost loop */
1759 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1760 f+i_coord_offset,fshift+i_shift_offset);
1762 /* Increment number of inner iterations */
1763 inneriter += j_index_end - j_index_start;
1765 /* Outer loop uses 18 flops */
1768 /* Increment number of outer iterations */
1771 /* Update outer/inner flops */
1773 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);