/*
 * Note: this file was generated by the Gromacs sse2_single kernel generator.
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2001-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website.
 */
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
89 __m128 dummy_mask,cutoff_mask;
90 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
91 __m128 one = _mm_set1_ps(1.0);
92 __m128 two = _mm_set1_ps(2.0);
98 jindex = nlist->jindex;
100 shiftidx = nlist->shift;
102 shiftvec = fr->shift_vec[0];
103 fshift = fr->fshift[0];
104 facel = _mm_set1_ps(fr->epsfac);
105 charge = mdatoms->chargeA;
106 krf = _mm_set1_ps(fr->ic->k_rf);
107 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
108 crf = _mm_set1_ps(fr->ic->c_rf);
110 /* Setup water-specific parameters */
111 inr = nlist->iinr[0];
112 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
113 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
114 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
116 jq0 = _mm_set1_ps(charge[inr+0]);
117 jq1 = _mm_set1_ps(charge[inr+1]);
118 jq2 = _mm_set1_ps(charge[inr+2]);
119 qq00 = _mm_mul_ps(iq0,jq0);
120 qq01 = _mm_mul_ps(iq0,jq1);
121 qq02 = _mm_mul_ps(iq0,jq2);
122 qq10 = _mm_mul_ps(iq1,jq0);
123 qq11 = _mm_mul_ps(iq1,jq1);
124 qq12 = _mm_mul_ps(iq1,jq2);
125 qq20 = _mm_mul_ps(iq2,jq0);
126 qq21 = _mm_mul_ps(iq2,jq1);
127 qq22 = _mm_mul_ps(iq2,jq2);
129 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
130 rcutoff_scalar = fr->rcoulomb;
131 rcutoff = _mm_set1_ps(rcutoff_scalar);
132 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
134 /* Avoid stupid compiler warnings */
135 jnrA = jnrB = jnrC = jnrD = 0;
144 /* Start outer loop over neighborlists */
145 for(iidx=0; iidx<nri; iidx++)
147 /* Load shift vector for this list */
148 i_shift_offset = DIM*shiftidx[iidx];
149 shX = shiftvec[i_shift_offset+XX];
150 shY = shiftvec[i_shift_offset+YY];
151 shZ = shiftvec[i_shift_offset+ZZ];
153 /* Load limits for loop over neighbors */
154 j_index_start = jindex[iidx];
155 j_index_end = jindex[iidx+1];
157 /* Get outer coordinate index */
159 i_coord_offset = DIM*inr;
161 /* Load i particle coords and add shift vector */
162 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
163 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
164 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
165 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
166 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
167 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
168 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
169 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
170 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
172 fix0 = _mm_setzero_ps();
173 fiy0 = _mm_setzero_ps();
174 fiz0 = _mm_setzero_ps();
175 fix1 = _mm_setzero_ps();
176 fiy1 = _mm_setzero_ps();
177 fiz1 = _mm_setzero_ps();
178 fix2 = _mm_setzero_ps();
179 fiy2 = _mm_setzero_ps();
180 fiz2 = _mm_setzero_ps();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_ps();
185 /* Start inner kernel loop */
186 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
189 /* Get j neighbor index, and coordinate index */
195 j_coord_offsetA = DIM*jnrA;
196 j_coord_offsetB = DIM*jnrB;
197 j_coord_offsetC = DIM*jnrC;
198 j_coord_offsetD = DIM*jnrD;
200 /* load j atom coordinates */
201 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
202 x+j_coord_offsetC,x+j_coord_offsetD,
203 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
205 /* Calculate displacement vector */
206 dx00 = _mm_sub_ps(ix0,jx0);
207 dy00 = _mm_sub_ps(iy0,jy0);
208 dz00 = _mm_sub_ps(iz0,jz0);
209 dx01 = _mm_sub_ps(ix0,jx1);
210 dy01 = _mm_sub_ps(iy0,jy1);
211 dz01 = _mm_sub_ps(iz0,jz1);
212 dx02 = _mm_sub_ps(ix0,jx2);
213 dy02 = _mm_sub_ps(iy0,jy2);
214 dz02 = _mm_sub_ps(iz0,jz2);
215 dx10 = _mm_sub_ps(ix1,jx0);
216 dy10 = _mm_sub_ps(iy1,jy0);
217 dz10 = _mm_sub_ps(iz1,jz0);
218 dx11 = _mm_sub_ps(ix1,jx1);
219 dy11 = _mm_sub_ps(iy1,jy1);
220 dz11 = _mm_sub_ps(iz1,jz1);
221 dx12 = _mm_sub_ps(ix1,jx2);
222 dy12 = _mm_sub_ps(iy1,jy2);
223 dz12 = _mm_sub_ps(iz1,jz2);
224 dx20 = _mm_sub_ps(ix2,jx0);
225 dy20 = _mm_sub_ps(iy2,jy0);
226 dz20 = _mm_sub_ps(iz2,jz0);
227 dx21 = _mm_sub_ps(ix2,jx1);
228 dy21 = _mm_sub_ps(iy2,jy1);
229 dz21 = _mm_sub_ps(iz2,jz1);
230 dx22 = _mm_sub_ps(ix2,jx2);
231 dy22 = _mm_sub_ps(iy2,jy2);
232 dz22 = _mm_sub_ps(iz2,jz2);
234 /* Calculate squared distance and things based on it */
235 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
236 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
237 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
238 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
239 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
240 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
241 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
242 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
243 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
245 rinv00 = gmx_mm_invsqrt_ps(rsq00);
246 rinv01 = gmx_mm_invsqrt_ps(rsq01);
247 rinv02 = gmx_mm_invsqrt_ps(rsq02);
248 rinv10 = gmx_mm_invsqrt_ps(rsq10);
249 rinv11 = gmx_mm_invsqrt_ps(rsq11);
250 rinv12 = gmx_mm_invsqrt_ps(rsq12);
251 rinv20 = gmx_mm_invsqrt_ps(rsq20);
252 rinv21 = gmx_mm_invsqrt_ps(rsq21);
253 rinv22 = gmx_mm_invsqrt_ps(rsq22);
255 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
256 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
257 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
258 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
259 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
260 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
261 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
262 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
263 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
265 fjx0 = _mm_setzero_ps();
266 fjy0 = _mm_setzero_ps();
267 fjz0 = _mm_setzero_ps();
268 fjx1 = _mm_setzero_ps();
269 fjy1 = _mm_setzero_ps();
270 fjz1 = _mm_setzero_ps();
271 fjx2 = _mm_setzero_ps();
272 fjy2 = _mm_setzero_ps();
273 fjz2 = _mm_setzero_ps();
275 /**************************
276 * CALCULATE INTERACTIONS *
277 **************************/
279 if (gmx_mm_any_lt(rsq00,rcutoff2))
282 /* REACTION-FIELD ELECTROSTATICS */
283 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
284 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
286 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
288 /* Update potential sum for this i atom from the interaction with this j atom. */
289 velec = _mm_and_ps(velec,cutoff_mask);
290 velecsum = _mm_add_ps(velecsum,velec);
294 fscal = _mm_and_ps(fscal,cutoff_mask);
296 /* Calculate temporary vectorial force */
297 tx = _mm_mul_ps(fscal,dx00);
298 ty = _mm_mul_ps(fscal,dy00);
299 tz = _mm_mul_ps(fscal,dz00);
301 /* Update vectorial force */
302 fix0 = _mm_add_ps(fix0,tx);
303 fiy0 = _mm_add_ps(fiy0,ty);
304 fiz0 = _mm_add_ps(fiz0,tz);
306 fjx0 = _mm_add_ps(fjx0,tx);
307 fjy0 = _mm_add_ps(fjy0,ty);
308 fjz0 = _mm_add_ps(fjz0,tz);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm_any_lt(rsq01,rcutoff2))
319 /* REACTION-FIELD ELECTROSTATICS */
320 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
321 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
323 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
325 /* Update potential sum for this i atom from the interaction with this j atom. */
326 velec = _mm_and_ps(velec,cutoff_mask);
327 velecsum = _mm_add_ps(velecsum,velec);
331 fscal = _mm_and_ps(fscal,cutoff_mask);
333 /* Calculate temporary vectorial force */
334 tx = _mm_mul_ps(fscal,dx01);
335 ty = _mm_mul_ps(fscal,dy01);
336 tz = _mm_mul_ps(fscal,dz01);
338 /* Update vectorial force */
339 fix0 = _mm_add_ps(fix0,tx);
340 fiy0 = _mm_add_ps(fiy0,ty);
341 fiz0 = _mm_add_ps(fiz0,tz);
343 fjx1 = _mm_add_ps(fjx1,tx);
344 fjy1 = _mm_add_ps(fjy1,ty);
345 fjz1 = _mm_add_ps(fjz1,tz);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 if (gmx_mm_any_lt(rsq02,rcutoff2))
356 /* REACTION-FIELD ELECTROSTATICS */
357 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
358 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
360 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velec = _mm_and_ps(velec,cutoff_mask);
364 velecsum = _mm_add_ps(velecsum,velec);
368 fscal = _mm_and_ps(fscal,cutoff_mask);
370 /* Calculate temporary vectorial force */
371 tx = _mm_mul_ps(fscal,dx02);
372 ty = _mm_mul_ps(fscal,dy02);
373 tz = _mm_mul_ps(fscal,dz02);
375 /* Update vectorial force */
376 fix0 = _mm_add_ps(fix0,tx);
377 fiy0 = _mm_add_ps(fiy0,ty);
378 fiz0 = _mm_add_ps(fiz0,tz);
380 fjx2 = _mm_add_ps(fjx2,tx);
381 fjy2 = _mm_add_ps(fjy2,ty);
382 fjz2 = _mm_add_ps(fjz2,tz);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm_any_lt(rsq10,rcutoff2))
393 /* REACTION-FIELD ELECTROSTATICS */
394 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
395 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
397 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velec = _mm_and_ps(velec,cutoff_mask);
401 velecsum = _mm_add_ps(velecsum,velec);
405 fscal = _mm_and_ps(fscal,cutoff_mask);
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_ps(fscal,dx10);
409 ty = _mm_mul_ps(fscal,dy10);
410 tz = _mm_mul_ps(fscal,dz10);
412 /* Update vectorial force */
413 fix1 = _mm_add_ps(fix1,tx);
414 fiy1 = _mm_add_ps(fiy1,ty);
415 fiz1 = _mm_add_ps(fiz1,tz);
417 fjx0 = _mm_add_ps(fjx0,tx);
418 fjy0 = _mm_add_ps(fjy0,ty);
419 fjz0 = _mm_add_ps(fjz0,tz);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 if (gmx_mm_any_lt(rsq11,rcutoff2))
430 /* REACTION-FIELD ELECTROSTATICS */
431 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
432 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
434 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velec = _mm_and_ps(velec,cutoff_mask);
438 velecsum = _mm_add_ps(velecsum,velec);
442 fscal = _mm_and_ps(fscal,cutoff_mask);
444 /* Calculate temporary vectorial force */
445 tx = _mm_mul_ps(fscal,dx11);
446 ty = _mm_mul_ps(fscal,dy11);
447 tz = _mm_mul_ps(fscal,dz11);
449 /* Update vectorial force */
450 fix1 = _mm_add_ps(fix1,tx);
451 fiy1 = _mm_add_ps(fiy1,ty);
452 fiz1 = _mm_add_ps(fiz1,tz);
454 fjx1 = _mm_add_ps(fjx1,tx);
455 fjy1 = _mm_add_ps(fjy1,ty);
456 fjz1 = _mm_add_ps(fjz1,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 if (gmx_mm_any_lt(rsq12,rcutoff2))
467 /* REACTION-FIELD ELECTROSTATICS */
468 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
469 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
471 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velec = _mm_and_ps(velec,cutoff_mask);
475 velecsum = _mm_add_ps(velecsum,velec);
479 fscal = _mm_and_ps(fscal,cutoff_mask);
481 /* Calculate temporary vectorial force */
482 tx = _mm_mul_ps(fscal,dx12);
483 ty = _mm_mul_ps(fscal,dy12);
484 tz = _mm_mul_ps(fscal,dz12);
486 /* Update vectorial force */
487 fix1 = _mm_add_ps(fix1,tx);
488 fiy1 = _mm_add_ps(fiy1,ty);
489 fiz1 = _mm_add_ps(fiz1,tz);
491 fjx2 = _mm_add_ps(fjx2,tx);
492 fjy2 = _mm_add_ps(fjy2,ty);
493 fjz2 = _mm_add_ps(fjz2,tz);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 if (gmx_mm_any_lt(rsq20,rcutoff2))
504 /* REACTION-FIELD ELECTROSTATICS */
505 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
506 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
508 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velec = _mm_and_ps(velec,cutoff_mask);
512 velecsum = _mm_add_ps(velecsum,velec);
516 fscal = _mm_and_ps(fscal,cutoff_mask);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_ps(fscal,dx20);
520 ty = _mm_mul_ps(fscal,dy20);
521 tz = _mm_mul_ps(fscal,dz20);
523 /* Update vectorial force */
524 fix2 = _mm_add_ps(fix2,tx);
525 fiy2 = _mm_add_ps(fiy2,ty);
526 fiz2 = _mm_add_ps(fiz2,tz);
528 fjx0 = _mm_add_ps(fjx0,tx);
529 fjy0 = _mm_add_ps(fjy0,ty);
530 fjz0 = _mm_add_ps(fjz0,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 if (gmx_mm_any_lt(rsq21,rcutoff2))
541 /* REACTION-FIELD ELECTROSTATICS */
542 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
543 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
545 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
547 /* Update potential sum for this i atom from the interaction with this j atom. */
548 velec = _mm_and_ps(velec,cutoff_mask);
549 velecsum = _mm_add_ps(velecsum,velec);
553 fscal = _mm_and_ps(fscal,cutoff_mask);
555 /* Calculate temporary vectorial force */
556 tx = _mm_mul_ps(fscal,dx21);
557 ty = _mm_mul_ps(fscal,dy21);
558 tz = _mm_mul_ps(fscal,dz21);
560 /* Update vectorial force */
561 fix2 = _mm_add_ps(fix2,tx);
562 fiy2 = _mm_add_ps(fiy2,ty);
563 fiz2 = _mm_add_ps(fiz2,tz);
565 fjx1 = _mm_add_ps(fjx1,tx);
566 fjy1 = _mm_add_ps(fjy1,ty);
567 fjz1 = _mm_add_ps(fjz1,tz);
571 /**************************
572 * CALCULATE INTERACTIONS *
573 **************************/
575 if (gmx_mm_any_lt(rsq22,rcutoff2))
578 /* REACTION-FIELD ELECTROSTATICS */
579 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
580 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
582 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velec = _mm_and_ps(velec,cutoff_mask);
586 velecsum = _mm_add_ps(velecsum,velec);
590 fscal = _mm_and_ps(fscal,cutoff_mask);
592 /* Calculate temporary vectorial force */
593 tx = _mm_mul_ps(fscal,dx22);
594 ty = _mm_mul_ps(fscal,dy22);
595 tz = _mm_mul_ps(fscal,dz22);
597 /* Update vectorial force */
598 fix2 = _mm_add_ps(fix2,tx);
599 fiy2 = _mm_add_ps(fiy2,ty);
600 fiz2 = _mm_add_ps(fiz2,tz);
602 fjx2 = _mm_add_ps(fjx2,tx);
603 fjy2 = _mm_add_ps(fjy2,ty);
604 fjz2 = _mm_add_ps(fjz2,tz);
608 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
609 f+j_coord_offsetC,f+j_coord_offsetD,
610 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
612 /* Inner loop uses 324 flops */
618 /* Get j neighbor index, and coordinate index */
624 /* Sign of each element will be negative for non-real atoms.
625 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
626 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
628 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
629 jnrA = (jnrA>=0) ? jnrA : 0;
630 jnrB = (jnrB>=0) ? jnrB : 0;
631 jnrC = (jnrC>=0) ? jnrC : 0;
632 jnrD = (jnrD>=0) ? jnrD : 0;
634 j_coord_offsetA = DIM*jnrA;
635 j_coord_offsetB = DIM*jnrB;
636 j_coord_offsetC = DIM*jnrC;
637 j_coord_offsetD = DIM*jnrD;
639 /* load j atom coordinates */
640 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
641 x+j_coord_offsetC,x+j_coord_offsetD,
642 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
644 /* Calculate displacement vector */
645 dx00 = _mm_sub_ps(ix0,jx0);
646 dy00 = _mm_sub_ps(iy0,jy0);
647 dz00 = _mm_sub_ps(iz0,jz0);
648 dx01 = _mm_sub_ps(ix0,jx1);
649 dy01 = _mm_sub_ps(iy0,jy1);
650 dz01 = _mm_sub_ps(iz0,jz1);
651 dx02 = _mm_sub_ps(ix0,jx2);
652 dy02 = _mm_sub_ps(iy0,jy2);
653 dz02 = _mm_sub_ps(iz0,jz2);
654 dx10 = _mm_sub_ps(ix1,jx0);
655 dy10 = _mm_sub_ps(iy1,jy0);
656 dz10 = _mm_sub_ps(iz1,jz0);
657 dx11 = _mm_sub_ps(ix1,jx1);
658 dy11 = _mm_sub_ps(iy1,jy1);
659 dz11 = _mm_sub_ps(iz1,jz1);
660 dx12 = _mm_sub_ps(ix1,jx2);
661 dy12 = _mm_sub_ps(iy1,jy2);
662 dz12 = _mm_sub_ps(iz1,jz2);
663 dx20 = _mm_sub_ps(ix2,jx0);
664 dy20 = _mm_sub_ps(iy2,jy0);
665 dz20 = _mm_sub_ps(iz2,jz0);
666 dx21 = _mm_sub_ps(ix2,jx1);
667 dy21 = _mm_sub_ps(iy2,jy1);
668 dz21 = _mm_sub_ps(iz2,jz1);
669 dx22 = _mm_sub_ps(ix2,jx2);
670 dy22 = _mm_sub_ps(iy2,jy2);
671 dz22 = _mm_sub_ps(iz2,jz2);
673 /* Calculate squared distance and things based on it */
674 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
675 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
676 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
677 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
678 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
679 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
680 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
681 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
682 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
684 rinv00 = gmx_mm_invsqrt_ps(rsq00);
685 rinv01 = gmx_mm_invsqrt_ps(rsq01);
686 rinv02 = gmx_mm_invsqrt_ps(rsq02);
687 rinv10 = gmx_mm_invsqrt_ps(rsq10);
688 rinv11 = gmx_mm_invsqrt_ps(rsq11);
689 rinv12 = gmx_mm_invsqrt_ps(rsq12);
690 rinv20 = gmx_mm_invsqrt_ps(rsq20);
691 rinv21 = gmx_mm_invsqrt_ps(rsq21);
692 rinv22 = gmx_mm_invsqrt_ps(rsq22);
694 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
695 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
696 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
697 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
698 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
699 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
700 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
701 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
702 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
704 fjx0 = _mm_setzero_ps();
705 fjy0 = _mm_setzero_ps();
706 fjz0 = _mm_setzero_ps();
707 fjx1 = _mm_setzero_ps();
708 fjy1 = _mm_setzero_ps();
709 fjz1 = _mm_setzero_ps();
710 fjx2 = _mm_setzero_ps();
711 fjy2 = _mm_setzero_ps();
712 fjz2 = _mm_setzero_ps();
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 if (gmx_mm_any_lt(rsq00,rcutoff2))
721 /* REACTION-FIELD ELECTROSTATICS */
722 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
723 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
725 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
727 /* Update potential sum for this i atom from the interaction with this j atom. */
728 velec = _mm_and_ps(velec,cutoff_mask);
729 velec = _mm_andnot_ps(dummy_mask,velec);
730 velecsum = _mm_add_ps(velecsum,velec);
734 fscal = _mm_and_ps(fscal,cutoff_mask);
736 fscal = _mm_andnot_ps(dummy_mask,fscal);
738 /* Calculate temporary vectorial force */
739 tx = _mm_mul_ps(fscal,dx00);
740 ty = _mm_mul_ps(fscal,dy00);
741 tz = _mm_mul_ps(fscal,dz00);
743 /* Update vectorial force */
744 fix0 = _mm_add_ps(fix0,tx);
745 fiy0 = _mm_add_ps(fiy0,ty);
746 fiz0 = _mm_add_ps(fiz0,tz);
748 fjx0 = _mm_add_ps(fjx0,tx);
749 fjy0 = _mm_add_ps(fjy0,ty);
750 fjz0 = _mm_add_ps(fjz0,tz);
754 /**************************
755 * CALCULATE INTERACTIONS *
756 **************************/
758 if (gmx_mm_any_lt(rsq01,rcutoff2))
761 /* REACTION-FIELD ELECTROSTATICS */
762 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
763 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
765 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
767 /* Update potential sum for this i atom from the interaction with this j atom. */
768 velec = _mm_and_ps(velec,cutoff_mask);
769 velec = _mm_andnot_ps(dummy_mask,velec);
770 velecsum = _mm_add_ps(velecsum,velec);
774 fscal = _mm_and_ps(fscal,cutoff_mask);
776 fscal = _mm_andnot_ps(dummy_mask,fscal);
778 /* Calculate temporary vectorial force */
779 tx = _mm_mul_ps(fscal,dx01);
780 ty = _mm_mul_ps(fscal,dy01);
781 tz = _mm_mul_ps(fscal,dz01);
783 /* Update vectorial force */
784 fix0 = _mm_add_ps(fix0,tx);
785 fiy0 = _mm_add_ps(fiy0,ty);
786 fiz0 = _mm_add_ps(fiz0,tz);
788 fjx1 = _mm_add_ps(fjx1,tx);
789 fjy1 = _mm_add_ps(fjy1,ty);
790 fjz1 = _mm_add_ps(fjz1,tz);
794 /**************************
795 * CALCULATE INTERACTIONS *
796 **************************/
798 if (gmx_mm_any_lt(rsq02,rcutoff2))
801 /* REACTION-FIELD ELECTROSTATICS */
802 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
803 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
805 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm_and_ps(velec,cutoff_mask);
809 velec = _mm_andnot_ps(dummy_mask,velec);
810 velecsum = _mm_add_ps(velecsum,velec);
814 fscal = _mm_and_ps(fscal,cutoff_mask);
816 fscal = _mm_andnot_ps(dummy_mask,fscal);
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_ps(fscal,dx02);
820 ty = _mm_mul_ps(fscal,dy02);
821 tz = _mm_mul_ps(fscal,dz02);
823 /* Update vectorial force */
824 fix0 = _mm_add_ps(fix0,tx);
825 fiy0 = _mm_add_ps(fiy0,ty);
826 fiz0 = _mm_add_ps(fiz0,tz);
828 fjx2 = _mm_add_ps(fjx2,tx);
829 fjy2 = _mm_add_ps(fjy2,ty);
830 fjz2 = _mm_add_ps(fjz2,tz);
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 if (gmx_mm_any_lt(rsq10,rcutoff2))
841 /* REACTION-FIELD ELECTROSTATICS */
842 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
843 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
845 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
847 /* Update potential sum for this i atom from the interaction with this j atom. */
848 velec = _mm_and_ps(velec,cutoff_mask);
849 velec = _mm_andnot_ps(dummy_mask,velec);
850 velecsum = _mm_add_ps(velecsum,velec);
854 fscal = _mm_and_ps(fscal,cutoff_mask);
856 fscal = _mm_andnot_ps(dummy_mask,fscal);
858 /* Calculate temporary vectorial force */
859 tx = _mm_mul_ps(fscal,dx10);
860 ty = _mm_mul_ps(fscal,dy10);
861 tz = _mm_mul_ps(fscal,dz10);
863 /* Update vectorial force */
864 fix1 = _mm_add_ps(fix1,tx);
865 fiy1 = _mm_add_ps(fiy1,ty);
866 fiz1 = _mm_add_ps(fiz1,tz);
868 fjx0 = _mm_add_ps(fjx0,tx);
869 fjy0 = _mm_add_ps(fjy0,ty);
870 fjz0 = _mm_add_ps(fjz0,tz);
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 if (gmx_mm_any_lt(rsq11,rcutoff2))
881 /* REACTION-FIELD ELECTROSTATICS */
882 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
883 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
885 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm_and_ps(velec,cutoff_mask);
889 velec = _mm_andnot_ps(dummy_mask,velec);
890 velecsum = _mm_add_ps(velecsum,velec);
894 fscal = _mm_and_ps(fscal,cutoff_mask);
896 fscal = _mm_andnot_ps(dummy_mask,fscal);
898 /* Calculate temporary vectorial force */
899 tx = _mm_mul_ps(fscal,dx11);
900 ty = _mm_mul_ps(fscal,dy11);
901 tz = _mm_mul_ps(fscal,dz11);
903 /* Update vectorial force */
904 fix1 = _mm_add_ps(fix1,tx);
905 fiy1 = _mm_add_ps(fiy1,ty);
906 fiz1 = _mm_add_ps(fiz1,tz);
908 fjx1 = _mm_add_ps(fjx1,tx);
909 fjy1 = _mm_add_ps(fjy1,ty);
910 fjz1 = _mm_add_ps(fjz1,tz);
914 /**************************
915 * CALCULATE INTERACTIONS *
916 **************************/
918 if (gmx_mm_any_lt(rsq12,rcutoff2))
921 /* REACTION-FIELD ELECTROSTATICS */
922 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
923 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
925 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
927 /* Update potential sum for this i atom from the interaction with this j atom. */
928 velec = _mm_and_ps(velec,cutoff_mask);
929 velec = _mm_andnot_ps(dummy_mask,velec);
930 velecsum = _mm_add_ps(velecsum,velec);
934 fscal = _mm_and_ps(fscal,cutoff_mask);
936 fscal = _mm_andnot_ps(dummy_mask,fscal);
938 /* Calculate temporary vectorial force */
939 tx = _mm_mul_ps(fscal,dx12);
940 ty = _mm_mul_ps(fscal,dy12);
941 tz = _mm_mul_ps(fscal,dz12);
943 /* Update vectorial force */
944 fix1 = _mm_add_ps(fix1,tx);
945 fiy1 = _mm_add_ps(fiy1,ty);
946 fiz1 = _mm_add_ps(fiz1,tz);
948 fjx2 = _mm_add_ps(fjx2,tx);
949 fjy2 = _mm_add_ps(fjy2,ty);
950 fjz2 = _mm_add_ps(fjz2,tz);
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 if (gmx_mm_any_lt(rsq20,rcutoff2))
961 /* REACTION-FIELD ELECTROSTATICS */
962 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
963 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
965 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
967 /* Update potential sum for this i atom from the interaction with this j atom. */
968 velec = _mm_and_ps(velec,cutoff_mask);
969 velec = _mm_andnot_ps(dummy_mask,velec);
970 velecsum = _mm_add_ps(velecsum,velec);
974 fscal = _mm_and_ps(fscal,cutoff_mask);
976 fscal = _mm_andnot_ps(dummy_mask,fscal);
978 /* Calculate temporary vectorial force */
979 tx = _mm_mul_ps(fscal,dx20);
980 ty = _mm_mul_ps(fscal,dy20);
981 tz = _mm_mul_ps(fscal,dz20);
983 /* Update vectorial force */
984 fix2 = _mm_add_ps(fix2,tx);
985 fiy2 = _mm_add_ps(fiy2,ty);
986 fiz2 = _mm_add_ps(fiz2,tz);
988 fjx0 = _mm_add_ps(fjx0,tx);
989 fjy0 = _mm_add_ps(fjy0,ty);
990 fjz0 = _mm_add_ps(fjz0,tz);
994 /**************************
995 * CALCULATE INTERACTIONS *
996 **************************/
998 if (gmx_mm_any_lt(rsq21,rcutoff2))
1001 /* REACTION-FIELD ELECTROSTATICS */
1002 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1003 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1005 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1007 /* Update potential sum for this i atom from the interaction with this j atom. */
1008 velec = _mm_and_ps(velec,cutoff_mask);
1009 velec = _mm_andnot_ps(dummy_mask,velec);
1010 velecsum = _mm_add_ps(velecsum,velec);
1014 fscal = _mm_and_ps(fscal,cutoff_mask);
1016 fscal = _mm_andnot_ps(dummy_mask,fscal);
1018 /* Calculate temporary vectorial force */
1019 tx = _mm_mul_ps(fscal,dx21);
1020 ty = _mm_mul_ps(fscal,dy21);
1021 tz = _mm_mul_ps(fscal,dz21);
1023 /* Update vectorial force */
1024 fix2 = _mm_add_ps(fix2,tx);
1025 fiy2 = _mm_add_ps(fiy2,ty);
1026 fiz2 = _mm_add_ps(fiz2,tz);
1028 fjx1 = _mm_add_ps(fjx1,tx);
1029 fjy1 = _mm_add_ps(fjy1,ty);
1030 fjz1 = _mm_add_ps(fjz1,tz);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 if (gmx_mm_any_lt(rsq22,rcutoff2))
1041 /* REACTION-FIELD ELECTROSTATICS */
1042 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1043 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1045 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1047 /* Update potential sum for this i atom from the interaction with this j atom. */
1048 velec = _mm_and_ps(velec,cutoff_mask);
1049 velec = _mm_andnot_ps(dummy_mask,velec);
1050 velecsum = _mm_add_ps(velecsum,velec);
1054 fscal = _mm_and_ps(fscal,cutoff_mask);
1056 fscal = _mm_andnot_ps(dummy_mask,fscal);
1058 /* Calculate temporary vectorial force */
1059 tx = _mm_mul_ps(fscal,dx22);
1060 ty = _mm_mul_ps(fscal,dy22);
1061 tz = _mm_mul_ps(fscal,dz22);
1063 /* Update vectorial force */
1064 fix2 = _mm_add_ps(fix2,tx);
1065 fiy2 = _mm_add_ps(fiy2,ty);
1066 fiz2 = _mm_add_ps(fiz2,tz);
1068 fjx2 = _mm_add_ps(fjx2,tx);
1069 fjy2 = _mm_add_ps(fjy2,ty);
1070 fjz2 = _mm_add_ps(fjz2,tz);
1074 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1075 f+j_coord_offsetC,f+j_coord_offsetD,
1076 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1078 /* Inner loop uses 324 flops */
1081 /* End of innermost loop */
1083 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1084 f+i_coord_offset,fshift+i_shift_offset);
1087 /* Update potential energies */
1088 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1090 /* Increment number of inner iterations */
1091 inneriter += j_index_end - j_index_start;
1093 /* Outer loop uses 28 flops */
1096 /* Increment number of outer iterations */
1099 /* Update outer/inner flops */
1101 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*324);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single
 * Electrostatics interaction: ReactionField
 * VdW interaction:            None
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        Force
 */
1111 nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single
1112 (t_nblist * gmx_restrict nlist,
1113 rvec * gmx_restrict xx,
1114 rvec * gmx_restrict ff,
1115 t_forcerec * gmx_restrict fr,
1116 t_mdatoms * gmx_restrict mdatoms,
1117 nb_kernel_data_t * gmx_restrict kernel_data,
1118 t_nrnb * gmx_restrict nrnb)
1120 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1121 * just 0 for non-waters.
1122 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1123 * jnr indices corresponding to data put in the four positions in the SIMD register.
1125 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1126 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1127 int jnrA,jnrB,jnrC,jnrD;
1128 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1129 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1130 real shX,shY,shZ,rcutoff_scalar;
1131 real *shiftvec,*fshift,*x,*f;
1132 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1134 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1136 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1138 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1139 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1140 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1141 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1142 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1143 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1144 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1145 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1146 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1147 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1148 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1149 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1150 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1151 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1152 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1153 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1154 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1156 __m128 dummy_mask,cutoff_mask;
1157 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1158 __m128 one = _mm_set1_ps(1.0);
1159 __m128 two = _mm_set1_ps(2.0);
1165 jindex = nlist->jindex;
1167 shiftidx = nlist->shift;
1169 shiftvec = fr->shift_vec[0];
1170 fshift = fr->fshift[0];
1171 facel = _mm_set1_ps(fr->epsfac);
1172 charge = mdatoms->chargeA;
1173 krf = _mm_set1_ps(fr->ic->k_rf);
1174 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1175 crf = _mm_set1_ps(fr->ic->c_rf);
1177 /* Setup water-specific parameters */
1178 inr = nlist->iinr[0];
1179 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1180 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1181 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1183 jq0 = _mm_set1_ps(charge[inr+0]);
1184 jq1 = _mm_set1_ps(charge[inr+1]);
1185 jq2 = _mm_set1_ps(charge[inr+2]);
1186 qq00 = _mm_mul_ps(iq0,jq0);
1187 qq01 = _mm_mul_ps(iq0,jq1);
1188 qq02 = _mm_mul_ps(iq0,jq2);
1189 qq10 = _mm_mul_ps(iq1,jq0);
1190 qq11 = _mm_mul_ps(iq1,jq1);
1191 qq12 = _mm_mul_ps(iq1,jq2);
1192 qq20 = _mm_mul_ps(iq2,jq0);
1193 qq21 = _mm_mul_ps(iq2,jq1);
1194 qq22 = _mm_mul_ps(iq2,jq2);
1196 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1197 rcutoff_scalar = fr->rcoulomb;
1198 rcutoff = _mm_set1_ps(rcutoff_scalar);
1199 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1201 /* Avoid stupid compiler warnings */
1202 jnrA = jnrB = jnrC = jnrD = 0;
1203 j_coord_offsetA = 0;
1204 j_coord_offsetB = 0;
1205 j_coord_offsetC = 0;
1206 j_coord_offsetD = 0;
1211 /* Start outer loop over neighborlists */
1212 for(iidx=0; iidx<nri; iidx++)
1214 /* Load shift vector for this list */
1215 i_shift_offset = DIM*shiftidx[iidx];
1216 shX = shiftvec[i_shift_offset+XX];
1217 shY = shiftvec[i_shift_offset+YY];
1218 shZ = shiftvec[i_shift_offset+ZZ];
1220 /* Load limits for loop over neighbors */
1221 j_index_start = jindex[iidx];
1222 j_index_end = jindex[iidx+1];
1224 /* Get outer coordinate index */
1226 i_coord_offset = DIM*inr;
1228 /* Load i particle coords and add shift vector */
1229 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1230 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1231 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1232 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1233 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1234 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1235 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1236 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1237 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1239 fix0 = _mm_setzero_ps();
1240 fiy0 = _mm_setzero_ps();
1241 fiz0 = _mm_setzero_ps();
1242 fix1 = _mm_setzero_ps();
1243 fiy1 = _mm_setzero_ps();
1244 fiz1 = _mm_setzero_ps();
1245 fix2 = _mm_setzero_ps();
1246 fiy2 = _mm_setzero_ps();
1247 fiz2 = _mm_setzero_ps();
1249 /* Start inner kernel loop */
1250 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1253 /* Get j neighbor index, and coordinate index */
1255 jnrB = jjnr[jidx+1];
1256 jnrC = jjnr[jidx+2];
1257 jnrD = jjnr[jidx+3];
1259 j_coord_offsetA = DIM*jnrA;
1260 j_coord_offsetB = DIM*jnrB;
1261 j_coord_offsetC = DIM*jnrC;
1262 j_coord_offsetD = DIM*jnrD;
1264 /* load j atom coordinates */
1265 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1266 x+j_coord_offsetC,x+j_coord_offsetD,
1267 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1269 /* Calculate displacement vector */
1270 dx00 = _mm_sub_ps(ix0,jx0);
1271 dy00 = _mm_sub_ps(iy0,jy0);
1272 dz00 = _mm_sub_ps(iz0,jz0);
1273 dx01 = _mm_sub_ps(ix0,jx1);
1274 dy01 = _mm_sub_ps(iy0,jy1);
1275 dz01 = _mm_sub_ps(iz0,jz1);
1276 dx02 = _mm_sub_ps(ix0,jx2);
1277 dy02 = _mm_sub_ps(iy0,jy2);
1278 dz02 = _mm_sub_ps(iz0,jz2);
1279 dx10 = _mm_sub_ps(ix1,jx0);
1280 dy10 = _mm_sub_ps(iy1,jy0);
1281 dz10 = _mm_sub_ps(iz1,jz0);
1282 dx11 = _mm_sub_ps(ix1,jx1);
1283 dy11 = _mm_sub_ps(iy1,jy1);
1284 dz11 = _mm_sub_ps(iz1,jz1);
1285 dx12 = _mm_sub_ps(ix1,jx2);
1286 dy12 = _mm_sub_ps(iy1,jy2);
1287 dz12 = _mm_sub_ps(iz1,jz2);
1288 dx20 = _mm_sub_ps(ix2,jx0);
1289 dy20 = _mm_sub_ps(iy2,jy0);
1290 dz20 = _mm_sub_ps(iz2,jz0);
1291 dx21 = _mm_sub_ps(ix2,jx1);
1292 dy21 = _mm_sub_ps(iy2,jy1);
1293 dz21 = _mm_sub_ps(iz2,jz1);
1294 dx22 = _mm_sub_ps(ix2,jx2);
1295 dy22 = _mm_sub_ps(iy2,jy2);
1296 dz22 = _mm_sub_ps(iz2,jz2);
1298 /* Calculate squared distance and things based on it */
1299 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1300 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1301 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1302 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1303 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1304 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1305 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1306 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1307 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1309 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1310 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1311 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1312 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1313 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1314 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1315 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1316 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1317 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1319 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1320 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1321 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1322 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1323 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1324 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1325 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1326 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1327 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1329 fjx0 = _mm_setzero_ps();
1330 fjy0 = _mm_setzero_ps();
1331 fjz0 = _mm_setzero_ps();
1332 fjx1 = _mm_setzero_ps();
1333 fjy1 = _mm_setzero_ps();
1334 fjz1 = _mm_setzero_ps();
1335 fjx2 = _mm_setzero_ps();
1336 fjy2 = _mm_setzero_ps();
1337 fjz2 = _mm_setzero_ps();
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 if (gmx_mm_any_lt(rsq00,rcutoff2))
1346 /* REACTION-FIELD ELECTROSTATICS */
1347 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1349 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1353 fscal = _mm_and_ps(fscal,cutoff_mask);
1355 /* Calculate temporary vectorial force */
1356 tx = _mm_mul_ps(fscal,dx00);
1357 ty = _mm_mul_ps(fscal,dy00);
1358 tz = _mm_mul_ps(fscal,dz00);
1360 /* Update vectorial force */
1361 fix0 = _mm_add_ps(fix0,tx);
1362 fiy0 = _mm_add_ps(fiy0,ty);
1363 fiz0 = _mm_add_ps(fiz0,tz);
1365 fjx0 = _mm_add_ps(fjx0,tx);
1366 fjy0 = _mm_add_ps(fjy0,ty);
1367 fjz0 = _mm_add_ps(fjz0,tz);
1371 /**************************
1372 * CALCULATE INTERACTIONS *
1373 **************************/
1375 if (gmx_mm_any_lt(rsq01,rcutoff2))
1378 /* REACTION-FIELD ELECTROSTATICS */
1379 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1381 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1385 fscal = _mm_and_ps(fscal,cutoff_mask);
1387 /* Calculate temporary vectorial force */
1388 tx = _mm_mul_ps(fscal,dx01);
1389 ty = _mm_mul_ps(fscal,dy01);
1390 tz = _mm_mul_ps(fscal,dz01);
1392 /* Update vectorial force */
1393 fix0 = _mm_add_ps(fix0,tx);
1394 fiy0 = _mm_add_ps(fiy0,ty);
1395 fiz0 = _mm_add_ps(fiz0,tz);
1397 fjx1 = _mm_add_ps(fjx1,tx);
1398 fjy1 = _mm_add_ps(fjy1,ty);
1399 fjz1 = _mm_add_ps(fjz1,tz);
1403 /**************************
1404 * CALCULATE INTERACTIONS *
1405 **************************/
1407 if (gmx_mm_any_lt(rsq02,rcutoff2))
1410 /* REACTION-FIELD ELECTROSTATICS */
1411 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1413 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1417 fscal = _mm_and_ps(fscal,cutoff_mask);
1419 /* Calculate temporary vectorial force */
1420 tx = _mm_mul_ps(fscal,dx02);
1421 ty = _mm_mul_ps(fscal,dy02);
1422 tz = _mm_mul_ps(fscal,dz02);
1424 /* Update vectorial force */
1425 fix0 = _mm_add_ps(fix0,tx);
1426 fiy0 = _mm_add_ps(fiy0,ty);
1427 fiz0 = _mm_add_ps(fiz0,tz);
1429 fjx2 = _mm_add_ps(fjx2,tx);
1430 fjy2 = _mm_add_ps(fjy2,ty);
1431 fjz2 = _mm_add_ps(fjz2,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 if (gmx_mm_any_lt(rsq10,rcutoff2))
1442 /* REACTION-FIELD ELECTROSTATICS */
1443 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1445 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1449 fscal = _mm_and_ps(fscal,cutoff_mask);
1451 /* Calculate temporary vectorial force */
1452 tx = _mm_mul_ps(fscal,dx10);
1453 ty = _mm_mul_ps(fscal,dy10);
1454 tz = _mm_mul_ps(fscal,dz10);
1456 /* Update vectorial force */
1457 fix1 = _mm_add_ps(fix1,tx);
1458 fiy1 = _mm_add_ps(fiy1,ty);
1459 fiz1 = _mm_add_ps(fiz1,tz);
1461 fjx0 = _mm_add_ps(fjx0,tx);
1462 fjy0 = _mm_add_ps(fjy0,ty);
1463 fjz0 = _mm_add_ps(fjz0,tz);
1467 /**************************
1468 * CALCULATE INTERACTIONS *
1469 **************************/
1471 if (gmx_mm_any_lt(rsq11,rcutoff2))
1474 /* REACTION-FIELD ELECTROSTATICS */
1475 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1477 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1481 fscal = _mm_and_ps(fscal,cutoff_mask);
1483 /* Calculate temporary vectorial force */
1484 tx = _mm_mul_ps(fscal,dx11);
1485 ty = _mm_mul_ps(fscal,dy11);
1486 tz = _mm_mul_ps(fscal,dz11);
1488 /* Update vectorial force */
1489 fix1 = _mm_add_ps(fix1,tx);
1490 fiy1 = _mm_add_ps(fiy1,ty);
1491 fiz1 = _mm_add_ps(fiz1,tz);
1493 fjx1 = _mm_add_ps(fjx1,tx);
1494 fjy1 = _mm_add_ps(fjy1,ty);
1495 fjz1 = _mm_add_ps(fjz1,tz);
1499 /**************************
1500 * CALCULATE INTERACTIONS *
1501 **************************/
1503 if (gmx_mm_any_lt(rsq12,rcutoff2))
1506 /* REACTION-FIELD ELECTROSTATICS */
1507 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1509 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1513 fscal = _mm_and_ps(fscal,cutoff_mask);
1515 /* Calculate temporary vectorial force */
1516 tx = _mm_mul_ps(fscal,dx12);
1517 ty = _mm_mul_ps(fscal,dy12);
1518 tz = _mm_mul_ps(fscal,dz12);
1520 /* Update vectorial force */
1521 fix1 = _mm_add_ps(fix1,tx);
1522 fiy1 = _mm_add_ps(fiy1,ty);
1523 fiz1 = _mm_add_ps(fiz1,tz);
1525 fjx2 = _mm_add_ps(fjx2,tx);
1526 fjy2 = _mm_add_ps(fjy2,ty);
1527 fjz2 = _mm_add_ps(fjz2,tz);
1531 /**************************
1532 * CALCULATE INTERACTIONS *
1533 **************************/
1535 if (gmx_mm_any_lt(rsq20,rcutoff2))
1538 /* REACTION-FIELD ELECTROSTATICS */
1539 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1541 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1545 fscal = _mm_and_ps(fscal,cutoff_mask);
1547 /* Calculate temporary vectorial force */
1548 tx = _mm_mul_ps(fscal,dx20);
1549 ty = _mm_mul_ps(fscal,dy20);
1550 tz = _mm_mul_ps(fscal,dz20);
1552 /* Update vectorial force */
1553 fix2 = _mm_add_ps(fix2,tx);
1554 fiy2 = _mm_add_ps(fiy2,ty);
1555 fiz2 = _mm_add_ps(fiz2,tz);
1557 fjx0 = _mm_add_ps(fjx0,tx);
1558 fjy0 = _mm_add_ps(fjy0,ty);
1559 fjz0 = _mm_add_ps(fjz0,tz);
1563 /**************************
1564 * CALCULATE INTERACTIONS *
1565 **************************/
1567 if (gmx_mm_any_lt(rsq21,rcutoff2))
1570 /* REACTION-FIELD ELECTROSTATICS */
1571 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1573 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1577 fscal = _mm_and_ps(fscal,cutoff_mask);
1579 /* Calculate temporary vectorial force */
1580 tx = _mm_mul_ps(fscal,dx21);
1581 ty = _mm_mul_ps(fscal,dy21);
1582 tz = _mm_mul_ps(fscal,dz21);
1584 /* Update vectorial force */
1585 fix2 = _mm_add_ps(fix2,tx);
1586 fiy2 = _mm_add_ps(fiy2,ty);
1587 fiz2 = _mm_add_ps(fiz2,tz);
1589 fjx1 = _mm_add_ps(fjx1,tx);
1590 fjy1 = _mm_add_ps(fjy1,ty);
1591 fjz1 = _mm_add_ps(fjz1,tz);
1595 /**************************
1596 * CALCULATE INTERACTIONS *
1597 **************************/
1599 if (gmx_mm_any_lt(rsq22,rcutoff2))
1602 /* REACTION-FIELD ELECTROSTATICS */
1603 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1605 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1609 fscal = _mm_and_ps(fscal,cutoff_mask);
1611 /* Calculate temporary vectorial force */
1612 tx = _mm_mul_ps(fscal,dx22);
1613 ty = _mm_mul_ps(fscal,dy22);
1614 tz = _mm_mul_ps(fscal,dz22);
1616 /* Update vectorial force */
1617 fix2 = _mm_add_ps(fix2,tx);
1618 fiy2 = _mm_add_ps(fiy2,ty);
1619 fiz2 = _mm_add_ps(fiz2,tz);
1621 fjx2 = _mm_add_ps(fjx2,tx);
1622 fjy2 = _mm_add_ps(fjy2,ty);
1623 fjz2 = _mm_add_ps(fjz2,tz);
1627 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1628 f+j_coord_offsetC,f+j_coord_offsetD,
1629 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1631 /* Inner loop uses 270 flops */
1634 if(jidx<j_index_end)
1637 /* Get j neighbor index, and coordinate index */
1639 jnrB = jjnr[jidx+1];
1640 jnrC = jjnr[jidx+2];
1641 jnrD = jjnr[jidx+3];
1643 /* Sign of each element will be negative for non-real atoms.
1644 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1645 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1647 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1648 jnrA = (jnrA>=0) ? jnrA : 0;
1649 jnrB = (jnrB>=0) ? jnrB : 0;
1650 jnrC = (jnrC>=0) ? jnrC : 0;
1651 jnrD = (jnrD>=0) ? jnrD : 0;
1653 j_coord_offsetA = DIM*jnrA;
1654 j_coord_offsetB = DIM*jnrB;
1655 j_coord_offsetC = DIM*jnrC;
1656 j_coord_offsetD = DIM*jnrD;
1658 /* load j atom coordinates */
1659 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1660 x+j_coord_offsetC,x+j_coord_offsetD,
1661 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1663 /* Calculate displacement vector */
1664 dx00 = _mm_sub_ps(ix0,jx0);
1665 dy00 = _mm_sub_ps(iy0,jy0);
1666 dz00 = _mm_sub_ps(iz0,jz0);
1667 dx01 = _mm_sub_ps(ix0,jx1);
1668 dy01 = _mm_sub_ps(iy0,jy1);
1669 dz01 = _mm_sub_ps(iz0,jz1);
1670 dx02 = _mm_sub_ps(ix0,jx2);
1671 dy02 = _mm_sub_ps(iy0,jy2);
1672 dz02 = _mm_sub_ps(iz0,jz2);
1673 dx10 = _mm_sub_ps(ix1,jx0);
1674 dy10 = _mm_sub_ps(iy1,jy0);
1675 dz10 = _mm_sub_ps(iz1,jz0);
1676 dx11 = _mm_sub_ps(ix1,jx1);
1677 dy11 = _mm_sub_ps(iy1,jy1);
1678 dz11 = _mm_sub_ps(iz1,jz1);
1679 dx12 = _mm_sub_ps(ix1,jx2);
1680 dy12 = _mm_sub_ps(iy1,jy2);
1681 dz12 = _mm_sub_ps(iz1,jz2);
1682 dx20 = _mm_sub_ps(ix2,jx0);
1683 dy20 = _mm_sub_ps(iy2,jy0);
1684 dz20 = _mm_sub_ps(iz2,jz0);
1685 dx21 = _mm_sub_ps(ix2,jx1);
1686 dy21 = _mm_sub_ps(iy2,jy1);
1687 dz21 = _mm_sub_ps(iz2,jz1);
1688 dx22 = _mm_sub_ps(ix2,jx2);
1689 dy22 = _mm_sub_ps(iy2,jy2);
1690 dz22 = _mm_sub_ps(iz2,jz2);
1692 /* Calculate squared distance and things based on it */
1693 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1694 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1695 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1696 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1697 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1698 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1699 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1700 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1701 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1703 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1704 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1705 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1706 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1707 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1708 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1709 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1710 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1711 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1713 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1714 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1715 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1716 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1717 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1718 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1719 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1720 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1721 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1723 fjx0 = _mm_setzero_ps();
1724 fjy0 = _mm_setzero_ps();
1725 fjz0 = _mm_setzero_ps();
1726 fjx1 = _mm_setzero_ps();
1727 fjy1 = _mm_setzero_ps();
1728 fjz1 = _mm_setzero_ps();
1729 fjx2 = _mm_setzero_ps();
1730 fjy2 = _mm_setzero_ps();
1731 fjz2 = _mm_setzero_ps();
1733 /**************************
1734 * CALCULATE INTERACTIONS *
1735 **************************/
1737 if (gmx_mm_any_lt(rsq00,rcutoff2))
1740 /* REACTION-FIELD ELECTROSTATICS */
1741 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1743 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1747 fscal = _mm_and_ps(fscal,cutoff_mask);
1749 fscal = _mm_andnot_ps(dummy_mask,fscal);
1751 /* Calculate temporary vectorial force */
1752 tx = _mm_mul_ps(fscal,dx00);
1753 ty = _mm_mul_ps(fscal,dy00);
1754 tz = _mm_mul_ps(fscal,dz00);
1756 /* Update vectorial force */
1757 fix0 = _mm_add_ps(fix0,tx);
1758 fiy0 = _mm_add_ps(fiy0,ty);
1759 fiz0 = _mm_add_ps(fiz0,tz);
1761 fjx0 = _mm_add_ps(fjx0,tx);
1762 fjy0 = _mm_add_ps(fjy0,ty);
1763 fjz0 = _mm_add_ps(fjz0,tz);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 if (gmx_mm_any_lt(rsq01,rcutoff2))
1774 /* REACTION-FIELD ELECTROSTATICS */
1775 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1777 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1781 fscal = _mm_and_ps(fscal,cutoff_mask);
1783 fscal = _mm_andnot_ps(dummy_mask,fscal);
1785 /* Calculate temporary vectorial force */
1786 tx = _mm_mul_ps(fscal,dx01);
1787 ty = _mm_mul_ps(fscal,dy01);
1788 tz = _mm_mul_ps(fscal,dz01);
1790 /* Update vectorial force */
1791 fix0 = _mm_add_ps(fix0,tx);
1792 fiy0 = _mm_add_ps(fiy0,ty);
1793 fiz0 = _mm_add_ps(fiz0,tz);
1795 fjx1 = _mm_add_ps(fjx1,tx);
1796 fjy1 = _mm_add_ps(fjy1,ty);
1797 fjz1 = _mm_add_ps(fjz1,tz);
1801 /**************************
1802 * CALCULATE INTERACTIONS *
1803 **************************/
1805 if (gmx_mm_any_lt(rsq02,rcutoff2))
1808 /* REACTION-FIELD ELECTROSTATICS */
1809 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1811 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1815 fscal = _mm_and_ps(fscal,cutoff_mask);
1817 fscal = _mm_andnot_ps(dummy_mask,fscal);
1819 /* Calculate temporary vectorial force */
1820 tx = _mm_mul_ps(fscal,dx02);
1821 ty = _mm_mul_ps(fscal,dy02);
1822 tz = _mm_mul_ps(fscal,dz02);
1824 /* Update vectorial force */
1825 fix0 = _mm_add_ps(fix0,tx);
1826 fiy0 = _mm_add_ps(fiy0,ty);
1827 fiz0 = _mm_add_ps(fiz0,tz);
1829 fjx2 = _mm_add_ps(fjx2,tx);
1830 fjy2 = _mm_add_ps(fjy2,ty);
1831 fjz2 = _mm_add_ps(fjz2,tz);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 if (gmx_mm_any_lt(rsq10,rcutoff2))
1842 /* REACTION-FIELD ELECTROSTATICS */
1843 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1845 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1849 fscal = _mm_and_ps(fscal,cutoff_mask);
1851 fscal = _mm_andnot_ps(dummy_mask,fscal);
1853 /* Calculate temporary vectorial force */
1854 tx = _mm_mul_ps(fscal,dx10);
1855 ty = _mm_mul_ps(fscal,dy10);
1856 tz = _mm_mul_ps(fscal,dz10);
1858 /* Update vectorial force */
1859 fix1 = _mm_add_ps(fix1,tx);
1860 fiy1 = _mm_add_ps(fiy1,ty);
1861 fiz1 = _mm_add_ps(fiz1,tz);
1863 fjx0 = _mm_add_ps(fjx0,tx);
1864 fjy0 = _mm_add_ps(fjy0,ty);
1865 fjz0 = _mm_add_ps(fjz0,tz);
1869 /**************************
1870 * CALCULATE INTERACTIONS *
1871 **************************/
1873 if (gmx_mm_any_lt(rsq11,rcutoff2))
1876 /* REACTION-FIELD ELECTROSTATICS */
1877 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1879 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1883 fscal = _mm_and_ps(fscal,cutoff_mask);
1885 fscal = _mm_andnot_ps(dummy_mask,fscal);
1887 /* Calculate temporary vectorial force */
1888 tx = _mm_mul_ps(fscal,dx11);
1889 ty = _mm_mul_ps(fscal,dy11);
1890 tz = _mm_mul_ps(fscal,dz11);
1892 /* Update vectorial force */
1893 fix1 = _mm_add_ps(fix1,tx);
1894 fiy1 = _mm_add_ps(fiy1,ty);
1895 fiz1 = _mm_add_ps(fiz1,tz);
1897 fjx1 = _mm_add_ps(fjx1,tx);
1898 fjy1 = _mm_add_ps(fjy1,ty);
1899 fjz1 = _mm_add_ps(fjz1,tz);
1903 /**************************
1904 * CALCULATE INTERACTIONS *
1905 **************************/
1907 if (gmx_mm_any_lt(rsq12,rcutoff2))
1910 /* REACTION-FIELD ELECTROSTATICS */
1911 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1913 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1917 fscal = _mm_and_ps(fscal,cutoff_mask);
1919 fscal = _mm_andnot_ps(dummy_mask,fscal);
1921 /* Calculate temporary vectorial force */
1922 tx = _mm_mul_ps(fscal,dx12);
1923 ty = _mm_mul_ps(fscal,dy12);
1924 tz = _mm_mul_ps(fscal,dz12);
1926 /* Update vectorial force */
1927 fix1 = _mm_add_ps(fix1,tx);
1928 fiy1 = _mm_add_ps(fiy1,ty);
1929 fiz1 = _mm_add_ps(fiz1,tz);
1931 fjx2 = _mm_add_ps(fjx2,tx);
1932 fjy2 = _mm_add_ps(fjy2,ty);
1933 fjz2 = _mm_add_ps(fjz2,tz);
1937 /**************************
1938 * CALCULATE INTERACTIONS *
1939 **************************/
1941 if (gmx_mm_any_lt(rsq20,rcutoff2))
1944 /* REACTION-FIELD ELECTROSTATICS */
1945 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1947 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1951 fscal = _mm_and_ps(fscal,cutoff_mask);
1953 fscal = _mm_andnot_ps(dummy_mask,fscal);
1955 /* Calculate temporary vectorial force */
1956 tx = _mm_mul_ps(fscal,dx20);
1957 ty = _mm_mul_ps(fscal,dy20);
1958 tz = _mm_mul_ps(fscal,dz20);
1960 /* Update vectorial force */
1961 fix2 = _mm_add_ps(fix2,tx);
1962 fiy2 = _mm_add_ps(fiy2,ty);
1963 fiz2 = _mm_add_ps(fiz2,tz);
1965 fjx0 = _mm_add_ps(fjx0,tx);
1966 fjy0 = _mm_add_ps(fjy0,ty);
1967 fjz0 = _mm_add_ps(fjz0,tz);
1971 /**************************
1972 * CALCULATE INTERACTIONS *
1973 **************************/
1975 if (gmx_mm_any_lt(rsq21,rcutoff2))
1978 /* REACTION-FIELD ELECTROSTATICS */
1979 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1981 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1985 fscal = _mm_and_ps(fscal,cutoff_mask);
1987 fscal = _mm_andnot_ps(dummy_mask,fscal);
1989 /* Calculate temporary vectorial force */
1990 tx = _mm_mul_ps(fscal,dx21);
1991 ty = _mm_mul_ps(fscal,dy21);
1992 tz = _mm_mul_ps(fscal,dz21);
1994 /* Update vectorial force */
1995 fix2 = _mm_add_ps(fix2,tx);
1996 fiy2 = _mm_add_ps(fiy2,ty);
1997 fiz2 = _mm_add_ps(fiz2,tz);
1999 fjx1 = _mm_add_ps(fjx1,tx);
2000 fjy1 = _mm_add_ps(fjy1,ty);
2001 fjz1 = _mm_add_ps(fjz1,tz);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 if (gmx_mm_any_lt(rsq22,rcutoff2))
2012 /* REACTION-FIELD ELECTROSTATICS */
2013 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2015 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2019 fscal = _mm_and_ps(fscal,cutoff_mask);
2021 fscal = _mm_andnot_ps(dummy_mask,fscal);
2023 /* Calculate temporary vectorial force */
2024 tx = _mm_mul_ps(fscal,dx22);
2025 ty = _mm_mul_ps(fscal,dy22);
2026 tz = _mm_mul_ps(fscal,dz22);
2028 /* Update vectorial force */
2029 fix2 = _mm_add_ps(fix2,tx);
2030 fiy2 = _mm_add_ps(fiy2,ty);
2031 fiz2 = _mm_add_ps(fiz2,tz);
2033 fjx2 = _mm_add_ps(fjx2,tx);
2034 fjy2 = _mm_add_ps(fjy2,ty);
2035 fjz2 = _mm_add_ps(fjz2,tz);
2039 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2040 f+j_coord_offsetC,f+j_coord_offsetD,
2041 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2043 /* Inner loop uses 270 flops */
2046 /* End of innermost loop */
2048 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2049 f+i_coord_offset,fshift+i_shift_offset);
2051 /* Increment number of inner iterations */
2052 inneriter += j_index_end - j_index_start;
2054 /* Outer loop uses 27 flops */
2057 /* Increment number of outer iterations */
2060 /* Update outer/inner flops */
2062 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*270);