2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
/*
 * nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single
 *
 * SSE2 single-precision nonbonded kernel. According to the generator banner
 * that precedes this definition it handles reaction-field electrostatics
 * with no VdW term for Water4-Water4 pairs and computes both potential and
 * force.
 *
 * Arguments, as far as this listing shows them being used:
 *   nlist       - neighbour list; jindex, shift and iinr[0] are read.
 *   xx, ff      - coordinate and force arrays; no direct use of xx or ff is
 *                 visible here (the body works through the locals x and f).
 *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf and
 *                 rcoulomb.
 *   mdatoms     - supplies chargeA.
 *   kernel_data - energygrp_elec receives the accumulated potential.
 *   nrnb        - handed to inc_nrnb() together with the flop estimate.
 *
 * NOTE(review): the integer that starts every line looks like a listing line
 * number, and those numbers skip values (for example 51 is followed by 53).
 * The return type, the body braces and assignments to x, f, nri, jjnr and gid
 * are not visible, and no declaration of charge is visible either -
 * presumably they sit on the skipped lines; confirm against a pristine copy.
 *
 * NOTE(review): the "Suffixes" comment below has no visible terminator, so as
 * listed the declarations and the first setup assignments that follow it are
 * lexically part of that comment.
 */
44 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
69 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
71 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
73 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
74 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
75 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
76 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
77 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
78 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
79 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
80 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
81 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
82 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
83 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
84 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
85 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
86 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
89 __m128 dummy_mask,cutoff_mask;
90 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
91 __m128 one = _mm_set1_ps(1.0);
92 __m128 two = _mm_set1_ps(2.0);
98 jindex = nlist->jindex;
100 shiftidx = nlist->shift;
102 shiftvec = fr->shift_vec[0];
103 fshift = fr->fshift[0];
104 facel = _mm_set1_ps(fr->epsfac);
105 charge = mdatoms->chargeA;
106 krf = _mm_set1_ps(fr->ic->k_rf);
107 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
108 crf = _mm_set1_ps(fr->ic->c_rf);
110 /* Setup water-specific parameters */
/* Charges are read once, for the molecule indexed by the first i entry
 * (nlist->iinr[0]). Sites inr+1..inr+3 are used for both the i and the j
 * side; only the i-side values are scaled by facel, so each qqXY product
 * below contains facel exactly once. */
111 inr = nlist->iinr[0];
112 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
113 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
114 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
116 jq1 = _mm_set1_ps(charge[inr+1]);
117 jq2 = _mm_set1_ps(charge[inr+2]);
118 jq3 = _mm_set1_ps(charge[inr+3]);
/* Charge products for all nine site pairs; first digit is the i site,
 * second digit is the j site. */
119 qq11 = _mm_mul_ps(iq1,jq1);
120 qq12 = _mm_mul_ps(iq1,jq2);
121 qq13 = _mm_mul_ps(iq1,jq3);
122 qq21 = _mm_mul_ps(iq2,jq1);
123 qq22 = _mm_mul_ps(iq2,jq2);
124 qq23 = _mm_mul_ps(iq2,jq3);
125 qq31 = _mm_mul_ps(iq3,jq1);
126 qq32 = _mm_mul_ps(iq3,jq2);
127 qq33 = _mm_mul_ps(iq3,jq3);
129 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
130 rcutoff_scalar = fr->rcoulomb;
131 rcutoff = _mm_set1_ps(rcutoff_scalar);
/* Squared cutoff: the interaction sections compare it against rsq. */
132 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
134 /* Give the j indices a defined value to avoid compiler warnings */
135 jnrA = jnrB = jnrC = jnrD = 0;
/* Per-entry set-up of the outer loop, which runs iidx from 0 up to nri.
 * For each entry it reads the shift vector selected by shiftidx[iidx] and
 * the j range [jindex[iidx], jindex[iidx+1]), broadcasts the shifted
 * coordinates of i-sites 1..3 into every SIMD lane, and zeroes the i-force
 * and potential accumulators.
 * NOTE(review): no per-iteration assignment to inr is visible (listing
 * number 158 is skipped), so as listed inr keeps the value taken from
 * nlist->iinr[0]; presumably "inr = iinr[iidx]" belongs before the
 * i_coord_offset line - confirm against a pristine copy. */
144 /* Start outer loop over neighborlists */
145 for(iidx=0; iidx<nri; iidx++)
147 /* Load shift vector for this list */
148 i_shift_offset = DIM*shiftidx[iidx];
149 shX = shiftvec[i_shift_offset+XX];
150 shY = shiftvec[i_shift_offset+YY];
151 shZ = shiftvec[i_shift_offset+ZZ];
153 /* Load limits for loop over neighbors */
154 j_index_start = jindex[iidx];
155 j_index_end = jindex[iidx+1];
157 /* Get outer coordinate index */
159 i_coord_offset = DIM*inr;
161 /* Load i particle coords and add shift vector */
162 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
163 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
164 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
165 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
166 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
167 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
168 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
169 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
170 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
/* Clear the force accumulators of the three i sites */
172 fix1 = _mm_setzero_ps();
173 fiy1 = _mm_setzero_ps();
174 fiz1 = _mm_setzero_ps();
175 fix2 = _mm_setzero_ps();
176 fiy2 = _mm_setzero_ps();
177 fiz2 = _mm_setzero_ps();
178 fix3 = _mm_setzero_ps();
179 fiy3 = _mm_setzero_ps();
180 fiz3 = _mm_setzero_ps();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_ps();
/* Head of the unmasked inner loop. jidx advances four entries at a time and
 * the loop continues only while jidx < j_index_end and the fourth entry of
 * the current quad, jjnr[jidx+3], is non-negative.
 * For each quad it forms the four j coordinate offsets, loads the
 * coordinates of j-sites 1..3 (pointers start DIM reals past DIM*jnr),
 * builds the nine i-j displacement vectors, their squared lengths, inverse
 * lengths and inverse squared lengths, and zeroes the j-force accumulators.
 * NOTE(review): no assignment of jnrA..jnrD from jjnr is visible (listing
 * numbers 190-193 are skipped); presumably they are loaded from
 * jjnr[jidx..jidx+3] - confirm. The helpers gmx_mm_load_3rvec_4ptr_swizzle_ps,
 * gmx_mm_calc_rsq_ps and gmx_mm_invsqrt_ps are defined elsewhere. */
185 /* Start inner kernel loop */
186 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
189 /* Get j neighbor index, and coordinate index */
195 j_coord_offsetA = DIM*jnrA;
196 j_coord_offsetB = DIM*jnrB;
197 j_coord_offsetC = DIM*jnrC;
198 j_coord_offsetD = DIM*jnrD;
200 /* load j atom coordinates */
201 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
202 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
203 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
205 /* Calculate displacement vector */
206 dx11 = _mm_sub_ps(ix1,jx1);
207 dy11 = _mm_sub_ps(iy1,jy1);
208 dz11 = _mm_sub_ps(iz1,jz1);
209 dx12 = _mm_sub_ps(ix1,jx2);
210 dy12 = _mm_sub_ps(iy1,jy2);
211 dz12 = _mm_sub_ps(iz1,jz2);
212 dx13 = _mm_sub_ps(ix1,jx3);
213 dy13 = _mm_sub_ps(iy1,jy3);
214 dz13 = _mm_sub_ps(iz1,jz3);
215 dx21 = _mm_sub_ps(ix2,jx1);
216 dy21 = _mm_sub_ps(iy2,jy1);
217 dz21 = _mm_sub_ps(iz2,jz1);
218 dx22 = _mm_sub_ps(ix2,jx2);
219 dy22 = _mm_sub_ps(iy2,jy2);
220 dz22 = _mm_sub_ps(iz2,jz2);
221 dx23 = _mm_sub_ps(ix2,jx3);
222 dy23 = _mm_sub_ps(iy2,jy3);
223 dz23 = _mm_sub_ps(iz2,jz3);
224 dx31 = _mm_sub_ps(ix3,jx1);
225 dy31 = _mm_sub_ps(iy3,jy1);
226 dz31 = _mm_sub_ps(iz3,jz1);
227 dx32 = _mm_sub_ps(ix3,jx2);
228 dy32 = _mm_sub_ps(iy3,jy2);
229 dz32 = _mm_sub_ps(iz3,jz2);
230 dx33 = _mm_sub_ps(ix3,jx3);
231 dy33 = _mm_sub_ps(iy3,jy3);
232 dz33 = _mm_sub_ps(iz3,jz3);
234 /* Calculate squared distance and things based on it */
235 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
236 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
237 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
238 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
239 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
240 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
241 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
242 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
243 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
245 rinv11 = gmx_mm_invsqrt_ps(rsq11);
246 rinv12 = gmx_mm_invsqrt_ps(rsq12);
247 rinv13 = gmx_mm_invsqrt_ps(rsq13);
248 rinv21 = gmx_mm_invsqrt_ps(rsq21);
249 rinv22 = gmx_mm_invsqrt_ps(rsq22);
250 rinv23 = gmx_mm_invsqrt_ps(rsq23);
251 rinv31 = gmx_mm_invsqrt_ps(rsq31);
252 rinv32 = gmx_mm_invsqrt_ps(rsq32);
253 rinv33 = gmx_mm_invsqrt_ps(rsq33);
255 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
256 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
257 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
258 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
259 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
260 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
261 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
262 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
263 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
/* Start this quad with empty j-site force accumulators */
265 fjx1 = _mm_setzero_ps();
266 fjy1 = _mm_setzero_ps();
267 fjz1 = _mm_setzero_ps();
268 fjx2 = _mm_setzero_ps();
269 fjy2 = _mm_setzero_ps();
270 fjz2 = _mm_setzero_ps();
271 fjx3 = _mm_setzero_ps();
272 fjy3 = _mm_setzero_ps();
273 fjz3 = _mm_setzero_ps();
/* Unmasked quad: interactions of i-site 1 with j-sites 1, 2 and 3.
 * Each of the three sections follows the same pattern:
 *   velec = qq*(rinv + krf*rsq - crf)
 *   felec = qq*(rinv*rinvsq - krf2)
 *   cutoff_mask has all bits set in lanes where rsq < rcutoff2
 * velec and fscal are ANDed with cutoff_mask, velec is added to velecsum, and
 * fscal times the displacement is added to both the i-site and the j-site
 * force accumulators.
 * NOTE(review): fscal is masked and used, but no assignment to fscal is
 * visible in this listing - presumably "fscal = felec" sits on a skipped
 * line; confirm. The braces of each if are likewise not visible.
 * gmx_mm_any_lt is defined elsewhere; presumably it tests whether any lane
 * satisfies rsq < rcutoff2. */
275 /**************************
276 * CALCULATE INTERACTIONS *
277 **************************/
279 if (gmx_mm_any_lt(rsq11,rcutoff2))
282 /* REACTION-FIELD ELECTROSTATICS */
283 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
284 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
286 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
288 /* Update potential sum for this i atom from the interaction with this j atom. */
289 velec = _mm_and_ps(velec,cutoff_mask);
290 velecsum = _mm_add_ps(velecsum,velec);
294 fscal = _mm_and_ps(fscal,cutoff_mask);
296 /* Calculate temporary vectorial force */
297 tx = _mm_mul_ps(fscal,dx11);
298 ty = _mm_mul_ps(fscal,dy11);
299 tz = _mm_mul_ps(fscal,dz11);
301 /* Update vectorial force */
302 fix1 = _mm_add_ps(fix1,tx);
303 fiy1 = _mm_add_ps(fiy1,ty);
304 fiz1 = _mm_add_ps(fiz1,tz);
306 fjx1 = _mm_add_ps(fjx1,tx);
307 fjy1 = _mm_add_ps(fjy1,ty);
308 fjz1 = _mm_add_ps(fjz1,tz);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm_any_lt(rsq12,rcutoff2))
319 /* REACTION-FIELD ELECTROSTATICS */
320 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
321 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
323 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
325 /* Update potential sum for this i atom from the interaction with this j atom. */
326 velec = _mm_and_ps(velec,cutoff_mask);
327 velecsum = _mm_add_ps(velecsum,velec);
331 fscal = _mm_and_ps(fscal,cutoff_mask);
333 /* Calculate temporary vectorial force */
334 tx = _mm_mul_ps(fscal,dx12);
335 ty = _mm_mul_ps(fscal,dy12);
336 tz = _mm_mul_ps(fscal,dz12);
338 /* Update vectorial force */
339 fix1 = _mm_add_ps(fix1,tx);
340 fiy1 = _mm_add_ps(fiy1,ty);
341 fiz1 = _mm_add_ps(fiz1,tz);
343 fjx2 = _mm_add_ps(fjx2,tx);
344 fjy2 = _mm_add_ps(fjy2,ty);
345 fjz2 = _mm_add_ps(fjz2,tz);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 if (gmx_mm_any_lt(rsq13,rcutoff2))
356 /* REACTION-FIELD ELECTROSTATICS */
357 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
358 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
360 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velec = _mm_and_ps(velec,cutoff_mask);
364 velecsum = _mm_add_ps(velecsum,velec);
368 fscal = _mm_and_ps(fscal,cutoff_mask);
370 /* Calculate temporary vectorial force */
371 tx = _mm_mul_ps(fscal,dx13);
372 ty = _mm_mul_ps(fscal,dy13);
373 tz = _mm_mul_ps(fscal,dz13);
375 /* Update vectorial force */
376 fix1 = _mm_add_ps(fix1,tx);
377 fiy1 = _mm_add_ps(fiy1,ty);
378 fiz1 = _mm_add_ps(fiz1,tz);
380 fjx3 = _mm_add_ps(fjx3,tx);
381 fjy3 = _mm_add_ps(fjy3,ty);
382 fjz3 = _mm_add_ps(fjz3,tz);
/* Unmasked quad: interactions of i-site 2 with j-sites 1, 2 and 3.
 * Per section: velec = qq*(rinv + krf*rsq - crf) and
 * felec = qq*(rinv*rinvsq - krf2); both the potential and fscal are ANDed
 * with cutoff_mask (lanes where rsq < rcutoff2), then fscal times the
 * displacement is accumulated into fix2/fiy2/fiz2 and into the matching
 * j-site accumulator.
 * NOTE(review): no assignment to fscal and no braces for the if statements
 * are visible in this listing - presumably on skipped lines; confirm. */
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm_any_lt(rsq21,rcutoff2))
393 /* REACTION-FIELD ELECTROSTATICS */
394 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
395 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
397 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velec = _mm_and_ps(velec,cutoff_mask);
401 velecsum = _mm_add_ps(velecsum,velec);
405 fscal = _mm_and_ps(fscal,cutoff_mask);
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_ps(fscal,dx21);
409 ty = _mm_mul_ps(fscal,dy21);
410 tz = _mm_mul_ps(fscal,dz21);
412 /* Update vectorial force */
413 fix2 = _mm_add_ps(fix2,tx);
414 fiy2 = _mm_add_ps(fiy2,ty);
415 fiz2 = _mm_add_ps(fiz2,tz);
417 fjx1 = _mm_add_ps(fjx1,tx);
418 fjy1 = _mm_add_ps(fjy1,ty);
419 fjz1 = _mm_add_ps(fjz1,tz);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 if (gmx_mm_any_lt(rsq22,rcutoff2))
430 /* REACTION-FIELD ELECTROSTATICS */
431 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
432 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
434 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velec = _mm_and_ps(velec,cutoff_mask);
438 velecsum = _mm_add_ps(velecsum,velec);
442 fscal = _mm_and_ps(fscal,cutoff_mask);
444 /* Calculate temporary vectorial force */
445 tx = _mm_mul_ps(fscal,dx22);
446 ty = _mm_mul_ps(fscal,dy22);
447 tz = _mm_mul_ps(fscal,dz22);
449 /* Update vectorial force */
450 fix2 = _mm_add_ps(fix2,tx);
451 fiy2 = _mm_add_ps(fiy2,ty);
452 fiz2 = _mm_add_ps(fiz2,tz);
454 fjx2 = _mm_add_ps(fjx2,tx);
455 fjy2 = _mm_add_ps(fjy2,ty);
456 fjz2 = _mm_add_ps(fjz2,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 if (gmx_mm_any_lt(rsq23,rcutoff2))
467 /* REACTION-FIELD ELECTROSTATICS */
468 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
469 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
471 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velec = _mm_and_ps(velec,cutoff_mask);
475 velecsum = _mm_add_ps(velecsum,velec);
479 fscal = _mm_and_ps(fscal,cutoff_mask);
481 /* Calculate temporary vectorial force */
482 tx = _mm_mul_ps(fscal,dx23);
483 ty = _mm_mul_ps(fscal,dy23);
484 tz = _mm_mul_ps(fscal,dz23);
486 /* Update vectorial force */
487 fix2 = _mm_add_ps(fix2,tx);
488 fiy2 = _mm_add_ps(fiy2,ty);
489 fiz2 = _mm_add_ps(fiz2,tz);
491 fjx3 = _mm_add_ps(fjx3,tx);
492 fjy3 = _mm_add_ps(fjy3,ty);
493 fjz3 = _mm_add_ps(fjz3,tz);
/* Unmasked quad: interactions of i-site 3 with j-sites 1, 2 and 3, followed
 * by the write-back of the j-site forces.
 * Per section: velec = qq*(rinv + krf*rsq - crf) and
 * felec = qq*(rinv*rinvsq - krf2); both the potential and fscal are ANDed
 * with cutoff_mask (lanes where rsq < rcutoff2), then fscal times the
 * displacement is accumulated into fix3/fiy3/fiz3 and into the matching
 * j-site accumulator.
 * NOTE(review): no assignment to fscal and no braces for the if statements
 * are visible in this listing - presumably on skipped lines; confirm. */
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 if (gmx_mm_any_lt(rsq31,rcutoff2))
504 /* REACTION-FIELD ELECTROSTATICS */
505 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
506 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
508 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velec = _mm_and_ps(velec,cutoff_mask);
512 velecsum = _mm_add_ps(velecsum,velec);
516 fscal = _mm_and_ps(fscal,cutoff_mask);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_ps(fscal,dx31);
520 ty = _mm_mul_ps(fscal,dy31);
521 tz = _mm_mul_ps(fscal,dz31);
523 /* Update vectorial force */
524 fix3 = _mm_add_ps(fix3,tx);
525 fiy3 = _mm_add_ps(fiy3,ty);
526 fiz3 = _mm_add_ps(fiz3,tz);
528 fjx1 = _mm_add_ps(fjx1,tx);
529 fjy1 = _mm_add_ps(fjy1,ty);
530 fjz1 = _mm_add_ps(fjz1,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 if (gmx_mm_any_lt(rsq32,rcutoff2))
541 /* REACTION-FIELD ELECTROSTATICS */
542 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
543 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
545 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
547 /* Update potential sum for this i atom from the interaction with this j atom. */
548 velec = _mm_and_ps(velec,cutoff_mask);
549 velecsum = _mm_add_ps(velecsum,velec);
553 fscal = _mm_and_ps(fscal,cutoff_mask);
555 /* Calculate temporary vectorial force */
556 tx = _mm_mul_ps(fscal,dx32);
557 ty = _mm_mul_ps(fscal,dy32);
558 tz = _mm_mul_ps(fscal,dz32);
560 /* Update vectorial force */
561 fix3 = _mm_add_ps(fix3,tx);
562 fiy3 = _mm_add_ps(fiy3,ty);
563 fiz3 = _mm_add_ps(fiz3,tz);
565 fjx2 = _mm_add_ps(fjx2,tx);
566 fjy2 = _mm_add_ps(fjy2,ty);
567 fjz2 = _mm_add_ps(fjz2,tz);
571 /**************************
572 * CALCULATE INTERACTIONS *
573 **************************/
575 if (gmx_mm_any_lt(rsq33,rcutoff2))
578 /* REACTION-FIELD ELECTROSTATICS */
579 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
580 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
582 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velec = _mm_and_ps(velec,cutoff_mask);
586 velecsum = _mm_add_ps(velecsum,velec);
590 fscal = _mm_and_ps(fscal,cutoff_mask);
592 /* Calculate temporary vectorial force */
593 tx = _mm_mul_ps(fscal,dx33);
594 ty = _mm_mul_ps(fscal,dy33);
595 tz = _mm_mul_ps(fscal,dz33);
597 /* Update vectorial force */
598 fix3 = _mm_add_ps(fix3,tx);
599 fiy3 = _mm_add_ps(fiy3,ty);
600 fiz3 = _mm_add_ps(fiz3,tz);
602 fjx3 = _mm_add_ps(fjx3,tx);
603 fjy3 = _mm_add_ps(fjy3,ty);
604 fjz3 = _mm_add_ps(fjz3,tz);
/* Hand the accumulated j-site forces to the write-back helper, using the
 * same DIM-offset pointers into f that were used for the coordinate load.
 * The helper is defined elsewhere; going by its name it presumably
 * subtracts the values from f - confirm. */
608 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
609 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
610 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
612 /* Inner loop uses 324 flops */
/* Masked variant of the inner-loop set-up. dummy_mask gets all bits set in
 * every lane whose jjnr[jidx..jidx+3] entry is negative (signed integer
 * compare against zero), and negative jnr values are replaced by 0 before
 * the coordinate offsets are formed. The remaining steps mirror the
 * unmasked set-up: load j-sites 1..3, build displacements, squared
 * distances, inverse distances, and clear the j-force accumulators.
 * NOTE(review): the condition or loop header that selects this path and the
 * loads of jnrA..jnrD from jjnr are not visible in this listing - presumably
 * on skipped lines; confirm.
 * NOTE(review): the "Sign of each element" comment has no visible terminator
 * of its own, so as listed everything up to the end of the "load j atom
 * coordinates" comment, including the dummy_mask, jnr and j_coord_offset
 * statements, is lexically comment text. */
618 /* Get j neighbor index, and coordinate index */
624 /* Sign of each element will be negative for non-real atoms.
625 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
626 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
628 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
629 jnrA = (jnrA>=0) ? jnrA : 0;
630 jnrB = (jnrB>=0) ? jnrB : 0;
631 jnrC = (jnrC>=0) ? jnrC : 0;
632 jnrD = (jnrD>=0) ? jnrD : 0;
634 j_coord_offsetA = DIM*jnrA;
635 j_coord_offsetB = DIM*jnrB;
636 j_coord_offsetC = DIM*jnrC;
637 j_coord_offsetD = DIM*jnrD;
639 /* load j atom coordinates */
640 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
641 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
642 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
644 /* Calculate displacement vector */
645 dx11 = _mm_sub_ps(ix1,jx1);
646 dy11 = _mm_sub_ps(iy1,jy1);
647 dz11 = _mm_sub_ps(iz1,jz1);
648 dx12 = _mm_sub_ps(ix1,jx2);
649 dy12 = _mm_sub_ps(iy1,jy2);
650 dz12 = _mm_sub_ps(iz1,jz2);
651 dx13 = _mm_sub_ps(ix1,jx3);
652 dy13 = _mm_sub_ps(iy1,jy3);
653 dz13 = _mm_sub_ps(iz1,jz3);
654 dx21 = _mm_sub_ps(ix2,jx1);
655 dy21 = _mm_sub_ps(iy2,jy1);
656 dz21 = _mm_sub_ps(iz2,jz1);
657 dx22 = _mm_sub_ps(ix2,jx2);
658 dy22 = _mm_sub_ps(iy2,jy2);
659 dz22 = _mm_sub_ps(iz2,jz2);
660 dx23 = _mm_sub_ps(ix2,jx3);
661 dy23 = _mm_sub_ps(iy2,jy3);
662 dz23 = _mm_sub_ps(iz2,jz3);
663 dx31 = _mm_sub_ps(ix3,jx1);
664 dy31 = _mm_sub_ps(iy3,jy1);
665 dz31 = _mm_sub_ps(iz3,jz1);
666 dx32 = _mm_sub_ps(ix3,jx2);
667 dy32 = _mm_sub_ps(iy3,jy2);
668 dz32 = _mm_sub_ps(iz3,jz2);
669 dx33 = _mm_sub_ps(ix3,jx3);
670 dy33 = _mm_sub_ps(iy3,jy3);
671 dz33 = _mm_sub_ps(iz3,jz3);
673 /* Calculate squared distance and things based on it */
674 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
675 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
676 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
677 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
678 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
679 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
680 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
681 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
682 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
684 rinv11 = gmx_mm_invsqrt_ps(rsq11);
685 rinv12 = gmx_mm_invsqrt_ps(rsq12);
686 rinv13 = gmx_mm_invsqrt_ps(rsq13);
687 rinv21 = gmx_mm_invsqrt_ps(rsq21);
688 rinv22 = gmx_mm_invsqrt_ps(rsq22);
689 rinv23 = gmx_mm_invsqrt_ps(rsq23);
690 rinv31 = gmx_mm_invsqrt_ps(rsq31);
691 rinv32 = gmx_mm_invsqrt_ps(rsq32);
692 rinv33 = gmx_mm_invsqrt_ps(rsq33);
694 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
695 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
696 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
697 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
698 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
699 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
700 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
701 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
702 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
/* Start this quad with empty j-site force accumulators */
704 fjx1 = _mm_setzero_ps();
705 fjy1 = _mm_setzero_ps();
706 fjz1 = _mm_setzero_ps();
707 fjx2 = _mm_setzero_ps();
708 fjy2 = _mm_setzero_ps();
709 fjz2 = _mm_setzero_ps();
710 fjx3 = _mm_setzero_ps();
711 fjy3 = _mm_setzero_ps();
712 fjz3 = _mm_setzero_ps();
/* Masked quad: interactions of i-site 1 with j-sites 1, 2 and 3.
 * Same arithmetic as the unmasked sections:
 *   velec = qq*(rinv + krf*rsq - crf)
 *   felec = qq*(rinv*rinvsq - krf2)
 * After the cutoff_mask AND, velec and fscal are additionally passed through
 * _mm_andnot_ps(dummy_mask, ...), which zeroes the lanes where dummy_mask is
 * set, so those lanes add nothing to velecsum or to the force accumulators.
 * NOTE(review): no assignment to fscal and no braces for the if statements
 * are visible in this listing - presumably on skipped lines; confirm. */
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 if (gmx_mm_any_lt(rsq11,rcutoff2))
721 /* REACTION-FIELD ELECTROSTATICS */
722 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
723 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
725 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
727 /* Update potential sum for this i atom from the interaction with this j atom. */
728 velec = _mm_and_ps(velec,cutoff_mask);
729 velec = _mm_andnot_ps(dummy_mask,velec);
730 velecsum = _mm_add_ps(velecsum,velec);
734 fscal = _mm_and_ps(fscal,cutoff_mask);
736 fscal = _mm_andnot_ps(dummy_mask,fscal);
738 /* Calculate temporary vectorial force */
739 tx = _mm_mul_ps(fscal,dx11);
740 ty = _mm_mul_ps(fscal,dy11);
741 tz = _mm_mul_ps(fscal,dz11);
743 /* Update vectorial force */
744 fix1 = _mm_add_ps(fix1,tx);
745 fiy1 = _mm_add_ps(fiy1,ty);
746 fiz1 = _mm_add_ps(fiz1,tz);
748 fjx1 = _mm_add_ps(fjx1,tx);
749 fjy1 = _mm_add_ps(fjy1,ty);
750 fjz1 = _mm_add_ps(fjz1,tz);
754 /**************************
755 * CALCULATE INTERACTIONS *
756 **************************/
758 if (gmx_mm_any_lt(rsq12,rcutoff2))
761 /* REACTION-FIELD ELECTROSTATICS */
762 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
763 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
765 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
767 /* Update potential sum for this i atom from the interaction with this j atom. */
768 velec = _mm_and_ps(velec,cutoff_mask);
769 velec = _mm_andnot_ps(dummy_mask,velec);
770 velecsum = _mm_add_ps(velecsum,velec);
774 fscal = _mm_and_ps(fscal,cutoff_mask);
776 fscal = _mm_andnot_ps(dummy_mask,fscal);
778 /* Calculate temporary vectorial force */
779 tx = _mm_mul_ps(fscal,dx12);
780 ty = _mm_mul_ps(fscal,dy12);
781 tz = _mm_mul_ps(fscal,dz12);
783 /* Update vectorial force */
784 fix1 = _mm_add_ps(fix1,tx);
785 fiy1 = _mm_add_ps(fiy1,ty);
786 fiz1 = _mm_add_ps(fiz1,tz);
788 fjx2 = _mm_add_ps(fjx2,tx);
789 fjy2 = _mm_add_ps(fjy2,ty);
790 fjz2 = _mm_add_ps(fjz2,tz);
794 /**************************
795 * CALCULATE INTERACTIONS *
796 **************************/
798 if (gmx_mm_any_lt(rsq13,rcutoff2))
801 /* REACTION-FIELD ELECTROSTATICS */
802 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
803 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
805 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm_and_ps(velec,cutoff_mask);
809 velec = _mm_andnot_ps(dummy_mask,velec);
810 velecsum = _mm_add_ps(velecsum,velec);
814 fscal = _mm_and_ps(fscal,cutoff_mask);
816 fscal = _mm_andnot_ps(dummy_mask,fscal);
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_ps(fscal,dx13);
820 ty = _mm_mul_ps(fscal,dy13);
821 tz = _mm_mul_ps(fscal,dz13);
823 /* Update vectorial force */
824 fix1 = _mm_add_ps(fix1,tx);
825 fiy1 = _mm_add_ps(fiy1,ty);
826 fiz1 = _mm_add_ps(fiz1,tz);
828 fjx3 = _mm_add_ps(fjx3,tx);
829 fjy3 = _mm_add_ps(fjy3,ty);
830 fjz3 = _mm_add_ps(fjz3,tz);
/* Masked quad: interactions of i-site 2 with j-sites 1, 2 and 3.
 * Per section: velec = qq*(rinv + krf*rsq - crf) and
 * felec = qq*(rinv*rinvsq - krf2). velec and fscal are ANDed with
 * cutoff_mask (lanes where rsq < rcutoff2) and then cleared in the lanes
 * where dummy_mask is set, before being accumulated into velecsum,
 * fix2/fiy2/fiz2 and the matching j-site accumulator.
 * NOTE(review): no assignment to fscal and no braces for the if statements
 * are visible in this listing - presumably on skipped lines; confirm. */
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 if (gmx_mm_any_lt(rsq21,rcutoff2))
841 /* REACTION-FIELD ELECTROSTATICS */
842 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
843 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
845 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
847 /* Update potential sum for this i atom from the interaction with this j atom. */
848 velec = _mm_and_ps(velec,cutoff_mask);
849 velec = _mm_andnot_ps(dummy_mask,velec);
850 velecsum = _mm_add_ps(velecsum,velec);
854 fscal = _mm_and_ps(fscal,cutoff_mask);
856 fscal = _mm_andnot_ps(dummy_mask,fscal);
858 /* Calculate temporary vectorial force */
859 tx = _mm_mul_ps(fscal,dx21);
860 ty = _mm_mul_ps(fscal,dy21);
861 tz = _mm_mul_ps(fscal,dz21);
863 /* Update vectorial force */
864 fix2 = _mm_add_ps(fix2,tx);
865 fiy2 = _mm_add_ps(fiy2,ty);
866 fiz2 = _mm_add_ps(fiz2,tz);
868 fjx1 = _mm_add_ps(fjx1,tx);
869 fjy1 = _mm_add_ps(fjy1,ty);
870 fjz1 = _mm_add_ps(fjz1,tz);
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 if (gmx_mm_any_lt(rsq22,rcutoff2))
881 /* REACTION-FIELD ELECTROSTATICS */
882 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
883 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
885 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm_and_ps(velec,cutoff_mask);
889 velec = _mm_andnot_ps(dummy_mask,velec);
890 velecsum = _mm_add_ps(velecsum,velec);
894 fscal = _mm_and_ps(fscal,cutoff_mask);
896 fscal = _mm_andnot_ps(dummy_mask,fscal);
898 /* Calculate temporary vectorial force */
899 tx = _mm_mul_ps(fscal,dx22);
900 ty = _mm_mul_ps(fscal,dy22);
901 tz = _mm_mul_ps(fscal,dz22);
903 /* Update vectorial force */
904 fix2 = _mm_add_ps(fix2,tx);
905 fiy2 = _mm_add_ps(fiy2,ty);
906 fiz2 = _mm_add_ps(fiz2,tz);
908 fjx2 = _mm_add_ps(fjx2,tx);
909 fjy2 = _mm_add_ps(fjy2,ty);
910 fjz2 = _mm_add_ps(fjz2,tz);
914 /**************************
915 * CALCULATE INTERACTIONS *
916 **************************/
918 if (gmx_mm_any_lt(rsq23,rcutoff2))
921 /* REACTION-FIELD ELECTROSTATICS */
922 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
923 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
925 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
927 /* Update potential sum for this i atom from the interaction with this j atom. */
928 velec = _mm_and_ps(velec,cutoff_mask);
929 velec = _mm_andnot_ps(dummy_mask,velec);
930 velecsum = _mm_add_ps(velecsum,velec);
934 fscal = _mm_and_ps(fscal,cutoff_mask);
936 fscal = _mm_andnot_ps(dummy_mask,fscal);
938 /* Calculate temporary vectorial force */
939 tx = _mm_mul_ps(fscal,dx23);
940 ty = _mm_mul_ps(fscal,dy23);
941 tz = _mm_mul_ps(fscal,dz23);
943 /* Update vectorial force */
944 fix2 = _mm_add_ps(fix2,tx);
945 fiy2 = _mm_add_ps(fiy2,ty);
946 fiz2 = _mm_add_ps(fiz2,tz);
948 fjx3 = _mm_add_ps(fjx3,tx);
949 fjy3 = _mm_add_ps(fjy3,ty);
950 fjz3 = _mm_add_ps(fjz3,tz);
/* Masked quad: interactions of i-site 3 with j-sites 1, 2 and 3, followed
 * by the write-back of the j-site forces.
 * Per section: velec = qq*(rinv + krf*rsq - crf) and
 * felec = qq*(rinv*rinvsq - krf2). velec and fscal are ANDed with
 * cutoff_mask (lanes where rsq < rcutoff2) and then cleared in the lanes
 * where dummy_mask is set, before being accumulated into velecsum,
 * fix3/fiy3/fiz3 and the matching j-site accumulator.
 * NOTE(review): no assignment to fscal and no braces for the if statements
 * are visible in this listing - presumably on skipped lines; confirm. */
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 if (gmx_mm_any_lt(rsq31,rcutoff2))
961 /* REACTION-FIELD ELECTROSTATICS */
962 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
963 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
965 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
967 /* Update potential sum for this i atom from the interaction with this j atom. */
968 velec = _mm_and_ps(velec,cutoff_mask);
969 velec = _mm_andnot_ps(dummy_mask,velec);
970 velecsum = _mm_add_ps(velecsum,velec);
974 fscal = _mm_and_ps(fscal,cutoff_mask);
976 fscal = _mm_andnot_ps(dummy_mask,fscal);
978 /* Calculate temporary vectorial force */
979 tx = _mm_mul_ps(fscal,dx31);
980 ty = _mm_mul_ps(fscal,dy31);
981 tz = _mm_mul_ps(fscal,dz31);
983 /* Update vectorial force */
984 fix3 = _mm_add_ps(fix3,tx);
985 fiy3 = _mm_add_ps(fiy3,ty);
986 fiz3 = _mm_add_ps(fiz3,tz);
988 fjx1 = _mm_add_ps(fjx1,tx);
989 fjy1 = _mm_add_ps(fjy1,ty);
990 fjz1 = _mm_add_ps(fjz1,tz);
994 /**************************
995 * CALCULATE INTERACTIONS *
996 **************************/
998 if (gmx_mm_any_lt(rsq32,rcutoff2))
1001 /* REACTION-FIELD ELECTROSTATICS */
1002 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1003 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1005 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1007 /* Update potential sum for this i atom from the interaction with this j atom. */
1008 velec = _mm_and_ps(velec,cutoff_mask);
1009 velec = _mm_andnot_ps(dummy_mask,velec);
1010 velecsum = _mm_add_ps(velecsum,velec);
1014 fscal = _mm_and_ps(fscal,cutoff_mask);
1016 fscal = _mm_andnot_ps(dummy_mask,fscal);
1018 /* Calculate temporary vectorial force */
1019 tx = _mm_mul_ps(fscal,dx32);
1020 ty = _mm_mul_ps(fscal,dy32);
1021 tz = _mm_mul_ps(fscal,dz32);
1023 /* Update vectorial force */
1024 fix3 = _mm_add_ps(fix3,tx);
1025 fiy3 = _mm_add_ps(fiy3,ty);
1026 fiz3 = _mm_add_ps(fiz3,tz);
1028 fjx2 = _mm_add_ps(fjx2,tx);
1029 fjy2 = _mm_add_ps(fjy2,ty);
1030 fjz2 = _mm_add_ps(fjz2,tz);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 if (gmx_mm_any_lt(rsq33,rcutoff2))
1041 /* REACTION-FIELD ELECTROSTATICS */
1042 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1043 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1045 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1047 /* Update potential sum for this i atom from the interaction with this j atom. */
1048 velec = _mm_and_ps(velec,cutoff_mask);
1049 velec = _mm_andnot_ps(dummy_mask,velec);
1050 velecsum = _mm_add_ps(velecsum,velec);
1054 fscal = _mm_and_ps(fscal,cutoff_mask);
1056 fscal = _mm_andnot_ps(dummy_mask,fscal);
1058 /* Calculate temporary vectorial force */
1059 tx = _mm_mul_ps(fscal,dx33);
1060 ty = _mm_mul_ps(fscal,dy33);
1061 tz = _mm_mul_ps(fscal,dz33);
1063 /* Update vectorial force */
1064 fix3 = _mm_add_ps(fix3,tx);
1065 fiy3 = _mm_add_ps(fiy3,ty);
1066 fiz3 = _mm_add_ps(fiz3,tz);
1068 fjx3 = _mm_add_ps(fjx3,tx);
1069 fjy3 = _mm_add_ps(fjy3,ty);
1070 fjz3 = _mm_add_ps(fjz3,tz);
/* Hand the accumulated j-site forces to the write-back helper. Lanes that
 * were flagged by dummy_mask carry zero force here because fscal was
 * cleared for them above. The helper is defined elsewhere; going by its
 * name it presumably subtracts the values from f - confirm. */
1074 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
1075 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
1076 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1078 /* Inner loop uses 324 flops */
/* Wrap-up for one neighbour-list entry and for the kernel as a whole.
 * The nine i-site force accumulators are passed to
 * gmx_mm_update_iforce_3atom_swizzle_ps together with f+i_coord_offset+DIM
 * and fshift+i_shift_offset, and velecsum is passed to gmx_mm_update_1pot_ps
 * with the slot kernel_data->energygrp_elec+ggid. Both helpers are defined
 * elsewhere; presumably they reduce the four SIMD lanes and add the result
 * into the given locations - confirm.
 * inneriter grows by the length of the j range, and inc_nrnb receives
 * outeriter*28 + inneriter*324, matching the flop figures quoted in the
 * comments.
 * NOTE(review): no assignment to ggid, no initialisation of outeriter or
 * inneriter, and no statement after the "Increment number of outer
 * iterations" comment are visible in this listing - presumably on skipped
 * lines; confirm against a pristine copy. */
1081 /* End of innermost loop */
1083 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1084 f+i_coord_offset+DIM,fshift+i_shift_offset);
1087 /* Update potential energies */
1088 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1090 /* Increment number of inner iterations */
1091 inneriter += j_index_end - j_index_start;
1093 /* Outer loop uses 28 flops */
1096 /* Increment number of outer iterations */
1099 /* Update outer/inner flops */
1101 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*324);
1104 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single
1105 * Electrostatics interaction: ReactionField
1106 * VdW interaction: None
1107 * Geometry: Water4-Water4
1108 * Calculate force/pot: Force
/*
 * Force-only kernel: per the header text directly above, this is the ReactionField
 * electrostatics / no VdW / Water4-Water4 variant, written with SSE single-precision
 * intrinsics.
 *
 * Structure of the visible code:
 *   - Nine charge products qqXY (X,Y in 1..3) are computed once, before any loop.
 *   - The outer loop runs over iidx < nri. For each entry the shift vector is added
 *     to the three i sites located DIM*1..DIM*3 past i_coord_offset.
 *   - The inner loop takes four j entries per iteration (lanes A..D) and evaluates
 *     the nine site pairs 11..33, each masked by the squared cutoff rcutoff2.
 *   - A final block handles a trailing group of j entries that contains negative
 *     (dummy) indices, additionally masking with dummy_mask.
 *   - Flops are reported through inc_nrnb() as outeriter*27 + inneriter*270.
 *
 * Arguments, as used in the visible lines:
 *   nlist       - jindex, shift and iinr[0] are read.
 *   xx, ff      - not referenced directly in the visible lines; presumably the
 *                 sources of the x and f pointers used below (TODO confirm).
 *   fr          - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf and rcoulomb are read.
 *   mdatoms     - chargeA supplies the charges.
 *   kernel_data - not referenced in the visible lines of this kernel.
 *   nrnb        - receives the flop count.
 *
 * krf and crf are assigned but only krf2 appears in the visible force expressions.
 *
 * NOTE(review): every line of this listing starts with a numeric prefix that is not
 * part of the C statement, and brace-only lines are not present. Several names are
 * used below without a visible assignment or declaration: x, f, nri, jjnr, charge,
 * fscal, the initial values of outeriter/inneriter, and the per-iteration values of
 * inr and jnrA. Confirm against the generator output before relying on this text.
 */
1111 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single
1112 (t_nblist * gmx_restrict nlist,
1113 rvec * gmx_restrict xx,
1114 rvec * gmx_restrict ff,
1115 t_forcerec * gmx_restrict fr,
1116 t_mdatoms * gmx_restrict mdatoms,
1117 nb_kernel_data_t * gmx_restrict kernel_data,
1118 t_nrnb * gmx_restrict nrnb)
1120 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1121 * just 0 for non-waters.
1122 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1123 * jnr indices corresponding to data put in the four positions in the SIMD register.
1125 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1126 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1127 int jnrA,jnrB,jnrC,jnrD;
1128 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1129 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1130 real shX,shY,shZ,rcutoff_scalar;
1131 real *shiftvec,*fshift,*x,*f;
1132 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1134 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1136 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1138 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1139 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1140 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1141 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1142 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1143 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1144 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1145 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1146 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1147 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1148 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1149 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1150 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1151 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1152 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1153 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1154 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1156 __m128 dummy_mask,cutoff_mask;
1157 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1158 __m128 one = _mm_set1_ps(1.0);
1159 __m128 two = _mm_set1_ps(2.0);
1165 jindex = nlist->jindex;
1167 shiftidx = nlist->shift;
1169 shiftvec = fr->shift_vec[0];
1170 fshift = fr->fshift[0];
1171 facel = _mm_set1_ps(fr->epsfac);
1172 charge = mdatoms->chargeA;
1173 krf = _mm_set1_ps(fr->ic->k_rf);
1174 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1175 crf = _mm_set1_ps(fr->ic->c_rf);
1177 /* Setup water-specific parameters */
1178 inr = nlist->iinr[0];
/* The i charges of sites 1..3 are pre-multiplied by facel (fr->epsfac). The j charges
 * are read from the same charge[inr+1..inr+3] entries as the i charges, so the nine
 * products below are fixed for the whole call.
 */
1179 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1180 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1181 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1183 jq1 = _mm_set1_ps(charge[inr+1]);
1184 jq2 = _mm_set1_ps(charge[inr+2]);
1185 jq3 = _mm_set1_ps(charge[inr+3]);
1186 qq11 = _mm_mul_ps(iq1,jq1);
1187 qq12 = _mm_mul_ps(iq1,jq2);
1188 qq13 = _mm_mul_ps(iq1,jq3);
1189 qq21 = _mm_mul_ps(iq2,jq1);
1190 qq22 = _mm_mul_ps(iq2,jq2);
1191 qq23 = _mm_mul_ps(iq2,jq3);
1192 qq31 = _mm_mul_ps(iq3,jq1);
1193 qq32 = _mm_mul_ps(iq3,jq2);
1194 qq33 = _mm_mul_ps(iq3,jq3);
1196 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1197 rcutoff_scalar = fr->rcoulomb;
1198 rcutoff = _mm_set1_ps(rcutoff_scalar);
/* Squared cutoff; every rsqXY below is compared against this value. */
1199 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1201 /* Avoid stupid compiler warnings */
1202 jnrA = jnrB = jnrC = jnrD = 0;
1203 j_coord_offsetA = 0;
1204 j_coord_offsetB = 0;
1205 j_coord_offsetC = 0;
1206 j_coord_offsetD = 0;
1211 /* Start outer loop over neighborlists */
1212 for(iidx=0; iidx<nri; iidx++)
1214 /* Load shift vector for this list */
1215 i_shift_offset = DIM*shiftidx[iidx];
1216 shX = shiftvec[i_shift_offset+XX];
1217 shY = shiftvec[i_shift_offset+YY];
1218 shZ = shiftvec[i_shift_offset+ZZ];
1220 /* Load limits for loop over neighbors */
1221 j_index_start = jindex[iidx];
1222 j_index_end = jindex[iidx+1];
1224 /* Get outer coordinate index */
1226 i_coord_offset = DIM*inr;
1228 /* Load i particle coords and add shift vector */
1229 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1230 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1231 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1232 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1233 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1234 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1235 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1236 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1237 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1239 fix1 = _mm_setzero_ps();
1240 fiy1 = _mm_setzero_ps();
1241 fiz1 = _mm_setzero_ps();
1242 fix2 = _mm_setzero_ps();
1243 fiy2 = _mm_setzero_ps();
1244 fiz2 = _mm_setzero_ps();
1245 fix3 = _mm_setzero_ps();
1246 fiy3 = _mm_setzero_ps();
1247 fiz3 = _mm_setzero_ps();
1249 /* Start inner kernel loop */
/* Four j entries are consumed per iteration. The loop continues only while the
 * fourth entry of the group, jjnr[jidx+3], is non-negative; a group that fails this
 * test is left for the masked block that follows the loop.
 */
1250 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1253 /* Get j neighbor index, and coordinate index */
1255 jnrB = jjnr[jidx+1];
1256 jnrC = jjnr[jidx+2];
1257 jnrD = jjnr[jidx+3];
1259 j_coord_offsetA = DIM*jnrA;
1260 j_coord_offsetB = DIM*jnrB;
1261 j_coord_offsetC = DIM*jnrC;
1262 j_coord_offsetD = DIM*jnrD;
1264 /* load j atom coordinates */
/* Each pointer is advanced by DIM past the j entry's base offset, matching the
 * DIM*1..DIM*3 site offsets used for the i coordinates above.
 */
1265 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1266 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1267 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1269 /* Calculate displacement vector */
1270 dx11 = _mm_sub_ps(ix1,jx1);
1271 dy11 = _mm_sub_ps(iy1,jy1);
1272 dz11 = _mm_sub_ps(iz1,jz1);
1273 dx12 = _mm_sub_ps(ix1,jx2);
1274 dy12 = _mm_sub_ps(iy1,jy2);
1275 dz12 = _mm_sub_ps(iz1,jz2);
1276 dx13 = _mm_sub_ps(ix1,jx3);
1277 dy13 = _mm_sub_ps(iy1,jy3);
1278 dz13 = _mm_sub_ps(iz1,jz3);
1279 dx21 = _mm_sub_ps(ix2,jx1);
1280 dy21 = _mm_sub_ps(iy2,jy1);
1281 dz21 = _mm_sub_ps(iz2,jz1);
1282 dx22 = _mm_sub_ps(ix2,jx2);
1283 dy22 = _mm_sub_ps(iy2,jy2);
1284 dz22 = _mm_sub_ps(iz2,jz2);
1285 dx23 = _mm_sub_ps(ix2,jx3);
1286 dy23 = _mm_sub_ps(iy2,jy3);
1287 dz23 = _mm_sub_ps(iz2,jz3);
1288 dx31 = _mm_sub_ps(ix3,jx1);
1289 dy31 = _mm_sub_ps(iy3,jy1);
1290 dz31 = _mm_sub_ps(iz3,jz1);
1291 dx32 = _mm_sub_ps(ix3,jx2);
1292 dy32 = _mm_sub_ps(iy3,jy2);
1293 dz32 = _mm_sub_ps(iz3,jz2);
1294 dx33 = _mm_sub_ps(ix3,jx3);
1295 dy33 = _mm_sub_ps(iy3,jy3);
1296 dz33 = _mm_sub_ps(iz3,jz3);
1298 /* Calculate squared distance and things based on it */
1299 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1300 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1301 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1302 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1303 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1304 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1305 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1306 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1307 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1309 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1310 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1311 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1312 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1313 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1314 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1315 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1316 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1317 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1319 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1320 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1321 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1322 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1323 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1324 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1325 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1326 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1327 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
/* The j force accumulators restart from zero on every iteration and are handed to
 * the j-force helper at the end of the iteration.
 */
1329 fjx1 = _mm_setzero_ps();
1330 fjy1 = _mm_setzero_ps();
1331 fjz1 = _mm_setzero_ps();
1332 fjx2 = _mm_setzero_ps();
1333 fjy2 = _mm_setzero_ps();
1334 fjz2 = _mm_setzero_ps();
1335 fjx3 = _mm_setzero_ps();
1336 fjy3 = _mm_setzero_ps();
1337 fjz3 = _mm_setzero_ps();
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
/* Pair 11. The same pattern repeats for pairs 12..33 with the matching i/j
 * accumulators. gmx_mm_any_lt presumably tests whether any lane is inside the
 * cutoff (TODO confirm against its definition).
 */
1343 if (gmx_mm_any_lt(rsq11,rcutoff2))
1346 /* REACTION-FIELD ELECTROSTATICS */
/* felec = qq11 * (rinv11*rinvsq11 - krf2), where krf2 holds 2*fr->ic->k_rf. */
1347 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1349 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
/* Clear the lanes for which rsq11 < rcutoff2 does not hold.
 * NOTE(review): no statement assigning felec to fscal is visible in this listing.
 */
1353 fscal = _mm_and_ps(fscal,cutoff_mask);
1355 /* Calculate temporary vectorial force */
1356 tx = _mm_mul_ps(fscal,dx11);
1357 ty = _mm_mul_ps(fscal,dy11);
1358 tz = _mm_mul_ps(fscal,dz11);
1360 /* Update vectorial force */
1361 fix1 = _mm_add_ps(fix1,tx);
1362 fiy1 = _mm_add_ps(fiy1,ty);
1363 fiz1 = _mm_add_ps(fiz1,tz);
/* The same tx/ty/tz are also summed for the j site. */
1365 fjx1 = _mm_add_ps(fjx1,tx);
1366 fjy1 = _mm_add_ps(fjy1,ty);
1367 fjz1 = _mm_add_ps(fjz1,tz);
1371 /**************************
1372 * CALCULATE INTERACTIONS *
1373 **************************/
1375 if (gmx_mm_any_lt(rsq12,rcutoff2))
1378 /* REACTION-FIELD ELECTROSTATICS */
1379 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1381 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1385 fscal = _mm_and_ps(fscal,cutoff_mask);
1387 /* Calculate temporary vectorial force */
1388 tx = _mm_mul_ps(fscal,dx12);
1389 ty = _mm_mul_ps(fscal,dy12);
1390 tz = _mm_mul_ps(fscal,dz12);
1392 /* Update vectorial force */
1393 fix1 = _mm_add_ps(fix1,tx);
1394 fiy1 = _mm_add_ps(fiy1,ty);
1395 fiz1 = _mm_add_ps(fiz1,tz);
1397 fjx2 = _mm_add_ps(fjx2,tx);
1398 fjy2 = _mm_add_ps(fjy2,ty);
1399 fjz2 = _mm_add_ps(fjz2,tz);
1403 /**************************
1404 * CALCULATE INTERACTIONS *
1405 **************************/
1407 if (gmx_mm_any_lt(rsq13,rcutoff2))
1410 /* REACTION-FIELD ELECTROSTATICS */
1411 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1413 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1417 fscal = _mm_and_ps(fscal,cutoff_mask);
1419 /* Calculate temporary vectorial force */
1420 tx = _mm_mul_ps(fscal,dx13);
1421 ty = _mm_mul_ps(fscal,dy13);
1422 tz = _mm_mul_ps(fscal,dz13);
1424 /* Update vectorial force */
1425 fix1 = _mm_add_ps(fix1,tx);
1426 fiy1 = _mm_add_ps(fiy1,ty);
1427 fiz1 = _mm_add_ps(fiz1,tz);
1429 fjx3 = _mm_add_ps(fjx3,tx);
1430 fjy3 = _mm_add_ps(fjy3,ty);
1431 fjz3 = _mm_add_ps(fjz3,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 if (gmx_mm_any_lt(rsq21,rcutoff2))
1442 /* REACTION-FIELD ELECTROSTATICS */
1443 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1445 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1449 fscal = _mm_and_ps(fscal,cutoff_mask);
1451 /* Calculate temporary vectorial force */
1452 tx = _mm_mul_ps(fscal,dx21);
1453 ty = _mm_mul_ps(fscal,dy21);
1454 tz = _mm_mul_ps(fscal,dz21);
1456 /* Update vectorial force */
1457 fix2 = _mm_add_ps(fix2,tx);
1458 fiy2 = _mm_add_ps(fiy2,ty);
1459 fiz2 = _mm_add_ps(fiz2,tz);
1461 fjx1 = _mm_add_ps(fjx1,tx);
1462 fjy1 = _mm_add_ps(fjy1,ty);
1463 fjz1 = _mm_add_ps(fjz1,tz);
1467 /**************************
1468 * CALCULATE INTERACTIONS *
1469 **************************/
1471 if (gmx_mm_any_lt(rsq22,rcutoff2))
1474 /* REACTION-FIELD ELECTROSTATICS */
1475 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1477 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1481 fscal = _mm_and_ps(fscal,cutoff_mask);
1483 /* Calculate temporary vectorial force */
1484 tx = _mm_mul_ps(fscal,dx22);
1485 ty = _mm_mul_ps(fscal,dy22);
1486 tz = _mm_mul_ps(fscal,dz22);
1488 /* Update vectorial force */
1489 fix2 = _mm_add_ps(fix2,tx);
1490 fiy2 = _mm_add_ps(fiy2,ty);
1491 fiz2 = _mm_add_ps(fiz2,tz);
1493 fjx2 = _mm_add_ps(fjx2,tx);
1494 fjy2 = _mm_add_ps(fjy2,ty);
1495 fjz2 = _mm_add_ps(fjz2,tz);
1499 /**************************
1500 * CALCULATE INTERACTIONS *
1501 **************************/
1503 if (gmx_mm_any_lt(rsq23,rcutoff2))
1506 /* REACTION-FIELD ELECTROSTATICS */
1507 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1509 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1513 fscal = _mm_and_ps(fscal,cutoff_mask);
1515 /* Calculate temporary vectorial force */
1516 tx = _mm_mul_ps(fscal,dx23);
1517 ty = _mm_mul_ps(fscal,dy23);
1518 tz = _mm_mul_ps(fscal,dz23);
1520 /* Update vectorial force */
1521 fix2 = _mm_add_ps(fix2,tx);
1522 fiy2 = _mm_add_ps(fiy2,ty);
1523 fiz2 = _mm_add_ps(fiz2,tz);
1525 fjx3 = _mm_add_ps(fjx3,tx);
1526 fjy3 = _mm_add_ps(fjy3,ty);
1527 fjz3 = _mm_add_ps(fjz3,tz);
1531 /**************************
1532 * CALCULATE INTERACTIONS *
1533 **************************/
1535 if (gmx_mm_any_lt(rsq31,rcutoff2))
1538 /* REACTION-FIELD ELECTROSTATICS */
1539 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1541 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1545 fscal = _mm_and_ps(fscal,cutoff_mask);
1547 /* Calculate temporary vectorial force */
1548 tx = _mm_mul_ps(fscal,dx31);
1549 ty = _mm_mul_ps(fscal,dy31);
1550 tz = _mm_mul_ps(fscal,dz31);
1552 /* Update vectorial force */
1553 fix3 = _mm_add_ps(fix3,tx);
1554 fiy3 = _mm_add_ps(fiy3,ty);
1555 fiz3 = _mm_add_ps(fiz3,tz);
1557 fjx1 = _mm_add_ps(fjx1,tx);
1558 fjy1 = _mm_add_ps(fjy1,ty);
1559 fjz1 = _mm_add_ps(fjz1,tz);
1563 /**************************
1564 * CALCULATE INTERACTIONS *
1565 **************************/
1567 if (gmx_mm_any_lt(rsq32,rcutoff2))
1570 /* REACTION-FIELD ELECTROSTATICS */
1571 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1573 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1577 fscal = _mm_and_ps(fscal,cutoff_mask);
1579 /* Calculate temporary vectorial force */
1580 tx = _mm_mul_ps(fscal,dx32);
1581 ty = _mm_mul_ps(fscal,dy32);
1582 tz = _mm_mul_ps(fscal,dz32);
1584 /* Update vectorial force */
1585 fix3 = _mm_add_ps(fix3,tx);
1586 fiy3 = _mm_add_ps(fiy3,ty);
1587 fiz3 = _mm_add_ps(fiz3,tz);
1589 fjx2 = _mm_add_ps(fjx2,tx);
1590 fjy2 = _mm_add_ps(fjy2,ty);
1591 fjz2 = _mm_add_ps(fjz2,tz);
1595 /**************************
1596 * CALCULATE INTERACTIONS *
1597 **************************/
1599 if (gmx_mm_any_lt(rsq33,rcutoff2))
1602 /* REACTION-FIELD ELECTROSTATICS */
1603 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1605 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1609 fscal = _mm_and_ps(fscal,cutoff_mask);
1611 /* Calculate temporary vectorial force */
1612 tx = _mm_mul_ps(fscal,dx33);
1613 ty = _mm_mul_ps(fscal,dy33);
1614 tz = _mm_mul_ps(fscal,dz33);
1616 /* Update vectorial force */
1617 fix3 = _mm_add_ps(fix3,tx);
1618 fiy3 = _mm_add_ps(fiy3,ty);
1619 fiz3 = _mm_add_ps(fiz3,tz);
1621 fjx3 = _mm_add_ps(fjx3,tx);
1622 fjy3 = _mm_add_ps(fjy3,ty);
1623 fjz3 = _mm_add_ps(fjz3,tz);
/* Hand the accumulated j contributions for the three sites of all four j entries
 * to the helper; going by its name it subtracts them from f (TODO confirm).
 */
1627 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
1628 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
1629 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1631 /* Inner loop uses 270 flops */
/* Remainder: reached with jidx < j_index_end only when the loop above stopped on a
 * group whose fourth entry is negative. The arithmetic is the same as in the loop,
 * plus a dummy_mask step on fscal for every pair.
 */
1634 if(jidx<j_index_end)
1637 /* Get j neighbor index, and coordinate index */
1639 jnrB = jjnr[jidx+1];
1640 jnrC = jjnr[jidx+2];
1641 jnrD = jjnr[jidx+3];
1643 /* Sign of each element will be negative for non-real atoms.
1644 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1645 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1647 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1648 jnrA = (jnrA>=0) ? jnrA : 0;
1649 jnrB = (jnrB>=0) ? jnrB : 0;
1650 jnrC = (jnrC>=0) ? jnrC : 0;
1651 jnrD = (jnrD>=0) ? jnrD : 0;
1653 j_coord_offsetA = DIM*jnrA;
1654 j_coord_offsetB = DIM*jnrB;
1655 j_coord_offsetC = DIM*jnrC;
1656 j_coord_offsetD = DIM*jnrD;
1658 /* load j atom coordinates */
1659 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1660 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1661 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1663 /* Calculate displacement vector */
1664 dx11 = _mm_sub_ps(ix1,jx1);
1665 dy11 = _mm_sub_ps(iy1,jy1);
1666 dz11 = _mm_sub_ps(iz1,jz1);
1667 dx12 = _mm_sub_ps(ix1,jx2);
1668 dy12 = _mm_sub_ps(iy1,jy2);
1669 dz12 = _mm_sub_ps(iz1,jz2);
1670 dx13 = _mm_sub_ps(ix1,jx3);
1671 dy13 = _mm_sub_ps(iy1,jy3);
1672 dz13 = _mm_sub_ps(iz1,jz3);
1673 dx21 = _mm_sub_ps(ix2,jx1);
1674 dy21 = _mm_sub_ps(iy2,jy1);
1675 dz21 = _mm_sub_ps(iz2,jz1);
1676 dx22 = _mm_sub_ps(ix2,jx2);
1677 dy22 = _mm_sub_ps(iy2,jy2);
1678 dz22 = _mm_sub_ps(iz2,jz2);
1679 dx23 = _mm_sub_ps(ix2,jx3);
1680 dy23 = _mm_sub_ps(iy2,jy3);
1681 dz23 = _mm_sub_ps(iz2,jz3);
1682 dx31 = _mm_sub_ps(ix3,jx1);
1683 dy31 = _mm_sub_ps(iy3,jy1);
1684 dz31 = _mm_sub_ps(iz3,jz1);
1685 dx32 = _mm_sub_ps(ix3,jx2);
1686 dy32 = _mm_sub_ps(iy3,jy2);
1687 dz32 = _mm_sub_ps(iz3,jz2);
1688 dx33 = _mm_sub_ps(ix3,jx3);
1689 dy33 = _mm_sub_ps(iy3,jy3);
1690 dz33 = _mm_sub_ps(iz3,jz3);
1692 /* Calculate squared distance and things based on it */
1693 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1694 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1695 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1696 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1697 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1698 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1699 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1700 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1701 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1703 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1704 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1705 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1706 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1707 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1708 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1709 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1710 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1711 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1713 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1714 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1715 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1716 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1717 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1718 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1719 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1720 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1721 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1723 fjx1 = _mm_setzero_ps();
1724 fjy1 = _mm_setzero_ps();
1725 fjz1 = _mm_setzero_ps();
1726 fjx2 = _mm_setzero_ps();
1727 fjy2 = _mm_setzero_ps();
1728 fjz2 = _mm_setzero_ps();
1729 fjx3 = _mm_setzero_ps();
1730 fjy3 = _mm_setzero_ps();
1731 fjz3 = _mm_setzero_ps();
1733 /**************************
1734 * CALCULATE INTERACTIONS *
1735 **************************/
1737 if (gmx_mm_any_lt(rsq11,rcutoff2))
1740 /* REACTION-FIELD ELECTROSTATICS */
1741 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1743 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1747 fscal = _mm_and_ps(fscal,cutoff_mask);
/* Dummy lanes were redirected to j index 0 above, so their contribution is cleared
 * here before it reaches any force accumulator.
 */
1749 fscal = _mm_andnot_ps(dummy_mask,fscal);
1751 /* Calculate temporary vectorial force */
1752 tx = _mm_mul_ps(fscal,dx11);
1753 ty = _mm_mul_ps(fscal,dy11);
1754 tz = _mm_mul_ps(fscal,dz11);
1756 /* Update vectorial force */
1757 fix1 = _mm_add_ps(fix1,tx);
1758 fiy1 = _mm_add_ps(fiy1,ty);
1759 fiz1 = _mm_add_ps(fiz1,tz);
1761 fjx1 = _mm_add_ps(fjx1,tx);
1762 fjy1 = _mm_add_ps(fjy1,ty);
1763 fjz1 = _mm_add_ps(fjz1,tz);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 if (gmx_mm_any_lt(rsq12,rcutoff2))
1774 /* REACTION-FIELD ELECTROSTATICS */
1775 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1777 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1781 fscal = _mm_and_ps(fscal,cutoff_mask);
1783 fscal = _mm_andnot_ps(dummy_mask,fscal);
1785 /* Calculate temporary vectorial force */
1786 tx = _mm_mul_ps(fscal,dx12);
1787 ty = _mm_mul_ps(fscal,dy12);
1788 tz = _mm_mul_ps(fscal,dz12);
1790 /* Update vectorial force */
1791 fix1 = _mm_add_ps(fix1,tx);
1792 fiy1 = _mm_add_ps(fiy1,ty);
1793 fiz1 = _mm_add_ps(fiz1,tz);
1795 fjx2 = _mm_add_ps(fjx2,tx);
1796 fjy2 = _mm_add_ps(fjy2,ty);
1797 fjz2 = _mm_add_ps(fjz2,tz);
1801 /**************************
1802 * CALCULATE INTERACTIONS *
1803 **************************/
1805 if (gmx_mm_any_lt(rsq13,rcutoff2))
1808 /* REACTION-FIELD ELECTROSTATICS */
1809 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1811 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1815 fscal = _mm_and_ps(fscal,cutoff_mask);
1817 fscal = _mm_andnot_ps(dummy_mask,fscal);
1819 /* Calculate temporary vectorial force */
1820 tx = _mm_mul_ps(fscal,dx13);
1821 ty = _mm_mul_ps(fscal,dy13);
1822 tz = _mm_mul_ps(fscal,dz13);
1824 /* Update vectorial force */
1825 fix1 = _mm_add_ps(fix1,tx);
1826 fiy1 = _mm_add_ps(fiy1,ty);
1827 fiz1 = _mm_add_ps(fiz1,tz);
1829 fjx3 = _mm_add_ps(fjx3,tx);
1830 fjy3 = _mm_add_ps(fjy3,ty);
1831 fjz3 = _mm_add_ps(fjz3,tz);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 if (gmx_mm_any_lt(rsq21,rcutoff2))
1842 /* REACTION-FIELD ELECTROSTATICS */
1843 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1845 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1849 fscal = _mm_and_ps(fscal,cutoff_mask);
1851 fscal = _mm_andnot_ps(dummy_mask,fscal);
1853 /* Calculate temporary vectorial force */
1854 tx = _mm_mul_ps(fscal,dx21);
1855 ty = _mm_mul_ps(fscal,dy21);
1856 tz = _mm_mul_ps(fscal,dz21);
1858 /* Update vectorial force */
1859 fix2 = _mm_add_ps(fix2,tx);
1860 fiy2 = _mm_add_ps(fiy2,ty);
1861 fiz2 = _mm_add_ps(fiz2,tz);
1863 fjx1 = _mm_add_ps(fjx1,tx);
1864 fjy1 = _mm_add_ps(fjy1,ty);
1865 fjz1 = _mm_add_ps(fjz1,tz);
1869 /**************************
1870 * CALCULATE INTERACTIONS *
1871 **************************/
1873 if (gmx_mm_any_lt(rsq22,rcutoff2))
1876 /* REACTION-FIELD ELECTROSTATICS */
1877 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1879 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1883 fscal = _mm_and_ps(fscal,cutoff_mask);
1885 fscal = _mm_andnot_ps(dummy_mask,fscal);
1887 /* Calculate temporary vectorial force */
1888 tx = _mm_mul_ps(fscal,dx22);
1889 ty = _mm_mul_ps(fscal,dy22);
1890 tz = _mm_mul_ps(fscal,dz22);
1892 /* Update vectorial force */
1893 fix2 = _mm_add_ps(fix2,tx);
1894 fiy2 = _mm_add_ps(fiy2,ty);
1895 fiz2 = _mm_add_ps(fiz2,tz);
1897 fjx2 = _mm_add_ps(fjx2,tx);
1898 fjy2 = _mm_add_ps(fjy2,ty);
1899 fjz2 = _mm_add_ps(fjz2,tz);
1903 /**************************
1904 * CALCULATE INTERACTIONS *
1905 **************************/
1907 if (gmx_mm_any_lt(rsq23,rcutoff2))
1910 /* REACTION-FIELD ELECTROSTATICS */
1911 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1913 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1917 fscal = _mm_and_ps(fscal,cutoff_mask);
1919 fscal = _mm_andnot_ps(dummy_mask,fscal);
1921 /* Calculate temporary vectorial force */
1922 tx = _mm_mul_ps(fscal,dx23);
1923 ty = _mm_mul_ps(fscal,dy23);
1924 tz = _mm_mul_ps(fscal,dz23);
1926 /* Update vectorial force */
1927 fix2 = _mm_add_ps(fix2,tx);
1928 fiy2 = _mm_add_ps(fiy2,ty);
1929 fiz2 = _mm_add_ps(fiz2,tz);
1931 fjx3 = _mm_add_ps(fjx3,tx);
1932 fjy3 = _mm_add_ps(fjy3,ty);
1933 fjz3 = _mm_add_ps(fjz3,tz);
1937 /**************************
1938 * CALCULATE INTERACTIONS *
1939 **************************/
1941 if (gmx_mm_any_lt(rsq31,rcutoff2))
1944 /* REACTION-FIELD ELECTROSTATICS */
1945 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1947 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1951 fscal = _mm_and_ps(fscal,cutoff_mask);
1953 fscal = _mm_andnot_ps(dummy_mask,fscal);
1955 /* Calculate temporary vectorial force */
1956 tx = _mm_mul_ps(fscal,dx31);
1957 ty = _mm_mul_ps(fscal,dy31);
1958 tz = _mm_mul_ps(fscal,dz31);
1960 /* Update vectorial force */
1961 fix3 = _mm_add_ps(fix3,tx);
1962 fiy3 = _mm_add_ps(fiy3,ty);
1963 fiz3 = _mm_add_ps(fiz3,tz);
1965 fjx1 = _mm_add_ps(fjx1,tx);
1966 fjy1 = _mm_add_ps(fjy1,ty);
1967 fjz1 = _mm_add_ps(fjz1,tz);
1971 /**************************
1972 * CALCULATE INTERACTIONS *
1973 **************************/
1975 if (gmx_mm_any_lt(rsq32,rcutoff2))
1978 /* REACTION-FIELD ELECTROSTATICS */
1979 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1981 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1985 fscal = _mm_and_ps(fscal,cutoff_mask);
1987 fscal = _mm_andnot_ps(dummy_mask,fscal);
1989 /* Calculate temporary vectorial force */
1990 tx = _mm_mul_ps(fscal,dx32);
1991 ty = _mm_mul_ps(fscal,dy32);
1992 tz = _mm_mul_ps(fscal,dz32);
1994 /* Update vectorial force */
1995 fix3 = _mm_add_ps(fix3,tx);
1996 fiy3 = _mm_add_ps(fiy3,ty);
1997 fiz3 = _mm_add_ps(fiz3,tz);
1999 fjx2 = _mm_add_ps(fjx2,tx);
2000 fjy2 = _mm_add_ps(fjy2,ty);
2001 fjz2 = _mm_add_ps(fjz2,tz);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 if (gmx_mm_any_lt(rsq33,rcutoff2))
2012 /* REACTION-FIELD ELECTROSTATICS */
2013 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2015 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2019 fscal = _mm_and_ps(fscal,cutoff_mask);
2021 fscal = _mm_andnot_ps(dummy_mask,fscal);
2023 /* Calculate temporary vectorial force */
2024 tx = _mm_mul_ps(fscal,dx33);
2025 ty = _mm_mul_ps(fscal,dy33);
2026 tz = _mm_mul_ps(fscal,dz33);
2028 /* Update vectorial force */
2029 fix3 = _mm_add_ps(fix3,tx);
2030 fiy3 = _mm_add_ps(fiy3,ty);
2031 fiz3 = _mm_add_ps(fiz3,tz);
2033 fjx3 = _mm_add_ps(fjx3,tx);
2034 fjy3 = _mm_add_ps(fjy3,ty);
2035 fjz3 = _mm_add_ps(fjz3,tz);
2039 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
2040 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
2041 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2043 /* Inner loop uses 270 flops */
2046 /* End of innermost loop */
/* Pass the nine i-force accumulators, the i force location (DIM past
 * i_coord_offset) and this list's slot in fshift to the i-force helper.
 */
2048 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2049 f+i_coord_offset+DIM,fshift+i_shift_offset);
2051 /* Increment number of inner iterations */
2052 inneriter += j_index_end - j_index_start;
2054 /* Outer loop uses 27 flops */
2057 /* Increment number of outer iterations */
2060 /* Update outer/inner flops */
2062 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*270);