2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
72 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
74 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
85 __m128 dummy_mask,cutoff_mask;
86 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
87 __m128 one = _mm_set1_ps(1.0);
88 __m128 two = _mm_set1_ps(2.0);
94 jindex = nlist->jindex;
96 shiftidx = nlist->shift;
98 shiftvec = fr->shift_vec[0];
99 fshift = fr->fshift[0];
100 facel = _mm_set1_ps(fr->epsfac);
101 charge = mdatoms->chargeA;
103 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
104 ewtab = fr->ic->tabq_coul_FDV0;
105 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
106 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
108 /* Setup water-specific parameters */
109 inr = nlist->iinr[0];
110 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
111 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
112 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
114 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
115 rcutoff_scalar = fr->rcoulomb;
116 rcutoff = _mm_set1_ps(rcutoff_scalar);
117 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
119 /* Avoid stupid compiler warnings */
120 jnrA = jnrB = jnrC = jnrD = 0;
129 for(iidx=0;iidx<4*DIM;iidx++)
134 /* Start outer loop over neighborlists */
135 for(iidx=0; iidx<nri; iidx++)
137 /* Load shift vector for this list */
138 i_shift_offset = DIM*shiftidx[iidx];
140 /* Load limits for loop over neighbors */
141 j_index_start = jindex[iidx];
142 j_index_end = jindex[iidx+1];
144 /* Get outer coordinate index */
146 i_coord_offset = DIM*inr;
148 /* Load i particle coords and add shift vector */
149 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
150 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
152 fix1 = _mm_setzero_ps();
153 fiy1 = _mm_setzero_ps();
154 fiz1 = _mm_setzero_ps();
155 fix2 = _mm_setzero_ps();
156 fiy2 = _mm_setzero_ps();
157 fiz2 = _mm_setzero_ps();
158 fix3 = _mm_setzero_ps();
159 fiy3 = _mm_setzero_ps();
160 fiz3 = _mm_setzero_ps();
162 /* Reset potential sums */
163 velecsum = _mm_setzero_ps();
165 /* Start inner kernel loop */
166 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
169 /* Get j neighbor index, and coordinate index */
174 j_coord_offsetA = DIM*jnrA;
175 j_coord_offsetB = DIM*jnrB;
176 j_coord_offsetC = DIM*jnrC;
177 j_coord_offsetD = DIM*jnrD;
179 /* load j atom coordinates */
180 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
181 x+j_coord_offsetC,x+j_coord_offsetD,
184 /* Calculate displacement vector */
185 dx10 = _mm_sub_ps(ix1,jx0);
186 dy10 = _mm_sub_ps(iy1,jy0);
187 dz10 = _mm_sub_ps(iz1,jz0);
188 dx20 = _mm_sub_ps(ix2,jx0);
189 dy20 = _mm_sub_ps(iy2,jy0);
190 dz20 = _mm_sub_ps(iz2,jz0);
191 dx30 = _mm_sub_ps(ix3,jx0);
192 dy30 = _mm_sub_ps(iy3,jy0);
193 dz30 = _mm_sub_ps(iz3,jz0);
195 /* Calculate squared distance and things based on it */
196 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
197 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
198 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
200 rinv10 = gmx_mm_invsqrt_ps(rsq10);
201 rinv20 = gmx_mm_invsqrt_ps(rsq20);
202 rinv30 = gmx_mm_invsqrt_ps(rsq30);
204 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
205 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
206 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
208 /* Load parameters for j particles */
209 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
210 charge+jnrC+0,charge+jnrD+0);
212 /**************************
213 * CALCULATE INTERACTIONS *
214 **************************/
216 if (gmx_mm_any_lt(rsq10,rcutoff2))
219 r10 = _mm_mul_ps(rsq10,rinv10);
221 /* Compute parameters for interactions between i and j atoms */
222 qq10 = _mm_mul_ps(iq1,jq0);
224 /* EWALD ELECTROSTATICS */
226 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
227 ewrt = _mm_mul_ps(r10,ewtabscale);
228 ewitab = _mm_cvttps_epi32(ewrt);
229 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
230 ewitab = _mm_slli_epi32(ewitab,2);
231 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
232 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
233 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
234 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
235 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
236 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
237 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
238 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
239 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
241 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
243 /* Update potential sum for this i atom from the interaction with this j atom. */
244 velec = _mm_and_ps(velec,cutoff_mask);
245 velecsum = _mm_add_ps(velecsum,velec);
249 fscal = _mm_and_ps(fscal,cutoff_mask);
251 /* Calculate temporary vectorial force */
252 tx = _mm_mul_ps(fscal,dx10);
253 ty = _mm_mul_ps(fscal,dy10);
254 tz = _mm_mul_ps(fscal,dz10);
256 /* Update vectorial force */
257 fix1 = _mm_add_ps(fix1,tx);
258 fiy1 = _mm_add_ps(fiy1,ty);
259 fiz1 = _mm_add_ps(fiz1,tz);
261 fjptrA = f+j_coord_offsetA;
262 fjptrB = f+j_coord_offsetB;
263 fjptrC = f+j_coord_offsetC;
264 fjptrD = f+j_coord_offsetD;
265 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
269 /**************************
270 * CALCULATE INTERACTIONS *
271 **************************/
273 if (gmx_mm_any_lt(rsq20,rcutoff2))
276 r20 = _mm_mul_ps(rsq20,rinv20);
278 /* Compute parameters for interactions between i and j atoms */
279 qq20 = _mm_mul_ps(iq2,jq0);
281 /* EWALD ELECTROSTATICS */
283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
284 ewrt = _mm_mul_ps(r20,ewtabscale);
285 ewitab = _mm_cvttps_epi32(ewrt);
286 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
287 ewitab = _mm_slli_epi32(ewitab,2);
288 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
289 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
290 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
291 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
292 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
293 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
294 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
295 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
296 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
298 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
300 /* Update potential sum for this i atom from the interaction with this j atom. */
301 velec = _mm_and_ps(velec,cutoff_mask);
302 velecsum = _mm_add_ps(velecsum,velec);
306 fscal = _mm_and_ps(fscal,cutoff_mask);
308 /* Calculate temporary vectorial force */
309 tx = _mm_mul_ps(fscal,dx20);
310 ty = _mm_mul_ps(fscal,dy20);
311 tz = _mm_mul_ps(fscal,dz20);
313 /* Update vectorial force */
314 fix2 = _mm_add_ps(fix2,tx);
315 fiy2 = _mm_add_ps(fiy2,ty);
316 fiz2 = _mm_add_ps(fiz2,tz);
318 fjptrA = f+j_coord_offsetA;
319 fjptrB = f+j_coord_offsetB;
320 fjptrC = f+j_coord_offsetC;
321 fjptrD = f+j_coord_offsetD;
322 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
326 /**************************
327 * CALCULATE INTERACTIONS *
328 **************************/
330 if (gmx_mm_any_lt(rsq30,rcutoff2))
333 r30 = _mm_mul_ps(rsq30,rinv30);
335 /* Compute parameters for interactions between i and j atoms */
336 qq30 = _mm_mul_ps(iq3,jq0);
338 /* EWALD ELECTROSTATICS */
340 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
341 ewrt = _mm_mul_ps(r30,ewtabscale);
342 ewitab = _mm_cvttps_epi32(ewrt);
343 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
344 ewitab = _mm_slli_epi32(ewitab,2);
345 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
346 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
347 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
348 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
349 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
350 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
351 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
352 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
353 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
355 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velec = _mm_and_ps(velec,cutoff_mask);
359 velecsum = _mm_add_ps(velecsum,velec);
363 fscal = _mm_and_ps(fscal,cutoff_mask);
365 /* Calculate temporary vectorial force */
366 tx = _mm_mul_ps(fscal,dx30);
367 ty = _mm_mul_ps(fscal,dy30);
368 tz = _mm_mul_ps(fscal,dz30);
370 /* Update vectorial force */
371 fix3 = _mm_add_ps(fix3,tx);
372 fiy3 = _mm_add_ps(fiy3,ty);
373 fiz3 = _mm_add_ps(fiz3,tz);
375 fjptrA = f+j_coord_offsetA;
376 fjptrB = f+j_coord_offsetB;
377 fjptrC = f+j_coord_offsetC;
378 fjptrD = f+j_coord_offsetD;
379 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
383 /* Inner loop uses 138 flops */
389 /* Get j neighbor index, and coordinate index */
390 jnrlistA = jjnr[jidx];
391 jnrlistB = jjnr[jidx+1];
392 jnrlistC = jjnr[jidx+2];
393 jnrlistD = jjnr[jidx+3];
394 /* Sign of each element will be negative for non-real atoms.
395 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
396 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
398 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
399 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
400 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
401 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
402 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
403 j_coord_offsetA = DIM*jnrA;
404 j_coord_offsetB = DIM*jnrB;
405 j_coord_offsetC = DIM*jnrC;
406 j_coord_offsetD = DIM*jnrD;
408 /* load j atom coordinates */
409 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
410 x+j_coord_offsetC,x+j_coord_offsetD,
413 /* Calculate displacement vector */
414 dx10 = _mm_sub_ps(ix1,jx0);
415 dy10 = _mm_sub_ps(iy1,jy0);
416 dz10 = _mm_sub_ps(iz1,jz0);
417 dx20 = _mm_sub_ps(ix2,jx0);
418 dy20 = _mm_sub_ps(iy2,jy0);
419 dz20 = _mm_sub_ps(iz2,jz0);
420 dx30 = _mm_sub_ps(ix3,jx0);
421 dy30 = _mm_sub_ps(iy3,jy0);
422 dz30 = _mm_sub_ps(iz3,jz0);
424 /* Calculate squared distance and things based on it */
425 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
426 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
427 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
429 rinv10 = gmx_mm_invsqrt_ps(rsq10);
430 rinv20 = gmx_mm_invsqrt_ps(rsq20);
431 rinv30 = gmx_mm_invsqrt_ps(rsq30);
433 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
434 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
435 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
437 /* Load parameters for j particles */
438 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
439 charge+jnrC+0,charge+jnrD+0);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 if (gmx_mm_any_lt(rsq10,rcutoff2))
448 r10 = _mm_mul_ps(rsq10,rinv10);
449 r10 = _mm_andnot_ps(dummy_mask,r10);
451 /* Compute parameters for interactions between i and j atoms */
452 qq10 = _mm_mul_ps(iq1,jq0);
454 /* EWALD ELECTROSTATICS */
456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
457 ewrt = _mm_mul_ps(r10,ewtabscale);
458 ewitab = _mm_cvttps_epi32(ewrt);
459 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
460 ewitab = _mm_slli_epi32(ewitab,2);
461 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
462 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
463 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
464 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
465 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
466 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
467 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
468 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
469 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
471 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velec = _mm_and_ps(velec,cutoff_mask);
475 velec = _mm_andnot_ps(dummy_mask,velec);
476 velecsum = _mm_add_ps(velecsum,velec);
480 fscal = _mm_and_ps(fscal,cutoff_mask);
482 fscal = _mm_andnot_ps(dummy_mask,fscal);
484 /* Calculate temporary vectorial force */
485 tx = _mm_mul_ps(fscal,dx10);
486 ty = _mm_mul_ps(fscal,dy10);
487 tz = _mm_mul_ps(fscal,dz10);
489 /* Update vectorial force */
490 fix1 = _mm_add_ps(fix1,tx);
491 fiy1 = _mm_add_ps(fiy1,ty);
492 fiz1 = _mm_add_ps(fiz1,tz);
494 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
495 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
496 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
497 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
498 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 if (gmx_mm_any_lt(rsq20,rcutoff2))
509 r20 = _mm_mul_ps(rsq20,rinv20);
510 r20 = _mm_andnot_ps(dummy_mask,r20);
512 /* Compute parameters for interactions between i and j atoms */
513 qq20 = _mm_mul_ps(iq2,jq0);
515 /* EWALD ELECTROSTATICS */
517 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
518 ewrt = _mm_mul_ps(r20,ewtabscale);
519 ewitab = _mm_cvttps_epi32(ewrt);
520 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
521 ewitab = _mm_slli_epi32(ewitab,2);
522 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
523 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
524 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
525 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
526 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
527 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
528 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
529 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
530 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
532 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velec = _mm_and_ps(velec,cutoff_mask);
536 velec = _mm_andnot_ps(dummy_mask,velec);
537 velecsum = _mm_add_ps(velecsum,velec);
541 fscal = _mm_and_ps(fscal,cutoff_mask);
543 fscal = _mm_andnot_ps(dummy_mask,fscal);
545 /* Calculate temporary vectorial force */
546 tx = _mm_mul_ps(fscal,dx20);
547 ty = _mm_mul_ps(fscal,dy20);
548 tz = _mm_mul_ps(fscal,dz20);
550 /* Update vectorial force */
551 fix2 = _mm_add_ps(fix2,tx);
552 fiy2 = _mm_add_ps(fiy2,ty);
553 fiz2 = _mm_add_ps(fiz2,tz);
555 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
556 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
557 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
558 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
559 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 if (gmx_mm_any_lt(rsq30,rcutoff2))
570 r30 = _mm_mul_ps(rsq30,rinv30);
571 r30 = _mm_andnot_ps(dummy_mask,r30);
573 /* Compute parameters for interactions between i and j atoms */
574 qq30 = _mm_mul_ps(iq3,jq0);
576 /* EWALD ELECTROSTATICS */
578 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
579 ewrt = _mm_mul_ps(r30,ewtabscale);
580 ewitab = _mm_cvttps_epi32(ewrt);
581 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
582 ewitab = _mm_slli_epi32(ewitab,2);
583 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
584 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
585 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
586 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
587 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
588 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
589 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
590 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
591 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
593 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
595 /* Update potential sum for this i atom from the interaction with this j atom. */
596 velec = _mm_and_ps(velec,cutoff_mask);
597 velec = _mm_andnot_ps(dummy_mask,velec);
598 velecsum = _mm_add_ps(velecsum,velec);
602 fscal = _mm_and_ps(fscal,cutoff_mask);
604 fscal = _mm_andnot_ps(dummy_mask,fscal);
606 /* Calculate temporary vectorial force */
607 tx = _mm_mul_ps(fscal,dx30);
608 ty = _mm_mul_ps(fscal,dy30);
609 tz = _mm_mul_ps(fscal,dz30);
611 /* Update vectorial force */
612 fix3 = _mm_add_ps(fix3,tx);
613 fiy3 = _mm_add_ps(fiy3,ty);
614 fiz3 = _mm_add_ps(fiz3,tz);
616 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
617 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
618 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
619 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
620 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
624 /* Inner loop uses 141 flops */
627 /* End of innermost loop */
629 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
630 f+i_coord_offset+DIM,fshift+i_shift_offset);
633 /* Update potential energies */
634 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
636 /* Increment number of inner iterations */
637 inneriter += j_index_end - j_index_start;
639 /* Outer loop uses 19 flops */
642 /* Increment number of outer iterations */
645 /* Update outer/inner flops */
647 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141);
650 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
651 * Electrostatics interaction: Ewald
652 * VdW interaction: None
653 * Geometry: Water4-Particle
654 * Calculate force/pot: Force
657 nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
658 (t_nblist * gmx_restrict nlist,
659 rvec * gmx_restrict xx,
660 rvec * gmx_restrict ff,
661 t_forcerec * gmx_restrict fr,
662 t_mdatoms * gmx_restrict mdatoms,
663 nb_kernel_data_t * gmx_restrict kernel_data,
664 t_nrnb * gmx_restrict nrnb)
666 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
667 * just 0 for non-waters.
668 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
669 * jnr indices corresponding to data put in the four positions in the SIMD register.
671 int i_shift_offset,i_coord_offset,outeriter,inneriter;
672 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
673 int jnrA,jnrB,jnrC,jnrD;
674 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
675 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
676 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
678 real *shiftvec,*fshift,*x,*f;
679 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
681 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
683 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
685 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
687 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
688 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
689 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
690 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
691 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
692 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
693 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
696 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
698 __m128 dummy_mask,cutoff_mask;
699 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
700 __m128 one = _mm_set1_ps(1.0);
701 __m128 two = _mm_set1_ps(2.0);
707 jindex = nlist->jindex;
709 shiftidx = nlist->shift;
711 shiftvec = fr->shift_vec[0];
712 fshift = fr->fshift[0];
713 facel = _mm_set1_ps(fr->epsfac);
714 charge = mdatoms->chargeA;
716 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
717 ewtab = fr->ic->tabq_coul_F;
718 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
719 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
721 /* Setup water-specific parameters */
722 inr = nlist->iinr[0];
723 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
724 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
725 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
727 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
728 rcutoff_scalar = fr->rcoulomb;
729 rcutoff = _mm_set1_ps(rcutoff_scalar);
730 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
732 /* Avoid stupid compiler warnings */
733 jnrA = jnrB = jnrC = jnrD = 0;
742 for(iidx=0;iidx<4*DIM;iidx++)
747 /* Start outer loop over neighborlists */
748 for(iidx=0; iidx<nri; iidx++)
750 /* Load shift vector for this list */
751 i_shift_offset = DIM*shiftidx[iidx];
753 /* Load limits for loop over neighbors */
754 j_index_start = jindex[iidx];
755 j_index_end = jindex[iidx+1];
757 /* Get outer coordinate index */
759 i_coord_offset = DIM*inr;
761 /* Load i particle coords and add shift vector */
762 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
763 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
765 fix1 = _mm_setzero_ps();
766 fiy1 = _mm_setzero_ps();
767 fiz1 = _mm_setzero_ps();
768 fix2 = _mm_setzero_ps();
769 fiy2 = _mm_setzero_ps();
770 fiz2 = _mm_setzero_ps();
771 fix3 = _mm_setzero_ps();
772 fiy3 = _mm_setzero_ps();
773 fiz3 = _mm_setzero_ps();
775 /* Start inner kernel loop */
776 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
779 /* Get j neighbor index, and coordinate index */
784 j_coord_offsetA = DIM*jnrA;
785 j_coord_offsetB = DIM*jnrB;
786 j_coord_offsetC = DIM*jnrC;
787 j_coord_offsetD = DIM*jnrD;
789 /* load j atom coordinates */
790 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
791 x+j_coord_offsetC,x+j_coord_offsetD,
794 /* Calculate displacement vector */
795 dx10 = _mm_sub_ps(ix1,jx0);
796 dy10 = _mm_sub_ps(iy1,jy0);
797 dz10 = _mm_sub_ps(iz1,jz0);
798 dx20 = _mm_sub_ps(ix2,jx0);
799 dy20 = _mm_sub_ps(iy2,jy0);
800 dz20 = _mm_sub_ps(iz2,jz0);
801 dx30 = _mm_sub_ps(ix3,jx0);
802 dy30 = _mm_sub_ps(iy3,jy0);
803 dz30 = _mm_sub_ps(iz3,jz0);
805 /* Calculate squared distance and things based on it */
806 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
807 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
808 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
810 rinv10 = gmx_mm_invsqrt_ps(rsq10);
811 rinv20 = gmx_mm_invsqrt_ps(rsq20);
812 rinv30 = gmx_mm_invsqrt_ps(rsq30);
814 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
815 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
816 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
818 /* Load parameters for j particles */
819 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
820 charge+jnrC+0,charge+jnrD+0);
822 /**************************
823 * CALCULATE INTERACTIONS *
824 **************************/
826 if (gmx_mm_any_lt(rsq10,rcutoff2))
829 r10 = _mm_mul_ps(rsq10,rinv10);
831 /* Compute parameters for interactions between i and j atoms */
832 qq10 = _mm_mul_ps(iq1,jq0);
834 /* EWALD ELECTROSTATICS */
836 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
837 ewrt = _mm_mul_ps(r10,ewtabscale);
838 ewitab = _mm_cvttps_epi32(ewrt);
839 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
840 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
841 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
843 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
844 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
846 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
850 fscal = _mm_and_ps(fscal,cutoff_mask);
852 /* Calculate temporary vectorial force */
853 tx = _mm_mul_ps(fscal,dx10);
854 ty = _mm_mul_ps(fscal,dy10);
855 tz = _mm_mul_ps(fscal,dz10);
857 /* Update vectorial force */
858 fix1 = _mm_add_ps(fix1,tx);
859 fiy1 = _mm_add_ps(fiy1,ty);
860 fiz1 = _mm_add_ps(fiz1,tz);
862 fjptrA = f+j_coord_offsetA;
863 fjptrB = f+j_coord_offsetB;
864 fjptrC = f+j_coord_offsetC;
865 fjptrD = f+j_coord_offsetD;
866 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 if (gmx_mm_any_lt(rsq20,rcutoff2))
877 r20 = _mm_mul_ps(rsq20,rinv20);
879 /* Compute parameters for interactions between i and j atoms */
880 qq20 = _mm_mul_ps(iq2,jq0);
882 /* EWALD ELECTROSTATICS */
884 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
885 ewrt = _mm_mul_ps(r20,ewtabscale);
886 ewitab = _mm_cvttps_epi32(ewrt);
887 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
888 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
889 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
891 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
892 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
894 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
898 fscal = _mm_and_ps(fscal,cutoff_mask);
900 /* Calculate temporary vectorial force */
901 tx = _mm_mul_ps(fscal,dx20);
902 ty = _mm_mul_ps(fscal,dy20);
903 tz = _mm_mul_ps(fscal,dz20);
905 /* Update vectorial force */
906 fix2 = _mm_add_ps(fix2,tx);
907 fiy2 = _mm_add_ps(fiy2,ty);
908 fiz2 = _mm_add_ps(fiz2,tz);
910 fjptrA = f+j_coord_offsetA;
911 fjptrB = f+j_coord_offsetB;
912 fjptrC = f+j_coord_offsetC;
913 fjptrD = f+j_coord_offsetD;
914 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 if (gmx_mm_any_lt(rsq30,rcutoff2))
925 r30 = _mm_mul_ps(rsq30,rinv30);
927 /* Compute parameters for interactions between i and j atoms */
928 qq30 = _mm_mul_ps(iq3,jq0);
930 /* EWALD ELECTROSTATICS */
932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
933 ewrt = _mm_mul_ps(r30,ewtabscale);
934 ewitab = _mm_cvttps_epi32(ewrt);
935 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
936 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
937 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
939 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
940 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
942 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
946 fscal = _mm_and_ps(fscal,cutoff_mask);
948 /* Calculate temporary vectorial force */
949 tx = _mm_mul_ps(fscal,dx30);
950 ty = _mm_mul_ps(fscal,dy30);
951 tz = _mm_mul_ps(fscal,dz30);
953 /* Update vectorial force */
954 fix3 = _mm_add_ps(fix3,tx);
955 fiy3 = _mm_add_ps(fiy3,ty);
956 fiz3 = _mm_add_ps(fiz3,tz);
958 fjptrA = f+j_coord_offsetA;
959 fjptrB = f+j_coord_offsetB;
960 fjptrC = f+j_coord_offsetC;
961 fjptrD = f+j_coord_offsetD;
962 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
966 /* Inner loop uses 117 flops */
972 /* Get j neighbor index, and coordinate index */
973 jnrlistA = jjnr[jidx];
974 jnrlistB = jjnr[jidx+1];
975 jnrlistC = jjnr[jidx+2];
976 jnrlistD = jjnr[jidx+3];
977 /* Sign of each element will be negative for non-real atoms.
978 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
979 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
981 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
982 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
983 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
984 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
985 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
986 j_coord_offsetA = DIM*jnrA;
987 j_coord_offsetB = DIM*jnrB;
988 j_coord_offsetC = DIM*jnrC;
989 j_coord_offsetD = DIM*jnrD;
991 /* load j atom coordinates */
992 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
993 x+j_coord_offsetC,x+j_coord_offsetD,
996 /* Calculate displacement vector */
997 dx10 = _mm_sub_ps(ix1,jx0);
998 dy10 = _mm_sub_ps(iy1,jy0);
999 dz10 = _mm_sub_ps(iz1,jz0);
1000 dx20 = _mm_sub_ps(ix2,jx0);
1001 dy20 = _mm_sub_ps(iy2,jy0);
1002 dz20 = _mm_sub_ps(iz2,jz0);
1003 dx30 = _mm_sub_ps(ix3,jx0);
1004 dy30 = _mm_sub_ps(iy3,jy0);
1005 dz30 = _mm_sub_ps(iz3,jz0);
1007 /* Calculate squared distance and things based on it */
1008 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1009 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1010 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1012 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1013 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1014 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1016 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1017 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1018 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1020 /* Load parameters for j particles */
1021 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1022 charge+jnrC+0,charge+jnrD+0);
1024 /**************************
1025 * CALCULATE INTERACTIONS *
1026 **************************/
1028 if (gmx_mm_any_lt(rsq10,rcutoff2))
1031 r10 = _mm_mul_ps(rsq10,rinv10);
1032 r10 = _mm_andnot_ps(dummy_mask,r10);
1034 /* Compute parameters for interactions between i and j atoms */
1035 qq10 = _mm_mul_ps(iq1,jq0);
1037 /* EWALD ELECTROSTATICS */
1039 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1040 ewrt = _mm_mul_ps(r10,ewtabscale);
1041 ewitab = _mm_cvttps_epi32(ewrt);
1042 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1043 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1044 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1046 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1047 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1049 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1053 fscal = _mm_and_ps(fscal,cutoff_mask);
1055 fscal = _mm_andnot_ps(dummy_mask,fscal);
1057 /* Calculate temporary vectorial force */
1058 tx = _mm_mul_ps(fscal,dx10);
1059 ty = _mm_mul_ps(fscal,dy10);
1060 tz = _mm_mul_ps(fscal,dz10);
1062 /* Update vectorial force */
1063 fix1 = _mm_add_ps(fix1,tx);
1064 fiy1 = _mm_add_ps(fiy1,ty);
1065 fiz1 = _mm_add_ps(fiz1,tz);
1067 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1068 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1069 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1070 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1071 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 if (gmx_mm_any_lt(rsq20,rcutoff2))
1082 r20 = _mm_mul_ps(rsq20,rinv20);
1083 r20 = _mm_andnot_ps(dummy_mask,r20);
1085 /* Compute parameters for interactions between i and j atoms */
1086 qq20 = _mm_mul_ps(iq2,jq0);
1088 /* EWALD ELECTROSTATICS */
1090 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1091 ewrt = _mm_mul_ps(r20,ewtabscale);
1092 ewitab = _mm_cvttps_epi32(ewrt);
1093 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1094 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1095 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1097 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1098 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1100 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1104 fscal = _mm_and_ps(fscal,cutoff_mask);
1106 fscal = _mm_andnot_ps(dummy_mask,fscal);
1108 /* Calculate temporary vectorial force */
1109 tx = _mm_mul_ps(fscal,dx20);
1110 ty = _mm_mul_ps(fscal,dy20);
1111 tz = _mm_mul_ps(fscal,dz20);
1113 /* Update vectorial force */
1114 fix2 = _mm_add_ps(fix2,tx);
1115 fiy2 = _mm_add_ps(fiy2,ty);
1116 fiz2 = _mm_add_ps(fiz2,tz);
1118 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1119 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1120 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1121 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1122 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
1126 /**************************
1127 * CALCULATE INTERACTIONS *
1128 **************************/
1130 if (gmx_mm_any_lt(rsq30,rcutoff2))
1133 r30 = _mm_mul_ps(rsq30,rinv30);
1134 r30 = _mm_andnot_ps(dummy_mask,r30);
1136 /* Compute parameters for interactions between i and j atoms */
1137 qq30 = _mm_mul_ps(iq3,jq0);
1139 /* EWALD ELECTROSTATICS */
1141 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1142 ewrt = _mm_mul_ps(r30,ewtabscale);
1143 ewitab = _mm_cvttps_epi32(ewrt);
1144 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1145 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1146 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1148 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1149 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1151 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1155 fscal = _mm_and_ps(fscal,cutoff_mask);
1157 fscal = _mm_andnot_ps(dummy_mask,fscal);
1159 /* Calculate temporary vectorial force */
1160 tx = _mm_mul_ps(fscal,dx30);
1161 ty = _mm_mul_ps(fscal,dy30);
1162 tz = _mm_mul_ps(fscal,dz30);
1164 /* Update vectorial force */
1165 fix3 = _mm_add_ps(fix3,tx);
1166 fiy3 = _mm_add_ps(fiy3,ty);
1167 fiz3 = _mm_add_ps(fiz3,tz);
1169 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1170 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1171 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1172 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1173 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
1177 /* Inner loop uses 120 flops */
1180 /* End of innermost loop */
1182 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1183 f+i_coord_offset+DIM,fshift+i_shift_offset);
1185 /* Increment number of inner iterations */
1186 inneriter += j_index_end - j_index_start;
1188 /* Outer loop uses 18 flops */
1191 /* Increment number of outer iterations */
1194 /* Update outer/inner flops */
1196 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);