2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
72 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
74 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
82 __m128 dummy_mask,cutoff_mask;
83 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
84 __m128 one = _mm_set1_ps(1.0);
85 __m128 two = _mm_set1_ps(2.0);
91 jindex = nlist->jindex;
93 shiftidx = nlist->shift;
95 shiftvec = fr->shift_vec[0];
96 fshift = fr->fshift[0];
97 facel = _mm_set1_ps(fr->epsfac);
98 charge = mdatoms->chargeA;
99 krf = _mm_set1_ps(fr->ic->k_rf);
100 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
101 crf = _mm_set1_ps(fr->ic->c_rf);
103 /* Setup water-specific parameters */
104 inr = nlist->iinr[0];
105 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
106 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
107 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
109 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
110 rcutoff_scalar = fr->rcoulomb;
111 rcutoff = _mm_set1_ps(rcutoff_scalar);
112 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
114 /* Avoid stupid compiler warnings */
115 jnrA = jnrB = jnrC = jnrD = 0;
124 for(iidx=0;iidx<4*DIM;iidx++)
129 /* Start outer loop over neighborlists */
130 for(iidx=0; iidx<nri; iidx++)
132 /* Load shift vector for this list */
133 i_shift_offset = DIM*shiftidx[iidx];
135 /* Load limits for loop over neighbors */
136 j_index_start = jindex[iidx];
137 j_index_end = jindex[iidx+1];
139 /* Get outer coordinate index */
141 i_coord_offset = DIM*inr;
143 /* Load i particle coords and add shift vector */
144 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
145 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
147 fix1 = _mm_setzero_ps();
148 fiy1 = _mm_setzero_ps();
149 fiz1 = _mm_setzero_ps();
150 fix2 = _mm_setzero_ps();
151 fiy2 = _mm_setzero_ps();
152 fiz2 = _mm_setzero_ps();
153 fix3 = _mm_setzero_ps();
154 fiy3 = _mm_setzero_ps();
155 fiz3 = _mm_setzero_ps();
157 /* Reset potential sums */
158 velecsum = _mm_setzero_ps();
160 /* Start inner kernel loop */
161 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
164 /* Get j neighbor index, and coordinate index */
169 j_coord_offsetA = DIM*jnrA;
170 j_coord_offsetB = DIM*jnrB;
171 j_coord_offsetC = DIM*jnrC;
172 j_coord_offsetD = DIM*jnrD;
174 /* load j atom coordinates */
175 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
176 x+j_coord_offsetC,x+j_coord_offsetD,
179 /* Calculate displacement vector */
180 dx10 = _mm_sub_ps(ix1,jx0);
181 dy10 = _mm_sub_ps(iy1,jy0);
182 dz10 = _mm_sub_ps(iz1,jz0);
183 dx20 = _mm_sub_ps(ix2,jx0);
184 dy20 = _mm_sub_ps(iy2,jy0);
185 dz20 = _mm_sub_ps(iz2,jz0);
186 dx30 = _mm_sub_ps(ix3,jx0);
187 dy30 = _mm_sub_ps(iy3,jy0);
188 dz30 = _mm_sub_ps(iz3,jz0);
190 /* Calculate squared distance and things based on it */
191 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
192 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
193 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
195 rinv10 = gmx_mm_invsqrt_ps(rsq10);
196 rinv20 = gmx_mm_invsqrt_ps(rsq20);
197 rinv30 = gmx_mm_invsqrt_ps(rsq30);
199 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
200 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
201 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
203 /* Load parameters for j particles */
204 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
205 charge+jnrC+0,charge+jnrD+0);
207 /**************************
208 * CALCULATE INTERACTIONS *
209 **************************/
211 if (gmx_mm_any_lt(rsq10,rcutoff2))
214 /* Compute parameters for interactions between i and j atoms */
215 qq10 = _mm_mul_ps(iq1,jq0);
217 /* REACTION-FIELD ELECTROSTATICS */
218 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
219 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
221 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
223 /* Update potential sum for this i atom from the interaction with this j atom. */
224 velec = _mm_and_ps(velec,cutoff_mask);
225 velecsum = _mm_add_ps(velecsum,velec);
229 fscal = _mm_and_ps(fscal,cutoff_mask);
231 /* Calculate temporary vectorial force */
232 tx = _mm_mul_ps(fscal,dx10);
233 ty = _mm_mul_ps(fscal,dy10);
234 tz = _mm_mul_ps(fscal,dz10);
236 /* Update vectorial force */
237 fix1 = _mm_add_ps(fix1,tx);
238 fiy1 = _mm_add_ps(fiy1,ty);
239 fiz1 = _mm_add_ps(fiz1,tz);
241 fjptrA = f+j_coord_offsetA;
242 fjptrB = f+j_coord_offsetB;
243 fjptrC = f+j_coord_offsetC;
244 fjptrD = f+j_coord_offsetD;
245 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
249 /**************************
250 * CALCULATE INTERACTIONS *
251 **************************/
253 if (gmx_mm_any_lt(rsq20,rcutoff2))
256 /* Compute parameters for interactions between i and j atoms */
257 qq20 = _mm_mul_ps(iq2,jq0);
259 /* REACTION-FIELD ELECTROSTATICS */
260 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
261 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
263 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
265 /* Update potential sum for this i atom from the interaction with this j atom. */
266 velec = _mm_and_ps(velec,cutoff_mask);
267 velecsum = _mm_add_ps(velecsum,velec);
271 fscal = _mm_and_ps(fscal,cutoff_mask);
273 /* Calculate temporary vectorial force */
274 tx = _mm_mul_ps(fscal,dx20);
275 ty = _mm_mul_ps(fscal,dy20);
276 tz = _mm_mul_ps(fscal,dz20);
278 /* Update vectorial force */
279 fix2 = _mm_add_ps(fix2,tx);
280 fiy2 = _mm_add_ps(fiy2,ty);
281 fiz2 = _mm_add_ps(fiz2,tz);
283 fjptrA = f+j_coord_offsetA;
284 fjptrB = f+j_coord_offsetB;
285 fjptrC = f+j_coord_offsetC;
286 fjptrD = f+j_coord_offsetD;
287 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 if (gmx_mm_any_lt(rsq30,rcutoff2))
298 /* Compute parameters for interactions between i and j atoms */
299 qq30 = _mm_mul_ps(iq3,jq0);
301 /* REACTION-FIELD ELECTROSTATICS */
302 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
303 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
305 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
307 /* Update potential sum for this i atom from the interaction with this j atom. */
308 velec = _mm_and_ps(velec,cutoff_mask);
309 velecsum = _mm_add_ps(velecsum,velec);
313 fscal = _mm_and_ps(fscal,cutoff_mask);
315 /* Calculate temporary vectorial force */
316 tx = _mm_mul_ps(fscal,dx30);
317 ty = _mm_mul_ps(fscal,dy30);
318 tz = _mm_mul_ps(fscal,dz30);
320 /* Update vectorial force */
321 fix3 = _mm_add_ps(fix3,tx);
322 fiy3 = _mm_add_ps(fiy3,ty);
323 fiz3 = _mm_add_ps(fiz3,tz);
325 fjptrA = f+j_coord_offsetA;
326 fjptrB = f+j_coord_offsetB;
327 fjptrC = f+j_coord_offsetC;
328 fjptrD = f+j_coord_offsetD;
329 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
333 /* Inner loop uses 108 flops */
339 /* Get j neighbor index, and coordinate index */
340 jnrlistA = jjnr[jidx];
341 jnrlistB = jjnr[jidx+1];
342 jnrlistC = jjnr[jidx+2];
343 jnrlistD = jjnr[jidx+3];
344 /* Sign of each element will be negative for non-real atoms.
345 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
346 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
348 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
349 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
350 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
351 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
352 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
353 j_coord_offsetA = DIM*jnrA;
354 j_coord_offsetB = DIM*jnrB;
355 j_coord_offsetC = DIM*jnrC;
356 j_coord_offsetD = DIM*jnrD;
358 /* load j atom coordinates */
359 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
360 x+j_coord_offsetC,x+j_coord_offsetD,
363 /* Calculate displacement vector */
364 dx10 = _mm_sub_ps(ix1,jx0);
365 dy10 = _mm_sub_ps(iy1,jy0);
366 dz10 = _mm_sub_ps(iz1,jz0);
367 dx20 = _mm_sub_ps(ix2,jx0);
368 dy20 = _mm_sub_ps(iy2,jy0);
369 dz20 = _mm_sub_ps(iz2,jz0);
370 dx30 = _mm_sub_ps(ix3,jx0);
371 dy30 = _mm_sub_ps(iy3,jy0);
372 dz30 = _mm_sub_ps(iz3,jz0);
374 /* Calculate squared distance and things based on it */
375 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
376 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
377 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
379 rinv10 = gmx_mm_invsqrt_ps(rsq10);
380 rinv20 = gmx_mm_invsqrt_ps(rsq20);
381 rinv30 = gmx_mm_invsqrt_ps(rsq30);
383 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
384 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
385 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
387 /* Load parameters for j particles */
388 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
389 charge+jnrC+0,charge+jnrD+0);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 if (gmx_mm_any_lt(rsq10,rcutoff2))
398 /* Compute parameters for interactions between i and j atoms */
399 qq10 = _mm_mul_ps(iq1,jq0);
401 /* REACTION-FIELD ELECTROSTATICS */
402 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
403 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
405 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velec = _mm_and_ps(velec,cutoff_mask);
409 velec = _mm_andnot_ps(dummy_mask,velec);
410 velecsum = _mm_add_ps(velecsum,velec);
414 fscal = _mm_and_ps(fscal,cutoff_mask);
416 fscal = _mm_andnot_ps(dummy_mask,fscal);
418 /* Calculate temporary vectorial force */
419 tx = _mm_mul_ps(fscal,dx10);
420 ty = _mm_mul_ps(fscal,dy10);
421 tz = _mm_mul_ps(fscal,dz10);
423 /* Update vectorial force */
424 fix1 = _mm_add_ps(fix1,tx);
425 fiy1 = _mm_add_ps(fiy1,ty);
426 fiz1 = _mm_add_ps(fiz1,tz);
428 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
429 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
430 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
431 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
432 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 if (gmx_mm_any_lt(rsq20,rcutoff2))
443 /* Compute parameters for interactions between i and j atoms */
444 qq20 = _mm_mul_ps(iq2,jq0);
446 /* REACTION-FIELD ELECTROSTATICS */
447 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
448 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
450 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
452 /* Update potential sum for this i atom from the interaction with this j atom. */
453 velec = _mm_and_ps(velec,cutoff_mask);
454 velec = _mm_andnot_ps(dummy_mask,velec);
455 velecsum = _mm_add_ps(velecsum,velec);
459 fscal = _mm_and_ps(fscal,cutoff_mask);
461 fscal = _mm_andnot_ps(dummy_mask,fscal);
463 /* Calculate temporary vectorial force */
464 tx = _mm_mul_ps(fscal,dx20);
465 ty = _mm_mul_ps(fscal,dy20);
466 tz = _mm_mul_ps(fscal,dz20);
468 /* Update vectorial force */
469 fix2 = _mm_add_ps(fix2,tx);
470 fiy2 = _mm_add_ps(fiy2,ty);
471 fiz2 = _mm_add_ps(fiz2,tz);
473 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
474 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
475 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
476 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
477 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 if (gmx_mm_any_lt(rsq30,rcutoff2))
488 /* Compute parameters for interactions between i and j atoms */
489 qq30 = _mm_mul_ps(iq3,jq0);
491 /* REACTION-FIELD ELECTROSTATICS */
492 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
493 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
495 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velec = _mm_and_ps(velec,cutoff_mask);
499 velec = _mm_andnot_ps(dummy_mask,velec);
500 velecsum = _mm_add_ps(velecsum,velec);
504 fscal = _mm_and_ps(fscal,cutoff_mask);
506 fscal = _mm_andnot_ps(dummy_mask,fscal);
508 /* Calculate temporary vectorial force */
509 tx = _mm_mul_ps(fscal,dx30);
510 ty = _mm_mul_ps(fscal,dy30);
511 tz = _mm_mul_ps(fscal,dz30);
513 /* Update vectorial force */
514 fix3 = _mm_add_ps(fix3,tx);
515 fiy3 = _mm_add_ps(fiy3,ty);
516 fiz3 = _mm_add_ps(fiz3,tz);
518 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
519 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
520 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
521 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
522 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
526 /* Inner loop uses 108 flops */
529 /* End of innermost loop */
531 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
532 f+i_coord_offset+DIM,fshift+i_shift_offset);
535 /* Update potential energies */
536 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
538 /* Increment number of inner iterations */
539 inneriter += j_index_end - j_index_start;
541 /* Outer loop uses 19 flops */
544 /* Increment number of outer iterations */
547 /* Update outer/inner flops */
549 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*108);
552 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
553 * Electrostatics interaction: ReactionField
554 * VdW interaction: None
555 * Geometry: Water4-Particle
556 * Calculate force/pot: Force
559 nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
560 (t_nblist * gmx_restrict nlist,
561 rvec * gmx_restrict xx,
562 rvec * gmx_restrict ff,
563 t_forcerec * gmx_restrict fr,
564 t_mdatoms * gmx_restrict mdatoms,
565 nb_kernel_data_t * gmx_restrict kernel_data,
566 t_nrnb * gmx_restrict nrnb)
568 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
569 * just 0 for non-waters.
570 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
571 * jnr indices corresponding to data put in the four positions in the SIMD register.
573 int i_shift_offset,i_coord_offset,outeriter,inneriter;
574 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
575 int jnrA,jnrB,jnrC,jnrD;
576 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
577 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
578 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
580 real *shiftvec,*fshift,*x,*f;
581 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
583 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
585 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
587 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
589 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
590 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
591 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
592 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
593 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
594 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
595 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
597 __m128 dummy_mask,cutoff_mask;
598 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
599 __m128 one = _mm_set1_ps(1.0);
600 __m128 two = _mm_set1_ps(2.0);
606 jindex = nlist->jindex;
608 shiftidx = nlist->shift;
610 shiftvec = fr->shift_vec[0];
611 fshift = fr->fshift[0];
612 facel = _mm_set1_ps(fr->epsfac);
613 charge = mdatoms->chargeA;
614 krf = _mm_set1_ps(fr->ic->k_rf);
615 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
616 crf = _mm_set1_ps(fr->ic->c_rf);
618 /* Setup water-specific parameters */
619 inr = nlist->iinr[0];
620 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
621 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
622 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
624 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
625 rcutoff_scalar = fr->rcoulomb;
626 rcutoff = _mm_set1_ps(rcutoff_scalar);
627 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
629 /* Avoid stupid compiler warnings */
630 jnrA = jnrB = jnrC = jnrD = 0;
639 for(iidx=0;iidx<4*DIM;iidx++)
644 /* Start outer loop over neighborlists */
645 for(iidx=0; iidx<nri; iidx++)
647 /* Load shift vector for this list */
648 i_shift_offset = DIM*shiftidx[iidx];
650 /* Load limits for loop over neighbors */
651 j_index_start = jindex[iidx];
652 j_index_end = jindex[iidx+1];
654 /* Get outer coordinate index */
656 i_coord_offset = DIM*inr;
658 /* Load i particle coords and add shift vector */
659 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
660 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
662 fix1 = _mm_setzero_ps();
663 fiy1 = _mm_setzero_ps();
664 fiz1 = _mm_setzero_ps();
665 fix2 = _mm_setzero_ps();
666 fiy2 = _mm_setzero_ps();
667 fiz2 = _mm_setzero_ps();
668 fix3 = _mm_setzero_ps();
669 fiy3 = _mm_setzero_ps();
670 fiz3 = _mm_setzero_ps();
672 /* Start inner kernel loop */
673 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
676 /* Get j neighbor index, and coordinate index */
681 j_coord_offsetA = DIM*jnrA;
682 j_coord_offsetB = DIM*jnrB;
683 j_coord_offsetC = DIM*jnrC;
684 j_coord_offsetD = DIM*jnrD;
686 /* load j atom coordinates */
687 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
688 x+j_coord_offsetC,x+j_coord_offsetD,
691 /* Calculate displacement vector */
692 dx10 = _mm_sub_ps(ix1,jx0);
693 dy10 = _mm_sub_ps(iy1,jy0);
694 dz10 = _mm_sub_ps(iz1,jz0);
695 dx20 = _mm_sub_ps(ix2,jx0);
696 dy20 = _mm_sub_ps(iy2,jy0);
697 dz20 = _mm_sub_ps(iz2,jz0);
698 dx30 = _mm_sub_ps(ix3,jx0);
699 dy30 = _mm_sub_ps(iy3,jy0);
700 dz30 = _mm_sub_ps(iz3,jz0);
702 /* Calculate squared distance and things based on it */
703 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
704 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
705 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
707 rinv10 = gmx_mm_invsqrt_ps(rsq10);
708 rinv20 = gmx_mm_invsqrt_ps(rsq20);
709 rinv30 = gmx_mm_invsqrt_ps(rsq30);
711 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
712 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
713 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
715 /* Load parameters for j particles */
716 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
717 charge+jnrC+0,charge+jnrD+0);
719 /**************************
720 * CALCULATE INTERACTIONS *
721 **************************/
723 if (gmx_mm_any_lt(rsq10,rcutoff2))
726 /* Compute parameters for interactions between i and j atoms */
727 qq10 = _mm_mul_ps(iq1,jq0);
729 /* REACTION-FIELD ELECTROSTATICS */
730 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
732 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
736 fscal = _mm_and_ps(fscal,cutoff_mask);
738 /* Calculate temporary vectorial force */
739 tx = _mm_mul_ps(fscal,dx10);
740 ty = _mm_mul_ps(fscal,dy10);
741 tz = _mm_mul_ps(fscal,dz10);
743 /* Update vectorial force */
744 fix1 = _mm_add_ps(fix1,tx);
745 fiy1 = _mm_add_ps(fiy1,ty);
746 fiz1 = _mm_add_ps(fiz1,tz);
748 fjptrA = f+j_coord_offsetA;
749 fjptrB = f+j_coord_offsetB;
750 fjptrC = f+j_coord_offsetC;
751 fjptrD = f+j_coord_offsetD;
752 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
756 /**************************
757 * CALCULATE INTERACTIONS *
758 **************************/
760 if (gmx_mm_any_lt(rsq20,rcutoff2))
763 /* Compute parameters for interactions between i and j atoms */
764 qq20 = _mm_mul_ps(iq2,jq0);
766 /* REACTION-FIELD ELECTROSTATICS */
767 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
769 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
773 fscal = _mm_and_ps(fscal,cutoff_mask);
775 /* Calculate temporary vectorial force */
776 tx = _mm_mul_ps(fscal,dx20);
777 ty = _mm_mul_ps(fscal,dy20);
778 tz = _mm_mul_ps(fscal,dz20);
780 /* Update vectorial force */
781 fix2 = _mm_add_ps(fix2,tx);
782 fiy2 = _mm_add_ps(fiy2,ty);
783 fiz2 = _mm_add_ps(fiz2,tz);
785 fjptrA = f+j_coord_offsetA;
786 fjptrB = f+j_coord_offsetB;
787 fjptrC = f+j_coord_offsetC;
788 fjptrD = f+j_coord_offsetD;
789 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 if (gmx_mm_any_lt(rsq30,rcutoff2))
800 /* Compute parameters for interactions between i and j atoms */
801 qq30 = _mm_mul_ps(iq3,jq0);
803 /* REACTION-FIELD ELECTROSTATICS */
804 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
806 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
810 fscal = _mm_and_ps(fscal,cutoff_mask);
812 /* Calculate temporary vectorial force */
813 tx = _mm_mul_ps(fscal,dx30);
814 ty = _mm_mul_ps(fscal,dy30);
815 tz = _mm_mul_ps(fscal,dz30);
817 /* Update vectorial force */
818 fix3 = _mm_add_ps(fix3,tx);
819 fiy3 = _mm_add_ps(fiy3,ty);
820 fiz3 = _mm_add_ps(fiz3,tz);
822 fjptrA = f+j_coord_offsetA;
823 fjptrB = f+j_coord_offsetB;
824 fjptrC = f+j_coord_offsetC;
825 fjptrD = f+j_coord_offsetD;
826 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
830 /* Inner loop uses 90 flops */
836 /* Get j neighbor index, and coordinate index */
837 jnrlistA = jjnr[jidx];
838 jnrlistB = jjnr[jidx+1];
839 jnrlistC = jjnr[jidx+2];
840 jnrlistD = jjnr[jidx+3];
841 /* Sign of each element will be negative for non-real atoms.
842 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
843 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
845 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
846 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
847 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
848 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
849 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
850 j_coord_offsetA = DIM*jnrA;
851 j_coord_offsetB = DIM*jnrB;
852 j_coord_offsetC = DIM*jnrC;
853 j_coord_offsetD = DIM*jnrD;
855 /* load j atom coordinates */
856 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
857 x+j_coord_offsetC,x+j_coord_offsetD,
860 /* Calculate displacement vector */
861 dx10 = _mm_sub_ps(ix1,jx0);
862 dy10 = _mm_sub_ps(iy1,jy0);
863 dz10 = _mm_sub_ps(iz1,jz0);
864 dx20 = _mm_sub_ps(ix2,jx0);
865 dy20 = _mm_sub_ps(iy2,jy0);
866 dz20 = _mm_sub_ps(iz2,jz0);
867 dx30 = _mm_sub_ps(ix3,jx0);
868 dy30 = _mm_sub_ps(iy3,jy0);
869 dz30 = _mm_sub_ps(iz3,jz0);
871 /* Calculate squared distance and things based on it */
872 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
873 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
874 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
876 rinv10 = gmx_mm_invsqrt_ps(rsq10);
877 rinv20 = gmx_mm_invsqrt_ps(rsq20);
878 rinv30 = gmx_mm_invsqrt_ps(rsq30);
880 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
881 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
882 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
884 /* Load parameters for j particles */
885 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
886 charge+jnrC+0,charge+jnrD+0);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 if (gmx_mm_any_lt(rsq10,rcutoff2))
895 /* Compute parameters for interactions between i and j atoms */
896 qq10 = _mm_mul_ps(iq1,jq0);
898 /* REACTION-FIELD ELECTROSTATICS */
899 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
901 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
905 fscal = _mm_and_ps(fscal,cutoff_mask);
907 fscal = _mm_andnot_ps(dummy_mask,fscal);
909 /* Calculate temporary vectorial force */
910 tx = _mm_mul_ps(fscal,dx10);
911 ty = _mm_mul_ps(fscal,dy10);
912 tz = _mm_mul_ps(fscal,dz10);
914 /* Update vectorial force */
915 fix1 = _mm_add_ps(fix1,tx);
916 fiy1 = _mm_add_ps(fiy1,ty);
917 fiz1 = _mm_add_ps(fiz1,tz);
919 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
920 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
921 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
922 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
923 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
927 /**************************
928 * CALCULATE INTERACTIONS *
929 **************************/
931 if (gmx_mm_any_lt(rsq20,rcutoff2))
934 /* Compute parameters for interactions between i and j atoms */
935 qq20 = _mm_mul_ps(iq2,jq0);
937 /* REACTION-FIELD ELECTROSTATICS */
938 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
940 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
944 fscal = _mm_and_ps(fscal,cutoff_mask);
946 fscal = _mm_andnot_ps(dummy_mask,fscal);
948 /* Calculate temporary vectorial force */
949 tx = _mm_mul_ps(fscal,dx20);
950 ty = _mm_mul_ps(fscal,dy20);
951 tz = _mm_mul_ps(fscal,dz20);
953 /* Update vectorial force */
954 fix2 = _mm_add_ps(fix2,tx);
955 fiy2 = _mm_add_ps(fiy2,ty);
956 fiz2 = _mm_add_ps(fiz2,tz);
958 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
959 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
960 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
961 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
962 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 if (gmx_mm_any_lt(rsq30,rcutoff2))
973 /* Compute parameters for interactions between i and j atoms */
974 qq30 = _mm_mul_ps(iq3,jq0);
976 /* REACTION-FIELD ELECTROSTATICS */
977 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
979 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
983 fscal = _mm_and_ps(fscal,cutoff_mask);
985 fscal = _mm_andnot_ps(dummy_mask,fscal);
987 /* Calculate temporary vectorial force */
988 tx = _mm_mul_ps(fscal,dx30);
989 ty = _mm_mul_ps(fscal,dy30);
990 tz = _mm_mul_ps(fscal,dz30);
992 /* Update vectorial force */
993 fix3 = _mm_add_ps(fix3,tx);
994 fiy3 = _mm_add_ps(fiy3,ty);
995 fiz3 = _mm_add_ps(fiz3,tz);
997 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
998 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
999 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1000 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1001 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
1005 /* Inner loop uses 90 flops */
1008 /* End of innermost loop */
1010 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1011 f+i_coord_offset+DIM,fshift+i_shift_offset);
1013 /* Increment number of inner iterations */
1014 inneriter += j_index_end - j_index_start;
1016 /* Outer loop uses 18 flops */
1019 /* Increment number of outer iterations */
1022 /* Update outer/inner flops */
1024 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*90);