2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
75 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
76 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
77 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
80 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
83 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
84 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
85 __m128 dummy_mask,cutoff_mask;
86 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
87 __m128 one = _mm_set1_ps(1.0);
88 __m128 two = _mm_set1_ps(2.0);
94 jindex = nlist->jindex;
96 shiftidx = nlist->shift;
98 shiftvec = fr->shift_vec[0];
99 fshift = fr->fshift[0];
100 facel = _mm_set1_ps(fr->epsfac);
101 charge = mdatoms->chargeA;
102 krf = _mm_set1_ps(fr->ic->k_rf);
103 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
104 crf = _mm_set1_ps(fr->ic->c_rf);
105 nvdwtype = fr->ntype;
107 vdwtype = mdatoms->typeA;
109 /* Setup water-specific parameters */
110 inr = nlist->iinr[0];
111 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
112 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
113 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
114 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
116 /* Avoid stupid compiler warnings */
117 jnrA = jnrB = jnrC = jnrD = 0;
126 /* Start outer loop over neighborlists */
127 for(iidx=0; iidx<nri; iidx++)
129 /* Load shift vector for this list */
130 i_shift_offset = DIM*shiftidx[iidx];
131 shX = shiftvec[i_shift_offset+XX];
132 shY = shiftvec[i_shift_offset+YY];
133 shZ = shiftvec[i_shift_offset+ZZ];
135 /* Load limits for loop over neighbors */
136 j_index_start = jindex[iidx];
137 j_index_end = jindex[iidx+1];
139 /* Get outer coordinate index */
141 i_coord_offset = DIM*inr;
143 /* Load i particle coords and add shift vector */
144 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
145 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
146 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
147 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
148 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
149 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
150 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
151 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
152 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
154 fix0 = _mm_setzero_ps();
155 fiy0 = _mm_setzero_ps();
156 fiz0 = _mm_setzero_ps();
157 fix1 = _mm_setzero_ps();
158 fiy1 = _mm_setzero_ps();
159 fiz1 = _mm_setzero_ps();
160 fix2 = _mm_setzero_ps();
161 fiy2 = _mm_setzero_ps();
162 fiz2 = _mm_setzero_ps();
164 /* Reset potential sums */
165 velecsum = _mm_setzero_ps();
166 vvdwsum = _mm_setzero_ps();
168 /* Start inner kernel loop */
169 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
172 /* Get j neighbor index, and coordinate index */
178 j_coord_offsetA = DIM*jnrA;
179 j_coord_offsetB = DIM*jnrB;
180 j_coord_offsetC = DIM*jnrC;
181 j_coord_offsetD = DIM*jnrD;
183 /* load j atom coordinates */
184 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
185 x+j_coord_offsetC,x+j_coord_offsetD,
188 /* Calculate displacement vector */
189 dx00 = _mm_sub_ps(ix0,jx0);
190 dy00 = _mm_sub_ps(iy0,jy0);
191 dz00 = _mm_sub_ps(iz0,jz0);
192 dx10 = _mm_sub_ps(ix1,jx0);
193 dy10 = _mm_sub_ps(iy1,jy0);
194 dz10 = _mm_sub_ps(iz1,jz0);
195 dx20 = _mm_sub_ps(ix2,jx0);
196 dy20 = _mm_sub_ps(iy2,jy0);
197 dz20 = _mm_sub_ps(iz2,jz0);
199 /* Calculate squared distance and things based on it */
200 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
201 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
202 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
204 rinv00 = gmx_mm_invsqrt_ps(rsq00);
205 rinv10 = gmx_mm_invsqrt_ps(rsq10);
206 rinv20 = gmx_mm_invsqrt_ps(rsq20);
208 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
209 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
210 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
212 /* Load parameters for j particles */
213 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
214 charge+jnrC+0,charge+jnrD+0);
215 vdwjidx0A = 2*vdwtype[jnrA+0];
216 vdwjidx0B = 2*vdwtype[jnrB+0];
217 vdwjidx0C = 2*vdwtype[jnrC+0];
218 vdwjidx0D = 2*vdwtype[jnrD+0];
220 /**************************
221 * CALCULATE INTERACTIONS *
222 **************************/
224 /* Compute parameters for interactions between i and j atoms */
225 qq00 = _mm_mul_ps(iq0,jq0);
226 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
227 vdwparam+vdwioffset0+vdwjidx0B,
228 vdwparam+vdwioffset0+vdwjidx0C,
229 vdwparam+vdwioffset0+vdwjidx0D,
232 /* REACTION-FIELD ELECTROSTATICS */
233 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
234 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
236 /* LENNARD-JONES DISPERSION/REPULSION */
238 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
239 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
240 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
241 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
242 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
244 /* Update potential sum for this i atom from the interaction with this j atom. */
245 velecsum = _mm_add_ps(velecsum,velec);
246 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
248 fscal = _mm_add_ps(felec,fvdw);
250 /* Calculate temporary vectorial force */
251 tx = _mm_mul_ps(fscal,dx00);
252 ty = _mm_mul_ps(fscal,dy00);
253 tz = _mm_mul_ps(fscal,dz00);
255 /* Update vectorial force */
256 fix0 = _mm_add_ps(fix0,tx);
257 fiy0 = _mm_add_ps(fiy0,ty);
258 fiz0 = _mm_add_ps(fiz0,tz);
260 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
261 f+j_coord_offsetC,f+j_coord_offsetD,
264 /**************************
265 * CALCULATE INTERACTIONS *
266 **************************/
268 /* Compute parameters for interactions between i and j atoms */
269 qq10 = _mm_mul_ps(iq1,jq0);
271 /* REACTION-FIELD ELECTROSTATICS */
272 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
273 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
275 /* Update potential sum for this i atom from the interaction with this j atom. */
276 velecsum = _mm_add_ps(velecsum,velec);
280 /* Calculate temporary vectorial force */
281 tx = _mm_mul_ps(fscal,dx10);
282 ty = _mm_mul_ps(fscal,dy10);
283 tz = _mm_mul_ps(fscal,dz10);
285 /* Update vectorial force */
286 fix1 = _mm_add_ps(fix1,tx);
287 fiy1 = _mm_add_ps(fiy1,ty);
288 fiz1 = _mm_add_ps(fiz1,tz);
290 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
291 f+j_coord_offsetC,f+j_coord_offsetD,
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 /* Compute parameters for interactions between i and j atoms */
299 qq20 = _mm_mul_ps(iq2,jq0);
301 /* REACTION-FIELD ELECTROSTATICS */
302 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
303 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
305 /* Update potential sum for this i atom from the interaction with this j atom. */
306 velecsum = _mm_add_ps(velecsum,velec);
310 /* Calculate temporary vectorial force */
311 tx = _mm_mul_ps(fscal,dx20);
312 ty = _mm_mul_ps(fscal,dy20);
313 tz = _mm_mul_ps(fscal,dz20);
315 /* Update vectorial force */
316 fix2 = _mm_add_ps(fix2,tx);
317 fiy2 = _mm_add_ps(fiy2,ty);
318 fiz2 = _mm_add_ps(fiz2,tz);
320 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
321 f+j_coord_offsetC,f+j_coord_offsetD,
324 /* Inner loop uses 108 flops */
330 /* Get j neighbor index, and coordinate index */
336 /* Sign of each element will be negative for non-real atoms.
337 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
338 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
340 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
341 jnrA = (jnrA>=0) ? jnrA : 0;
342 jnrB = (jnrB>=0) ? jnrB : 0;
343 jnrC = (jnrC>=0) ? jnrC : 0;
344 jnrD = (jnrD>=0) ? jnrD : 0;
346 j_coord_offsetA = DIM*jnrA;
347 j_coord_offsetB = DIM*jnrB;
348 j_coord_offsetC = DIM*jnrC;
349 j_coord_offsetD = DIM*jnrD;
351 /* load j atom coordinates */
352 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
353 x+j_coord_offsetC,x+j_coord_offsetD,
356 /* Calculate displacement vector */
357 dx00 = _mm_sub_ps(ix0,jx0);
358 dy00 = _mm_sub_ps(iy0,jy0);
359 dz00 = _mm_sub_ps(iz0,jz0);
360 dx10 = _mm_sub_ps(ix1,jx0);
361 dy10 = _mm_sub_ps(iy1,jy0);
362 dz10 = _mm_sub_ps(iz1,jz0);
363 dx20 = _mm_sub_ps(ix2,jx0);
364 dy20 = _mm_sub_ps(iy2,jy0);
365 dz20 = _mm_sub_ps(iz2,jz0);
367 /* Calculate squared distance and things based on it */
368 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
369 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
370 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
372 rinv00 = gmx_mm_invsqrt_ps(rsq00);
373 rinv10 = gmx_mm_invsqrt_ps(rsq10);
374 rinv20 = gmx_mm_invsqrt_ps(rsq20);
376 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
377 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
378 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
380 /* Load parameters for j particles */
381 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
382 charge+jnrC+0,charge+jnrD+0);
383 vdwjidx0A = 2*vdwtype[jnrA+0];
384 vdwjidx0B = 2*vdwtype[jnrB+0];
385 vdwjidx0C = 2*vdwtype[jnrC+0];
386 vdwjidx0D = 2*vdwtype[jnrD+0];
388 /**************************
389 * CALCULATE INTERACTIONS *
390 **************************/
392 /* Compute parameters for interactions between i and j atoms */
393 qq00 = _mm_mul_ps(iq0,jq0);
394 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
395 vdwparam+vdwioffset0+vdwjidx0B,
396 vdwparam+vdwioffset0+vdwjidx0C,
397 vdwparam+vdwioffset0+vdwjidx0D,
400 /* REACTION-FIELD ELECTROSTATICS */
401 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
402 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
404 /* LENNARD-JONES DISPERSION/REPULSION */
406 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
407 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
408 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
409 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
410 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
412 /* Update potential sum for this i atom from the interaction with this j atom. */
413 velec = _mm_andnot_ps(dummy_mask,velec);
414 velecsum = _mm_add_ps(velecsum,velec);
415 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
416 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
418 fscal = _mm_add_ps(felec,fvdw);
420 fscal = _mm_andnot_ps(dummy_mask,fscal);
422 /* Calculate temporary vectorial force */
423 tx = _mm_mul_ps(fscal,dx00);
424 ty = _mm_mul_ps(fscal,dy00);
425 tz = _mm_mul_ps(fscal,dz00);
427 /* Update vectorial force */
428 fix0 = _mm_add_ps(fix0,tx);
429 fiy0 = _mm_add_ps(fiy0,ty);
430 fiz0 = _mm_add_ps(fiz0,tz);
432 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
433 f+j_coord_offsetC,f+j_coord_offsetD,
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 /* Compute parameters for interactions between i and j atoms */
441 qq10 = _mm_mul_ps(iq1,jq0);
443 /* REACTION-FIELD ELECTROSTATICS */
444 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
445 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velec = _mm_andnot_ps(dummy_mask,velec);
449 velecsum = _mm_add_ps(velecsum,velec);
453 fscal = _mm_andnot_ps(dummy_mask,fscal);
455 /* Calculate temporary vectorial force */
456 tx = _mm_mul_ps(fscal,dx10);
457 ty = _mm_mul_ps(fscal,dy10);
458 tz = _mm_mul_ps(fscal,dz10);
460 /* Update vectorial force */
461 fix1 = _mm_add_ps(fix1,tx);
462 fiy1 = _mm_add_ps(fiy1,ty);
463 fiz1 = _mm_add_ps(fiz1,tz);
465 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
466 f+j_coord_offsetC,f+j_coord_offsetD,
469 /**************************
470 * CALCULATE INTERACTIONS *
471 **************************/
473 /* Compute parameters for interactions between i and j atoms */
474 qq20 = _mm_mul_ps(iq2,jq0);
476 /* REACTION-FIELD ELECTROSTATICS */
477 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
478 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
480 /* Update potential sum for this i atom from the interaction with this j atom. */
481 velec = _mm_andnot_ps(dummy_mask,velec);
482 velecsum = _mm_add_ps(velecsum,velec);
486 fscal = _mm_andnot_ps(dummy_mask,fscal);
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_ps(fscal,dx20);
490 ty = _mm_mul_ps(fscal,dy20);
491 tz = _mm_mul_ps(fscal,dz20);
493 /* Update vectorial force */
494 fix2 = _mm_add_ps(fix2,tx);
495 fiy2 = _mm_add_ps(fiy2,ty);
496 fiz2 = _mm_add_ps(fiz2,tz);
498 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
499 f+j_coord_offsetC,f+j_coord_offsetD,
502 /* Inner loop uses 108 flops */
505 /* End of innermost loop */
507 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
508 f+i_coord_offset,fshift+i_shift_offset);
511 /* Update potential energies */
512 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
513 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
515 /* Increment number of inner iterations */
516 inneriter += j_index_end - j_index_start;
518 /* Outer loop uses 29 flops */
521 /* Increment number of outer iterations */
524 /* Update outer/inner flops */
526 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*108);
529 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
530 * Electrostatics interaction: ReactionField
531 * VdW interaction: LennardJones
532 * Geometry: Water3-Particle
533 * Calculate force/pot: Force
536 nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
537 (t_nblist * gmx_restrict nlist,
538 rvec * gmx_restrict xx,
539 rvec * gmx_restrict ff,
540 t_forcerec * gmx_restrict fr,
541 t_mdatoms * gmx_restrict mdatoms,
542 nb_kernel_data_t * gmx_restrict kernel_data,
543 t_nrnb * gmx_restrict nrnb)
545 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
546 * just 0 for non-waters.
547 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
548 * jnr indices corresponding to data put in the four positions in the SIMD register.
550 int i_shift_offset,i_coord_offset,outeriter,inneriter;
551 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
552 int jnrA,jnrB,jnrC,jnrD;
553 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
554 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
555 real shX,shY,shZ,rcutoff_scalar;
556 real *shiftvec,*fshift,*x,*f;
557 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
559 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
561 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
563 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
564 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
565 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
566 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
567 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
568 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
569 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
572 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
575 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
576 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
577 __m128 dummy_mask,cutoff_mask;
578 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
579 __m128 one = _mm_set1_ps(1.0);
580 __m128 two = _mm_set1_ps(2.0);
586 jindex = nlist->jindex;
588 shiftidx = nlist->shift;
590 shiftvec = fr->shift_vec[0];
591 fshift = fr->fshift[0];
592 facel = _mm_set1_ps(fr->epsfac);
593 charge = mdatoms->chargeA;
594 krf = _mm_set1_ps(fr->ic->k_rf);
595 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
596 crf = _mm_set1_ps(fr->ic->c_rf);
597 nvdwtype = fr->ntype;
599 vdwtype = mdatoms->typeA;
601 /* Setup water-specific parameters */
602 inr = nlist->iinr[0];
603 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
604 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
605 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
606 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
608 /* Avoid stupid compiler warnings */
609 jnrA = jnrB = jnrC = jnrD = 0;
618 /* Start outer loop over neighborlists */
619 for(iidx=0; iidx<nri; iidx++)
621 /* Load shift vector for this list */
622 i_shift_offset = DIM*shiftidx[iidx];
623 shX = shiftvec[i_shift_offset+XX];
624 shY = shiftvec[i_shift_offset+YY];
625 shZ = shiftvec[i_shift_offset+ZZ];
627 /* Load limits for loop over neighbors */
628 j_index_start = jindex[iidx];
629 j_index_end = jindex[iidx+1];
631 /* Get outer coordinate index */
633 i_coord_offset = DIM*inr;
635 /* Load i particle coords and add shift vector */
636 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
637 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
638 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
639 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
640 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
641 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
642 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
643 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
644 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
646 fix0 = _mm_setzero_ps();
647 fiy0 = _mm_setzero_ps();
648 fiz0 = _mm_setzero_ps();
649 fix1 = _mm_setzero_ps();
650 fiy1 = _mm_setzero_ps();
651 fiz1 = _mm_setzero_ps();
652 fix2 = _mm_setzero_ps();
653 fiy2 = _mm_setzero_ps();
654 fiz2 = _mm_setzero_ps();
656 /* Start inner kernel loop */
657 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
660 /* Get j neighbor index, and coordinate index */
666 j_coord_offsetA = DIM*jnrA;
667 j_coord_offsetB = DIM*jnrB;
668 j_coord_offsetC = DIM*jnrC;
669 j_coord_offsetD = DIM*jnrD;
671 /* load j atom coordinates */
672 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
673 x+j_coord_offsetC,x+j_coord_offsetD,
676 /* Calculate displacement vector */
677 dx00 = _mm_sub_ps(ix0,jx0);
678 dy00 = _mm_sub_ps(iy0,jy0);
679 dz00 = _mm_sub_ps(iz0,jz0);
680 dx10 = _mm_sub_ps(ix1,jx0);
681 dy10 = _mm_sub_ps(iy1,jy0);
682 dz10 = _mm_sub_ps(iz1,jz0);
683 dx20 = _mm_sub_ps(ix2,jx0);
684 dy20 = _mm_sub_ps(iy2,jy0);
685 dz20 = _mm_sub_ps(iz2,jz0);
687 /* Calculate squared distance and things based on it */
688 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
689 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
690 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
692 rinv00 = gmx_mm_invsqrt_ps(rsq00);
693 rinv10 = gmx_mm_invsqrt_ps(rsq10);
694 rinv20 = gmx_mm_invsqrt_ps(rsq20);
696 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
697 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
698 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
700 /* Load parameters for j particles */
701 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
702 charge+jnrC+0,charge+jnrD+0);
703 vdwjidx0A = 2*vdwtype[jnrA+0];
704 vdwjidx0B = 2*vdwtype[jnrB+0];
705 vdwjidx0C = 2*vdwtype[jnrC+0];
706 vdwjidx0D = 2*vdwtype[jnrD+0];
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 /* Compute parameters for interactions between i and j atoms */
713 qq00 = _mm_mul_ps(iq0,jq0);
714 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
715 vdwparam+vdwioffset0+vdwjidx0B,
716 vdwparam+vdwioffset0+vdwjidx0C,
717 vdwparam+vdwioffset0+vdwjidx0D,
720 /* REACTION-FIELD ELECTROSTATICS */
721 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
723 /* LENNARD-JONES DISPERSION/REPULSION */
725 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
726 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
728 fscal = _mm_add_ps(felec,fvdw);
730 /* Calculate temporary vectorial force */
731 tx = _mm_mul_ps(fscal,dx00);
732 ty = _mm_mul_ps(fscal,dy00);
733 tz = _mm_mul_ps(fscal,dz00);
735 /* Update vectorial force */
736 fix0 = _mm_add_ps(fix0,tx);
737 fiy0 = _mm_add_ps(fiy0,ty);
738 fiz0 = _mm_add_ps(fiz0,tz);
740 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
741 f+j_coord_offsetC,f+j_coord_offsetD,
744 /**************************
745 * CALCULATE INTERACTIONS *
746 **************************/
748 /* Compute parameters for interactions between i and j atoms */
749 qq10 = _mm_mul_ps(iq1,jq0);
751 /* REACTION-FIELD ELECTROSTATICS */
752 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
756 /* Calculate temporary vectorial force */
757 tx = _mm_mul_ps(fscal,dx10);
758 ty = _mm_mul_ps(fscal,dy10);
759 tz = _mm_mul_ps(fscal,dz10);
761 /* Update vectorial force */
762 fix1 = _mm_add_ps(fix1,tx);
763 fiy1 = _mm_add_ps(fiy1,ty);
764 fiz1 = _mm_add_ps(fiz1,tz);
766 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
767 f+j_coord_offsetC,f+j_coord_offsetD,
770 /**************************
771 * CALCULATE INTERACTIONS *
772 **************************/
774 /* Compute parameters for interactions between i and j atoms */
775 qq20 = _mm_mul_ps(iq2,jq0);
777 /* REACTION-FIELD ELECTROSTATICS */
778 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
782 /* Calculate temporary vectorial force */
783 tx = _mm_mul_ps(fscal,dx20);
784 ty = _mm_mul_ps(fscal,dy20);
785 tz = _mm_mul_ps(fscal,dz20);
787 /* Update vectorial force */
788 fix2 = _mm_add_ps(fix2,tx);
789 fiy2 = _mm_add_ps(fiy2,ty);
790 fiz2 = _mm_add_ps(fiz2,tz);
792 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
793 f+j_coord_offsetC,f+j_coord_offsetD,
796 /* Inner loop uses 88 flops */
802 /* Get j neighbor index, and coordinate index */
808 /* Sign of each element will be negative for non-real atoms.
809 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
810 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
812 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
813 jnrA = (jnrA>=0) ? jnrA : 0;
814 jnrB = (jnrB>=0) ? jnrB : 0;
815 jnrC = (jnrC>=0) ? jnrC : 0;
816 jnrD = (jnrD>=0) ? jnrD : 0;
818 j_coord_offsetA = DIM*jnrA;
819 j_coord_offsetB = DIM*jnrB;
820 j_coord_offsetC = DIM*jnrC;
821 j_coord_offsetD = DIM*jnrD;
823 /* load j atom coordinates */
824 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
825 x+j_coord_offsetC,x+j_coord_offsetD,
828 /* Calculate displacement vector */
829 dx00 = _mm_sub_ps(ix0,jx0);
830 dy00 = _mm_sub_ps(iy0,jy0);
831 dz00 = _mm_sub_ps(iz0,jz0);
832 dx10 = _mm_sub_ps(ix1,jx0);
833 dy10 = _mm_sub_ps(iy1,jy0);
834 dz10 = _mm_sub_ps(iz1,jz0);
835 dx20 = _mm_sub_ps(ix2,jx0);
836 dy20 = _mm_sub_ps(iy2,jy0);
837 dz20 = _mm_sub_ps(iz2,jz0);
839 /* Calculate squared distance and things based on it */
840 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
841 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
842 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
844 rinv00 = gmx_mm_invsqrt_ps(rsq00);
845 rinv10 = gmx_mm_invsqrt_ps(rsq10);
846 rinv20 = gmx_mm_invsqrt_ps(rsq20);
848 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
849 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
850 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
852 /* Load parameters for j particles */
853 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
854 charge+jnrC+0,charge+jnrD+0);
855 vdwjidx0A = 2*vdwtype[jnrA+0];
856 vdwjidx0B = 2*vdwtype[jnrB+0];
857 vdwjidx0C = 2*vdwtype[jnrC+0];
858 vdwjidx0D = 2*vdwtype[jnrD+0];
860 /**************************
861 * CALCULATE INTERACTIONS *
862 **************************/
864 /* Compute parameters for interactions between i and j atoms */
865 qq00 = _mm_mul_ps(iq0,jq0);
866 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
867 vdwparam+vdwioffset0+vdwjidx0B,
868 vdwparam+vdwioffset0+vdwjidx0C,
869 vdwparam+vdwioffset0+vdwjidx0D,
872 /* REACTION-FIELD ELECTROSTATICS */
873 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
875 /* LENNARD-JONES DISPERSION/REPULSION */
877 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
878 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
880 fscal = _mm_add_ps(felec,fvdw);
882 fscal = _mm_andnot_ps(dummy_mask,fscal);
884 /* Calculate temporary vectorial force */
885 tx = _mm_mul_ps(fscal,dx00);
886 ty = _mm_mul_ps(fscal,dy00);
887 tz = _mm_mul_ps(fscal,dz00);
889 /* Update vectorial force */
890 fix0 = _mm_add_ps(fix0,tx);
891 fiy0 = _mm_add_ps(fiy0,ty);
892 fiz0 = _mm_add_ps(fiz0,tz);
894 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
895 f+j_coord_offsetC,f+j_coord_offsetD,
898 /**************************
899 * CALCULATE INTERACTIONS *
900 **************************/
902 /* Compute parameters for interactions between i and j atoms */
903 qq10 = _mm_mul_ps(iq1,jq0);
905 /* REACTION-FIELD ELECTROSTATICS */
906 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
910 fscal = _mm_andnot_ps(dummy_mask,fscal);
912 /* Calculate temporary vectorial force */
913 tx = _mm_mul_ps(fscal,dx10);
914 ty = _mm_mul_ps(fscal,dy10);
915 tz = _mm_mul_ps(fscal,dz10);
917 /* Update vectorial force */
918 fix1 = _mm_add_ps(fix1,tx);
919 fiy1 = _mm_add_ps(fiy1,ty);
920 fiz1 = _mm_add_ps(fiz1,tz);
922 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
923 f+j_coord_offsetC,f+j_coord_offsetD,
926 /**************************
927 * CALCULATE INTERACTIONS *
928 **************************/
930 /* Compute parameters for interactions between i and j atoms */
931 qq20 = _mm_mul_ps(iq2,jq0);
933 /* REACTION-FIELD ELECTROSTATICS */
934 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
938 fscal = _mm_andnot_ps(dummy_mask,fscal);
940 /* Calculate temporary vectorial force */
941 tx = _mm_mul_ps(fscal,dx20);
942 ty = _mm_mul_ps(fscal,dy20);
943 tz = _mm_mul_ps(fscal,dz20);
945 /* Update vectorial force */
946 fix2 = _mm_add_ps(fix2,tx);
947 fiy2 = _mm_add_ps(fiy2,ty);
948 fiz2 = _mm_add_ps(fiz2,tz);
950 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
951 f+j_coord_offsetC,f+j_coord_offsetD,
954 /* Inner loop uses 88 flops */
957 /* End of innermost loop */
959 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
960 f+i_coord_offset,fshift+i_shift_offset);
962 /* Increment number of inner iterations */
963 inneriter += j_index_end - j_index_start;
965 /* Outer loop uses 27 flops */
968 /* Increment number of outer iterations */
971 /* Update outer/inner flops */
973 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);