2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force kernel: reaction-field electrostatics, no VdW term.
 *
 * For every neighbor-list entry, three charged i sites (suffixes 1, 2, 3,
 * read at offsets DIM*1..DIM*3 from the i coordinate offset) interact with
 * single j particles, processed four at a time in the lanes of an __m128.
 *
 * Parameters, as far as the visible code uses them:
 *   nlist       - neighbor list; jindex, shift and iinr[0] are read.
 *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf.
 *   mdatoms     - supplies chargeA.
 *   kernel_data - energygrp_elec receives the accumulated potential.
 *   nrnb        - flop accounting, updated through inc_nrnb().
 *   xx, ff      - not referenced in the visible lines; presumably the
 *                 sources of x and f - TODO confirm.
 *
 * NOTE(review): this listing appears to be missing lines. No assignment is
 * visible for fscal, x, f, charge, nri, jjnr, ggid, outeriter or inneriter,
 * jnrA..jnrD are only zeroed and clamped, and the braces of the function
 * and its loops are absent. Confirm against the complete file before
 * changing any code here.
 */
44 nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
69 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
71 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
75 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
76 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
77 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
79 __m128 dummy_mask,cutoff_mask;
80 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
81 __m128 one = _mm_set1_ps(1.0);
82 __m128 two = _mm_set1_ps(2.0);
88 jindex = nlist->jindex;
90 shiftidx = nlist->shift;
92 shiftvec = fr->shift_vec[0];
93 fshift = fr->fshift[0];
94 facel = _mm_set1_ps(fr->epsfac);
95 charge = mdatoms->chargeA;
96 krf = _mm_set1_ps(fr->ic->k_rf);
97 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
98 crf = _mm_set1_ps(fr->ic->c_rf);
100 /* Setup water-specific parameters */
/* The three i-site charges are read once, from the first i entry
 * (iinr[0]) at offsets +1..+3, and pre-multiplied by facel (fr->epsfac).
 * They are not reloaded inside the outer loop. */
101 inr = nlist->iinr[0];
102 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
103 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
104 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
106 /* Avoid stupid compiler warnings */
107 jnrA = jnrB = jnrC = jnrD = 0;
116 /* Start outer loop over neighborlists */
117 for(iidx=0; iidx<nri; iidx++)
119 /* Load shift vector for this list */
120 i_shift_offset = DIM*shiftidx[iidx];
121 shX = shiftvec[i_shift_offset+XX];
122 shY = shiftvec[i_shift_offset+YY];
123 shZ = shiftvec[i_shift_offset+ZZ];
125 /* Load limits for loop over neighbors */
126 j_index_start = jindex[iidx];
127 j_index_end = jindex[iidx+1];
129 /* Get outer coordinate index */
131 i_coord_offset = DIM*inr;
133 /* Load i particle coords and add shift vector */
134 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
135 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
136 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
137 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
138 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
139 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
140 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
141 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
142 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
144 fix1 = _mm_setzero_ps();
145 fiy1 = _mm_setzero_ps();
146 fiz1 = _mm_setzero_ps();
147 fix2 = _mm_setzero_ps();
148 fiy2 = _mm_setzero_ps();
149 fiz2 = _mm_setzero_ps();
150 fix3 = _mm_setzero_ps();
151 fiy3 = _mm_setzero_ps();
152 fiz3 = _mm_setzero_ps();
154 /* Reset potential sums */
155 velecsum = _mm_setzero_ps();
157 /* Start inner kernel loop */
/* Steps through the j list four entries at a time and stops as soon as
 * the fourth entry of the current group (jjnr[jidx+3]) is negative. */
158 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
161 /* Get j neighbor index, and coordinate index */
167 j_coord_offsetA = DIM*jnrA;
168 j_coord_offsetB = DIM*jnrB;
169 j_coord_offsetC = DIM*jnrC;
170 j_coord_offsetD = DIM*jnrD;
172 /* load j atom coordinates */
173 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
174 x+j_coord_offsetC,x+j_coord_offsetD,
177 /* Calculate displacement vector */
178 dx10 = _mm_sub_ps(ix1,jx0);
179 dy10 = _mm_sub_ps(iy1,jy0);
180 dz10 = _mm_sub_ps(iz1,jz0);
181 dx20 = _mm_sub_ps(ix2,jx0);
182 dy20 = _mm_sub_ps(iy2,jy0);
183 dz20 = _mm_sub_ps(iz2,jz0);
184 dx30 = _mm_sub_ps(ix3,jx0);
185 dy30 = _mm_sub_ps(iy3,jy0);
186 dz30 = _mm_sub_ps(iz3,jz0);
188 /* Calculate squared distance and things based on it */
189 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
190 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
191 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
193 rinv10 = gmx_mm_invsqrt_ps(rsq10);
194 rinv20 = gmx_mm_invsqrt_ps(rsq20);
195 rinv30 = gmx_mm_invsqrt_ps(rsq30);
197 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
198 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
199 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
201 /* Load parameters for j particles */
202 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
203 charge+jnrC+0,charge+jnrD+0);
205 /**************************
206 * CALCULATE INTERACTIONS *
207 **************************/
209 /* Compute parameters for interactions between i and j atoms */
210 qq10 = _mm_mul_ps(iq1,jq0);
212 /* REACTION-FIELD ELECTROSTATICS */
/* velec = qq*(rinv + krf*rsq - crf);  felec = qq*(rinv*rinvsq - krf2),
 * with krf2 = 2*k_rf. The same two expressions are repeated below for
 * sites 2 and 3. */
213 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
214 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
216 /* Update potential sum for this i atom from the interaction with this j atom. */
217 velecsum = _mm_add_ps(velecsum,velec);
221 /* Calculate temporary vectorial force */
/* NOTE(review): fscal is read here but no assignment to it is visible in
 * this listing; presumably it is set from felec on a missing line - TODO
 * confirm. The same applies to every later use of fscal. */
222 tx = _mm_mul_ps(fscal,dx10);
223 ty = _mm_mul_ps(fscal,dy10);
224 tz = _mm_mul_ps(fscal,dz10);
226 /* Update vectorial force */
227 fix1 = _mm_add_ps(fix1,tx);
228 fiy1 = _mm_add_ps(fiy1,ty);
229 fiz1 = _mm_add_ps(fiz1,tz);
231 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
232 f+j_coord_offsetC,f+j_coord_offsetD,
235 /**************************
236 * CALCULATE INTERACTIONS *
237 **************************/
239 /* Compute parameters for interactions between i and j atoms */
240 qq20 = _mm_mul_ps(iq2,jq0);
242 /* REACTION-FIELD ELECTROSTATICS */
243 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
244 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
246 /* Update potential sum for this i atom from the interaction with this j atom. */
247 velecsum = _mm_add_ps(velecsum,velec);
251 /* Calculate temporary vectorial force */
252 tx = _mm_mul_ps(fscal,dx20);
253 ty = _mm_mul_ps(fscal,dy20);
254 tz = _mm_mul_ps(fscal,dz20);
256 /* Update vectorial force */
257 fix2 = _mm_add_ps(fix2,tx);
258 fiy2 = _mm_add_ps(fiy2,ty);
259 fiz2 = _mm_add_ps(fiz2,tz);
261 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
262 f+j_coord_offsetC,f+j_coord_offsetD,
265 /**************************
266 * CALCULATE INTERACTIONS *
267 **************************/
269 /* Compute parameters for interactions between i and j atoms */
270 qq30 = _mm_mul_ps(iq3,jq0);
272 /* REACTION-FIELD ELECTROSTATICS */
273 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
274 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
276 /* Update potential sum for this i atom from the interaction with this j atom. */
277 velecsum = _mm_add_ps(velecsum,velec);
281 /* Calculate temporary vectorial force */
282 tx = _mm_mul_ps(fscal,dx30);
283 ty = _mm_mul_ps(fscal,dy30);
284 tz = _mm_mul_ps(fscal,dz30);
286 /* Update vectorial force */
287 fix3 = _mm_add_ps(fix3,tx);
288 fiy3 = _mm_add_ps(fiy3,ty);
289 fiz3 = _mm_add_ps(fiz3,tz);
291 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
292 f+j_coord_offsetC,f+j_coord_offsetD,
295 /* Inner loop uses 96 flops */
/* What follows is a second copy of the interaction code in which velec
 * and fscal are additionally masked with dummy_mask. Presumably it handles
 * a final group of four j entries that contains negative (padding)
 * indices; the condition guarding it is not visible here - TODO confirm. */
301 /* Get j neighbor index, and coordinate index */
307 /* Sign of each element will be negative for non-real atoms.
308 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
309 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
311 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
312 jnrA = (jnrA>=0) ? jnrA : 0;
313 jnrB = (jnrB>=0) ? jnrB : 0;
314 jnrC = (jnrC>=0) ? jnrC : 0;
315 jnrD = (jnrD>=0) ? jnrD : 0;
317 j_coord_offsetA = DIM*jnrA;
318 j_coord_offsetB = DIM*jnrB;
319 j_coord_offsetC = DIM*jnrC;
320 j_coord_offsetD = DIM*jnrD;
322 /* load j atom coordinates */
323 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
324 x+j_coord_offsetC,x+j_coord_offsetD,
327 /* Calculate displacement vector */
328 dx10 = _mm_sub_ps(ix1,jx0);
329 dy10 = _mm_sub_ps(iy1,jy0);
330 dz10 = _mm_sub_ps(iz1,jz0);
331 dx20 = _mm_sub_ps(ix2,jx0);
332 dy20 = _mm_sub_ps(iy2,jy0);
333 dz20 = _mm_sub_ps(iz2,jz0);
334 dx30 = _mm_sub_ps(ix3,jx0);
335 dy30 = _mm_sub_ps(iy3,jy0);
336 dz30 = _mm_sub_ps(iz3,jz0);
338 /* Calculate squared distance and things based on it */
339 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
340 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
341 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
343 rinv10 = gmx_mm_invsqrt_ps(rsq10);
344 rinv20 = gmx_mm_invsqrt_ps(rsq20);
345 rinv30 = gmx_mm_invsqrt_ps(rsq30);
347 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
348 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
349 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
351 /* Load parameters for j particles */
352 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
353 charge+jnrC+0,charge+jnrD+0);
355 /**************************
356 * CALCULATE INTERACTIONS *
357 **************************/
359 /* Compute parameters for interactions between i and j atoms */
360 qq10 = _mm_mul_ps(iq1,jq0);
362 /* REACTION-FIELD ELECTROSTATICS */
363 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
364 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
366 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Zero the lanes flagged by dummy_mask before summing, so they add
 * nothing to velecsum. */
367 velec = _mm_andnot_ps(dummy_mask,velec);
368 velecsum = _mm_add_ps(velecsum,velec);
/* Likewise zero fscal in the flagged lanes, so tx/ty/tz are zero there. */
372 fscal = _mm_andnot_ps(dummy_mask,fscal);
374 /* Calculate temporary vectorial force */
375 tx = _mm_mul_ps(fscal,dx10);
376 ty = _mm_mul_ps(fscal,dy10);
377 tz = _mm_mul_ps(fscal,dz10);
379 /* Update vectorial force */
380 fix1 = _mm_add_ps(fix1,tx);
381 fiy1 = _mm_add_ps(fiy1,ty);
382 fiz1 = _mm_add_ps(fiz1,tz);
384 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
385 f+j_coord_offsetC,f+j_coord_offsetD,
388 /**************************
389 * CALCULATE INTERACTIONS *
390 **************************/
392 /* Compute parameters for interactions between i and j atoms */
393 qq20 = _mm_mul_ps(iq2,jq0);
395 /* REACTION-FIELD ELECTROSTATICS */
396 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
397 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velec = _mm_andnot_ps(dummy_mask,velec);
401 velecsum = _mm_add_ps(velecsum,velec);
405 fscal = _mm_andnot_ps(dummy_mask,fscal);
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_ps(fscal,dx20);
409 ty = _mm_mul_ps(fscal,dy20);
410 tz = _mm_mul_ps(fscal,dz20);
412 /* Update vectorial force */
413 fix2 = _mm_add_ps(fix2,tx);
414 fiy2 = _mm_add_ps(fiy2,ty);
415 fiz2 = _mm_add_ps(fiz2,tz);
417 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
418 f+j_coord_offsetC,f+j_coord_offsetD,
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 /* Compute parameters for interactions between i and j atoms */
426 qq30 = _mm_mul_ps(iq3,jq0);
428 /* REACTION-FIELD ELECTROSTATICS */
429 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
430 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm_andnot_ps(dummy_mask,velec);
434 velecsum = _mm_add_ps(velecsum,velec);
438 fscal = _mm_andnot_ps(dummy_mask,fscal);
440 /* Calculate temporary vectorial force */
441 tx = _mm_mul_ps(fscal,dx30);
442 ty = _mm_mul_ps(fscal,dy30);
443 tz = _mm_mul_ps(fscal,dz30);
445 /* Update vectorial force */
446 fix3 = _mm_add_ps(fix3,tx);
447 fiy3 = _mm_add_ps(fiy3,ty);
448 fiz3 = _mm_add_ps(fiz3,tz);
450 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
451 f+j_coord_offsetC,f+j_coord_offsetD,
454 /* Inner loop uses 96 flops */
457 /* End of innermost loop */
/* Hand the nine i-force accumulators to the helper together with the
 * force address of the second i site (f+i_coord_offset+DIM) and the shift
 * force slot for this list entry. Presumably the helper reduces the four
 * lanes and adds the result to both - TODO confirm against its definition. */
459 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
460 f+i_coord_offset+DIM,fshift+i_shift_offset);
463 /* Update potential energies */
464 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
466 /* Increment number of inner iterations */
467 inneriter += j_index_end - j_index_start;
469 /* Outer loop uses 28 flops */
472 /* Increment number of outer iterations */
475 /* Update outer/inner flops */
/* Flop estimate: 28 per outer iteration plus 96 per inner j entry. */
477 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*96);
480 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
481 * Electrostatics interaction: ReactionField
482 * VdW interaction: None
483 * Geometry: Water4-Particle
484 * Calculate force/pot: Force
/*
 * Force-only kernel: reaction-field electrostatics, no VdW term.
 *
 * Same layout as the potential-and-force kernel of this file, but only
 * felec is evaluated; no potential is computed or stored in the visible
 * code. Three charged i sites (suffixes 1, 2, 3) interact with single j
 * particles, four j entries per __m128.
 *
 * Parameters, as far as the visible code uses them:
 *   nlist       - neighbor list; jindex, shift and iinr[0] are read.
 *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf.
 *   mdatoms     - supplies chargeA.
 *   kernel_data - not referenced in the visible lines.
 *   nrnb        - flop accounting, updated through inc_nrnb().
 *   xx, ff      - not referenced in the visible lines; presumably the
 *                 sources of x and f - TODO confirm.
 *
 * NOTE(review): this listing appears to be missing lines. No assignment is
 * visible for fscal, x, f, charge, nri, jjnr, outeriter or inneriter,
 * jnrA..jnrD are only zeroed and clamped, and the braces of the function
 * and its loops are absent. Confirm against the complete file before
 * changing any code here.
 */
487 nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
488 (t_nblist * gmx_restrict nlist,
489 rvec * gmx_restrict xx,
490 rvec * gmx_restrict ff,
491 t_forcerec * gmx_restrict fr,
492 t_mdatoms * gmx_restrict mdatoms,
493 nb_kernel_data_t * gmx_restrict kernel_data,
494 t_nrnb * gmx_restrict nrnb)
496 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
497 * just 0 for non-waters.
498 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
499 * jnr indices corresponding to data put in the four positions in the SIMD register.
501 int i_shift_offset,i_coord_offset,outeriter,inneriter;
502 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
503 int jnrA,jnrB,jnrC,jnrD;
504 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
505 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
506 real shX,shY,shZ,rcutoff_scalar;
507 real *shiftvec,*fshift,*x,*f;
508 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
510 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
512 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
514 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
515 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
516 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
517 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
518 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
519 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
520 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
522 __m128 dummy_mask,cutoff_mask;
523 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
524 __m128 one = _mm_set1_ps(1.0);
525 __m128 two = _mm_set1_ps(2.0);
531 jindex = nlist->jindex;
533 shiftidx = nlist->shift;
535 shiftvec = fr->shift_vec[0];
536 fshift = fr->fshift[0];
537 facel = _mm_set1_ps(fr->epsfac);
538 charge = mdatoms->chargeA;
539 krf = _mm_set1_ps(fr->ic->k_rf);
540 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
541 crf = _mm_set1_ps(fr->ic->c_rf);
543 /* Setup water-specific parameters */
/* The three i-site charges are read once, from the first i entry
 * (iinr[0]) at offsets +1..+3, and pre-multiplied by facel (fr->epsfac).
 * They are not reloaded inside the outer loop. */
544 inr = nlist->iinr[0];
545 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
546 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
547 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
549 /* Avoid stupid compiler warnings */
550 jnrA = jnrB = jnrC = jnrD = 0;
559 /* Start outer loop over neighborlists */
560 for(iidx=0; iidx<nri; iidx++)
562 /* Load shift vector for this list */
563 i_shift_offset = DIM*shiftidx[iidx];
564 shX = shiftvec[i_shift_offset+XX];
565 shY = shiftvec[i_shift_offset+YY];
566 shZ = shiftvec[i_shift_offset+ZZ];
568 /* Load limits for loop over neighbors */
569 j_index_start = jindex[iidx];
570 j_index_end = jindex[iidx+1];
572 /* Get outer coordinate index */
574 i_coord_offset = DIM*inr;
576 /* Load i particle coords and add shift vector */
577 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
578 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
579 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
580 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
581 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
582 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
583 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
584 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
585 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
587 fix1 = _mm_setzero_ps();
588 fiy1 = _mm_setzero_ps();
589 fiz1 = _mm_setzero_ps();
590 fix2 = _mm_setzero_ps();
591 fiy2 = _mm_setzero_ps();
592 fiz2 = _mm_setzero_ps();
593 fix3 = _mm_setzero_ps();
594 fiy3 = _mm_setzero_ps();
595 fiz3 = _mm_setzero_ps();
597 /* Start inner kernel loop */
/* Steps through the j list four entries at a time and stops as soon as
 * the fourth entry of the current group (jjnr[jidx+3]) is negative. */
598 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
601 /* Get j neighbor index, and coordinate index */
607 j_coord_offsetA = DIM*jnrA;
608 j_coord_offsetB = DIM*jnrB;
609 j_coord_offsetC = DIM*jnrC;
610 j_coord_offsetD = DIM*jnrD;
612 /* load j atom coordinates */
613 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
614 x+j_coord_offsetC,x+j_coord_offsetD,
617 /* Calculate displacement vector */
618 dx10 = _mm_sub_ps(ix1,jx0);
619 dy10 = _mm_sub_ps(iy1,jy0);
620 dz10 = _mm_sub_ps(iz1,jz0);
621 dx20 = _mm_sub_ps(ix2,jx0);
622 dy20 = _mm_sub_ps(iy2,jy0);
623 dz20 = _mm_sub_ps(iz2,jz0);
624 dx30 = _mm_sub_ps(ix3,jx0);
625 dy30 = _mm_sub_ps(iy3,jy0);
626 dz30 = _mm_sub_ps(iz3,jz0);
628 /* Calculate squared distance and things based on it */
629 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
630 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
631 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
633 rinv10 = gmx_mm_invsqrt_ps(rsq10);
634 rinv20 = gmx_mm_invsqrt_ps(rsq20);
635 rinv30 = gmx_mm_invsqrt_ps(rsq30);
637 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
638 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
639 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
641 /* Load parameters for j particles */
642 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
643 charge+jnrC+0,charge+jnrD+0);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 /* Compute parameters for interactions between i and j atoms */
650 qq10 = _mm_mul_ps(iq1,jq0);
652 /* REACTION-FIELD ELECTROSTATICS */
/* felec = qq*(rinv*rinvsq - krf2), with krf2 = 2*k_rf. The same
 * expression is repeated below for sites 2 and 3. */
653 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
657 /* Calculate temporary vectorial force */
/* NOTE(review): fscal is read here but no assignment to it is visible in
 * this listing; presumably it is set from felec on a missing line - TODO
 * confirm. The same applies to every later use of fscal. */
658 tx = _mm_mul_ps(fscal,dx10);
659 ty = _mm_mul_ps(fscal,dy10);
660 tz = _mm_mul_ps(fscal,dz10);
662 /* Update vectorial force */
663 fix1 = _mm_add_ps(fix1,tx);
664 fiy1 = _mm_add_ps(fiy1,ty);
665 fiz1 = _mm_add_ps(fiz1,tz);
667 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
668 f+j_coord_offsetC,f+j_coord_offsetD,
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
675 /* Compute parameters for interactions between i and j atoms */
676 qq20 = _mm_mul_ps(iq2,jq0);
678 /* REACTION-FIELD ELECTROSTATICS */
679 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
683 /* Calculate temporary vectorial force */
684 tx = _mm_mul_ps(fscal,dx20);
685 ty = _mm_mul_ps(fscal,dy20);
686 tz = _mm_mul_ps(fscal,dz20);
688 /* Update vectorial force */
689 fix2 = _mm_add_ps(fix2,tx);
690 fiy2 = _mm_add_ps(fiy2,ty);
691 fiz2 = _mm_add_ps(fiz2,tz);
693 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
694 f+j_coord_offsetC,f+j_coord_offsetD,
697 /**************************
698 * CALCULATE INTERACTIONS *
699 **************************/
701 /* Compute parameters for interactions between i and j atoms */
702 qq30 = _mm_mul_ps(iq3,jq0);
704 /* REACTION-FIELD ELECTROSTATICS */
705 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
709 /* Calculate temporary vectorial force */
710 tx = _mm_mul_ps(fscal,dx30);
711 ty = _mm_mul_ps(fscal,dy30);
712 tz = _mm_mul_ps(fscal,dz30);
714 /* Update vectorial force */
715 fix3 = _mm_add_ps(fix3,tx);
716 fiy3 = _mm_add_ps(fiy3,ty);
717 fiz3 = _mm_add_ps(fiz3,tz);
719 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
720 f+j_coord_offsetC,f+j_coord_offsetD,
723 /* Inner loop uses 81 flops */
/* What follows is a second copy of the interaction code in which fscal is
 * additionally masked with dummy_mask. Presumably it handles a final group
 * of four j entries that contains negative (padding) indices; the
 * condition guarding it is not visible here - TODO confirm. */
729 /* Get j neighbor index, and coordinate index */
735 /* Sign of each element will be negative for non-real atoms.
736 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
737 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
739 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
740 jnrA = (jnrA>=0) ? jnrA : 0;
741 jnrB = (jnrB>=0) ? jnrB : 0;
742 jnrC = (jnrC>=0) ? jnrC : 0;
743 jnrD = (jnrD>=0) ? jnrD : 0;
745 j_coord_offsetA = DIM*jnrA;
746 j_coord_offsetB = DIM*jnrB;
747 j_coord_offsetC = DIM*jnrC;
748 j_coord_offsetD = DIM*jnrD;
750 /* load j atom coordinates */
751 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
752 x+j_coord_offsetC,x+j_coord_offsetD,
755 /* Calculate displacement vector */
756 dx10 = _mm_sub_ps(ix1,jx0);
757 dy10 = _mm_sub_ps(iy1,jy0);
758 dz10 = _mm_sub_ps(iz1,jz0);
759 dx20 = _mm_sub_ps(ix2,jx0);
760 dy20 = _mm_sub_ps(iy2,jy0);
761 dz20 = _mm_sub_ps(iz2,jz0);
762 dx30 = _mm_sub_ps(ix3,jx0);
763 dy30 = _mm_sub_ps(iy3,jy0);
764 dz30 = _mm_sub_ps(iz3,jz0);
766 /* Calculate squared distance and things based on it */
767 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
768 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
769 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
771 rinv10 = gmx_mm_invsqrt_ps(rsq10);
772 rinv20 = gmx_mm_invsqrt_ps(rsq20);
773 rinv30 = gmx_mm_invsqrt_ps(rsq30);
775 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
776 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
777 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
779 /* Load parameters for j particles */
780 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
781 charge+jnrC+0,charge+jnrD+0);
783 /**************************
784 * CALCULATE INTERACTIONS *
785 **************************/
787 /* Compute parameters for interactions between i and j atoms */
788 qq10 = _mm_mul_ps(iq1,jq0);
790 /* REACTION-FIELD ELECTROSTATICS */
791 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
/* Zero fscal in the lanes flagged by dummy_mask, so tx/ty/tz are zero
 * there and those lanes add no force. */
795 fscal = _mm_andnot_ps(dummy_mask,fscal);
797 /* Calculate temporary vectorial force */
798 tx = _mm_mul_ps(fscal,dx10);
799 ty = _mm_mul_ps(fscal,dy10);
800 tz = _mm_mul_ps(fscal,dz10);
802 /* Update vectorial force */
803 fix1 = _mm_add_ps(fix1,tx);
804 fiy1 = _mm_add_ps(fiy1,ty);
805 fiz1 = _mm_add_ps(fiz1,tz);
807 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
808 f+j_coord_offsetC,f+j_coord_offsetD,
811 /**************************
812 * CALCULATE INTERACTIONS *
813 **************************/
815 /* Compute parameters for interactions between i and j atoms */
816 qq20 = _mm_mul_ps(iq2,jq0);
818 /* REACTION-FIELD ELECTROSTATICS */
819 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
823 fscal = _mm_andnot_ps(dummy_mask,fscal);
825 /* Calculate temporary vectorial force */
826 tx = _mm_mul_ps(fscal,dx20);
827 ty = _mm_mul_ps(fscal,dy20);
828 tz = _mm_mul_ps(fscal,dz20);
830 /* Update vectorial force */
831 fix2 = _mm_add_ps(fix2,tx);
832 fiy2 = _mm_add_ps(fiy2,ty);
833 fiz2 = _mm_add_ps(fiz2,tz);
835 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
836 f+j_coord_offsetC,f+j_coord_offsetD,
839 /**************************
840 * CALCULATE INTERACTIONS *
841 **************************/
843 /* Compute parameters for interactions between i and j atoms */
844 qq30 = _mm_mul_ps(iq3,jq0);
846 /* REACTION-FIELD ELECTROSTATICS */
847 felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
851 fscal = _mm_andnot_ps(dummy_mask,fscal);
853 /* Calculate temporary vectorial force */
854 tx = _mm_mul_ps(fscal,dx30);
855 ty = _mm_mul_ps(fscal,dy30);
856 tz = _mm_mul_ps(fscal,dz30);
858 /* Update vectorial force */
859 fix3 = _mm_add_ps(fix3,tx);
860 fiy3 = _mm_add_ps(fiy3,ty);
861 fiz3 = _mm_add_ps(fiz3,tz);
863 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
864 f+j_coord_offsetC,f+j_coord_offsetD,
867 /* Inner loop uses 81 flops */
870 /* End of innermost loop */
/* Hand the nine i-force accumulators to the helper together with the
 * force address of the second i site (f+i_coord_offset+DIM) and the shift
 * force slot for this list entry. Presumably the helper reduces the four
 * lanes and adds the result to both - TODO confirm against its definition. */
872 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
873 f+i_coord_offset+DIM,fshift+i_shift_offset);
875 /* Increment number of inner iterations */
876 inneriter += j_index_end - j_index_start;
878 /* Outer loop uses 27 flops */
881 /* Increment number of outer iterations */
884 /* Update outer/inner flops */
/* Flop estimate: 27 per outer iteration plus 81 per inner j entry. */
886 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);