2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_single
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
/*
 * Reaction-field electrostatics plus Lennard-Jones kernel for pairs of
 * four-site water molecules, accumulating both potential and force (see the
 * kernel description comment directly above this definition).
 *
 * What the visible code does:
 *  - Site 0 of the i and j molecule interacts through Lennard-Jones only
 *    (c6_00/c12_00). Sites 1..3 interact pairwise through reaction-field
 *    electrostatics (qq11..qq33), i.e. nine charge pairs per molecule pair.
 *  - j entries are handled four at a time, one per lane of an __m128
 *    (suffixes A..D). A first loop covers groups whose four entries are all
 *    real; a second copy of the loop body handles a group containing
 *    negative (dummy) entries by masking them with dummy_mask.
 *  - Per i entry, velecsum and vvdwsum are passed to gmx_mm_update_1pot_ps
 *    together with kernel_data->energygrp_elec+ggid and
 *    kernel_data->energygrp_vdw+ggid; the i force accumulators are passed to
 *    gmx_mm_update_iforce_4atom_swizzle_ps together with f+i_coord_offset
 *    and fshift+i_shift_offset.
 *  - inc_nrnb is called once at the end with outeriter*26 + inneriter*323.
 *
 * Parameters (as used in the lines visible here):
 *  nlist        jindex, shift and iinr[0] are read.
 *  xx, ff       not referenced in the visible lines. NOTE(review): presumably
 *               the sources of the local x and f pointers - confirm.
 *  fr           shift_vec[0], fshift[0], epsfac, ic->k_rf, ic->c_rf and ntype
 *               are read.
 *  mdatoms      chargeA and typeA are read.
 *  kernel_data  energygrp_elec and energygrp_vdw are used as described above.
 *  nrnb         forwarded to inc_nrnb.
 *
 * NOTE(review): this listing carries a stray numeric prefix on every line and
 * appears to have lost lines (return type, braces, comment terminators and
 * some statements). In particular, confirm against the pristine file that:
 *  - charge, nvdwtype, vdwtype, vdwparam, vdwioffset0 and scratch are
 *    declared (no declaration is visible here);
 *  - x, f, nri, jjnr, ggid, outeriter and inneriter are assigned before use
 *    (no assignment is visible here);
 *  - inr is refreshed per outer iteration (only nlist->iinr[0] is visible);
 *  - fscal is assigned from fvdw / felec before each force update (no
 *    assignment is visible here).
 */
58 nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
117 __m128 dummy_mask,cutoff_mask;
118 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one = _mm_set1_ps(1.0);
120 __m128 two = _mm_set1_ps(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_ps(fr->epsfac);
133 charge = mdatoms->chargeA;
134 krf = _mm_set1_ps(fr->ic->k_rf);
135 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
136 crf = _mm_set1_ps(fr->ic->c_rf);
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 /* Setup water-specific parameters */
/*
 * The i-side charges (pre-multiplied by facel), the j-side charges and the
 * site-0 LJ parameters are all read once, from the first i entry of the list.
 * NOTE(review): this presumably relies on every molecule in the list being
 * the same water model - confirm with the code that builds the list.
 */
142 inr = nlist->iinr[0];
143 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq1 = _mm_set1_ps(charge[inr+1]);
149 jq2 = _mm_set1_ps(charge[inr+2]);
150 jq3 = _mm_set1_ps(charge[inr+3]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq11 = _mm_mul_ps(iq1,jq1);
155 qq12 = _mm_mul_ps(iq1,jq2);
156 qq13 = _mm_mul_ps(iq1,jq3);
157 qq21 = _mm_mul_ps(iq2,jq1);
158 qq22 = _mm_mul_ps(iq2,jq2);
159 qq23 = _mm_mul_ps(iq2,jq3);
160 qq31 = _mm_mul_ps(iq3,jq1);
161 qq32 = _mm_mul_ps(iq3,jq2);
162 qq33 = _mm_mul_ps(iq3,jq3);
164 /* Avoid stupid compiler warnings */
165 jnrA = jnrB = jnrC = jnrD = 0;
/* NOTE(review): no body is visible for the next loop header in this view; presumably it clears scratch[] - confirm. */
174 for(iidx=0;iidx<4*DIM;iidx++)
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
197 fix0 = _mm_setzero_ps();
198 fiy0 = _mm_setzero_ps();
199 fiz0 = _mm_setzero_ps();
200 fix1 = _mm_setzero_ps();
201 fiy1 = _mm_setzero_ps();
202 fiz1 = _mm_setzero_ps();
203 fix2 = _mm_setzero_ps();
204 fiy2 = _mm_setzero_ps();
205 fiz2 = _mm_setzero_ps();
206 fix3 = _mm_setzero_ps();
207 fiy3 = _mm_setzero_ps();
208 fiz3 = _mm_setzero_ps();
210 /* Reset potential sums */
211 velecsum = _mm_setzero_ps();
212 vvdwsum = _mm_setzero_ps();
214 /* Start inner kernel loop */
/* Unmasked path: continues only while the last entry of the current group of four is non-negative, and dummy_mask is not applied in this copy of the body. */
215 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218 /* Get j neighbor index, and coordinate index */
/* NOTE(review): no load of jnrA..jnrD from jjnr is visible before the offsets below - confirm against the real file. */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
225 j_coord_offsetC = DIM*jnrC;
226 j_coord_offsetD = DIM*jnrD;
228 /* load j atom coordinates */
229 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
230 x+j_coord_offsetC,x+j_coord_offsetD,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
232 &jy2,&jz2,&jx3,&jy3,&jz3);
234 /* Calculate displacement vector */
235 dx00 = _mm_sub_ps(ix0,jx0);
236 dy00 = _mm_sub_ps(iy0,jy0);
237 dz00 = _mm_sub_ps(iz0,jz0);
238 dx11 = _mm_sub_ps(ix1,jx1);
239 dy11 = _mm_sub_ps(iy1,jy1);
240 dz11 = _mm_sub_ps(iz1,jz1);
241 dx12 = _mm_sub_ps(ix1,jx2);
242 dy12 = _mm_sub_ps(iy1,jy2);
243 dz12 = _mm_sub_ps(iz1,jz2);
244 dx13 = _mm_sub_ps(ix1,jx3);
245 dy13 = _mm_sub_ps(iy1,jy3);
246 dz13 = _mm_sub_ps(iz1,jz3);
247 dx21 = _mm_sub_ps(ix2,jx1);
248 dy21 = _mm_sub_ps(iy2,jy1);
249 dz21 = _mm_sub_ps(iz2,jz1);
250 dx22 = _mm_sub_ps(ix2,jx2);
251 dy22 = _mm_sub_ps(iy2,jy2);
252 dz22 = _mm_sub_ps(iz2,jz2);
253 dx23 = _mm_sub_ps(ix2,jx3);
254 dy23 = _mm_sub_ps(iy2,jy3);
255 dz23 = _mm_sub_ps(iz2,jz3);
256 dx31 = _mm_sub_ps(ix3,jx1);
257 dy31 = _mm_sub_ps(iy3,jy1);
258 dz31 = _mm_sub_ps(iz3,jz1);
259 dx32 = _mm_sub_ps(ix3,jx2);
260 dy32 = _mm_sub_ps(iy3,jy2);
261 dz32 = _mm_sub_ps(iz3,jz2);
262 dx33 = _mm_sub_ps(ix3,jx3);
263 dy33 = _mm_sub_ps(iy3,jy3);
264 dz33 = _mm_sub_ps(iz3,jz3);
266 /* Calculate squared distance and things based on it */
267 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
268 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
269 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
270 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
271 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
272 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
273 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
274 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
275 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
276 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
/* gmx_mm_invsqrt_ps is applied only to the nine electrostatic pairs; the LJ-only 0-0 pair uses gmx_mm_inv_ps(rsq00) below instead. */
278 rinv11 = gmx_mm_invsqrt_ps(rsq11);
279 rinv12 = gmx_mm_invsqrt_ps(rsq12);
280 rinv13 = gmx_mm_invsqrt_ps(rsq13);
281 rinv21 = gmx_mm_invsqrt_ps(rsq21);
282 rinv22 = gmx_mm_invsqrt_ps(rsq22);
283 rinv23 = gmx_mm_invsqrt_ps(rsq23);
284 rinv31 = gmx_mm_invsqrt_ps(rsq31);
285 rinv32 = gmx_mm_invsqrt_ps(rsq32);
286 rinv33 = gmx_mm_invsqrt_ps(rsq33);
288 rinvsq00 = gmx_mm_inv_ps(rsq00);
289 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
290 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
291 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
292 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
293 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
294 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
295 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
296 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
297 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
299 fjx0 = _mm_setzero_ps();
300 fjy0 = _mm_setzero_ps();
301 fjz0 = _mm_setzero_ps();
302 fjx1 = _mm_setzero_ps();
303 fjy1 = _mm_setzero_ps();
304 fjz1 = _mm_setzero_ps();
305 fjx2 = _mm_setzero_ps();
306 fjy2 = _mm_setzero_ps();
307 fjz2 = _mm_setzero_ps();
308 fjx3 = _mm_setzero_ps();
309 fjy3 = _mm_setzero_ps();
310 fjz3 = _mm_setzero_ps();
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 /* LENNARD-JONES DISPERSION/REPULSION */
/*
 * With rinvsix = rinvsq00^3: vvdw6 = c6_00*rinvsix, vvdw12 = c12_00*rinvsix^2,
 * vvdw = vvdw12/12 - vvdw6/6 and fvdw = (vvdw12 - vvdw6)*rinvsq00.
 */
318 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
319 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
320 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
321 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
322 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
329 /* Calculate temporary vectorial force */
/* NOTE(review): fscal is consumed here and in every force update below, but no assignment from fvdw or felec is visible in this view - confirm it exists in the real file. */
330 tx = _mm_mul_ps(fscal,dx00);
331 ty = _mm_mul_ps(fscal,dy00);
332 tz = _mm_mul_ps(fscal,dz00);
334 /* Update vectorial force */
335 fix0 = _mm_add_ps(fix0,tx);
336 fiy0 = _mm_add_ps(fiy0,ty);
337 fiz0 = _mm_add_ps(fiz0,tz);
339 fjx0 = _mm_add_ps(fjx0,tx);
340 fjy0 = _mm_add_ps(fjy0,ty);
341 fjz0 = _mm_add_ps(fjz0,tz);
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 /* REACTION-FIELD ELECTROSTATICS */
/*
 * velec = qq*(rinv + krf*rsq - crf) and felec = qq*(rinv*rinvsq - krf2),
 * where krf2 holds 2*k_rf. The same two expressions are repeated below for
 * each of the nine charge pairs (11, 12, 13, 21, 22, 23, 31, 32, 33).
 */
348 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
349 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velecsum = _mm_add_ps(velecsum,velec);
356 /* Calculate temporary vectorial force */
357 tx = _mm_mul_ps(fscal,dx11);
358 ty = _mm_mul_ps(fscal,dy11);
359 tz = _mm_mul_ps(fscal,dz11);
361 /* Update vectorial force */
362 fix1 = _mm_add_ps(fix1,tx);
363 fiy1 = _mm_add_ps(fiy1,ty);
364 fiz1 = _mm_add_ps(fiz1,tz);
366 fjx1 = _mm_add_ps(fjx1,tx);
367 fjy1 = _mm_add_ps(fjy1,ty);
368 fjz1 = _mm_add_ps(fjz1,tz);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 /* REACTION-FIELD ELECTROSTATICS */
375 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
376 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm_add_ps(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm_mul_ps(fscal,dx12);
385 ty = _mm_mul_ps(fscal,dy12);
386 tz = _mm_mul_ps(fscal,dz12);
388 /* Update vectorial force */
389 fix1 = _mm_add_ps(fix1,tx);
390 fiy1 = _mm_add_ps(fiy1,ty);
391 fiz1 = _mm_add_ps(fiz1,tz);
393 fjx2 = _mm_add_ps(fjx2,tx);
394 fjy2 = _mm_add_ps(fjy2,ty);
395 fjz2 = _mm_add_ps(fjz2,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 /* REACTION-FIELD ELECTROSTATICS */
402 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
403 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum = _mm_add_ps(velecsum,velec);
410 /* Calculate temporary vectorial force */
411 tx = _mm_mul_ps(fscal,dx13);
412 ty = _mm_mul_ps(fscal,dy13);
413 tz = _mm_mul_ps(fscal,dz13);
415 /* Update vectorial force */
416 fix1 = _mm_add_ps(fix1,tx);
417 fiy1 = _mm_add_ps(fiy1,ty);
418 fiz1 = _mm_add_ps(fiz1,tz);
420 fjx3 = _mm_add_ps(fjx3,tx);
421 fjy3 = _mm_add_ps(fjy3,ty);
422 fjz3 = _mm_add_ps(fjz3,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 /* REACTION-FIELD ELECTROSTATICS */
429 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
430 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum = _mm_add_ps(velecsum,velec);
437 /* Calculate temporary vectorial force */
438 tx = _mm_mul_ps(fscal,dx21);
439 ty = _mm_mul_ps(fscal,dy21);
440 tz = _mm_mul_ps(fscal,dz21);
442 /* Update vectorial force */
443 fix2 = _mm_add_ps(fix2,tx);
444 fiy2 = _mm_add_ps(fiy2,ty);
445 fiz2 = _mm_add_ps(fiz2,tz);
447 fjx1 = _mm_add_ps(fjx1,tx);
448 fjy1 = _mm_add_ps(fjy1,ty);
449 fjz1 = _mm_add_ps(fjz1,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 /* REACTION-FIELD ELECTROSTATICS */
456 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
457 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 velecsum = _mm_add_ps(velecsum,velec);
464 /* Calculate temporary vectorial force */
465 tx = _mm_mul_ps(fscal,dx22);
466 ty = _mm_mul_ps(fscal,dy22);
467 tz = _mm_mul_ps(fscal,dz22);
469 /* Update vectorial force */
470 fix2 = _mm_add_ps(fix2,tx);
471 fiy2 = _mm_add_ps(fiy2,ty);
472 fiz2 = _mm_add_ps(fiz2,tz);
474 fjx2 = _mm_add_ps(fjx2,tx);
475 fjy2 = _mm_add_ps(fjy2,ty);
476 fjz2 = _mm_add_ps(fjz2,tz);
478 /**************************
479 * CALCULATE INTERACTIONS *
480 **************************/
482 /* REACTION-FIELD ELECTROSTATICS */
483 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
484 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velecsum = _mm_add_ps(velecsum,velec);
491 /* Calculate temporary vectorial force */
492 tx = _mm_mul_ps(fscal,dx23);
493 ty = _mm_mul_ps(fscal,dy23);
494 tz = _mm_mul_ps(fscal,dz23);
496 /* Update vectorial force */
497 fix2 = _mm_add_ps(fix2,tx);
498 fiy2 = _mm_add_ps(fiy2,ty);
499 fiz2 = _mm_add_ps(fiz2,tz);
501 fjx3 = _mm_add_ps(fjx3,tx);
502 fjy3 = _mm_add_ps(fjy3,ty);
503 fjz3 = _mm_add_ps(fjz3,tz);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 /* REACTION-FIELD ELECTROSTATICS */
510 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
511 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velecsum = _mm_add_ps(velecsum,velec);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_ps(fscal,dx31);
520 ty = _mm_mul_ps(fscal,dy31);
521 tz = _mm_mul_ps(fscal,dz31);
523 /* Update vectorial force */
524 fix3 = _mm_add_ps(fix3,tx);
525 fiy3 = _mm_add_ps(fiy3,ty);
526 fiz3 = _mm_add_ps(fiz3,tz);
528 fjx1 = _mm_add_ps(fjx1,tx);
529 fjy1 = _mm_add_ps(fjy1,ty);
530 fjz1 = _mm_add_ps(fjz1,tz);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 /* REACTION-FIELD ELECTROSTATICS */
537 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
538 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
540 /* Update potential sum for this i atom from the interaction with this j atom. */
541 velecsum = _mm_add_ps(velecsum,velec);
545 /* Calculate temporary vectorial force */
546 tx = _mm_mul_ps(fscal,dx32);
547 ty = _mm_mul_ps(fscal,dy32);
548 tz = _mm_mul_ps(fscal,dz32);
550 /* Update vectorial force */
551 fix3 = _mm_add_ps(fix3,tx);
552 fiy3 = _mm_add_ps(fiy3,ty);
553 fiz3 = _mm_add_ps(fiz3,tz);
555 fjx2 = _mm_add_ps(fjx2,tx);
556 fjy2 = _mm_add_ps(fjy2,ty);
557 fjz2 = _mm_add_ps(fjz2,tz);
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
563 /* REACTION-FIELD ELECTROSTATICS */
564 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
565 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm_add_ps(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_ps(fscal,dx33);
574 ty = _mm_mul_ps(fscal,dy33);
575 tz = _mm_mul_ps(fscal,dz33);
577 /* Update vectorial force */
578 fix3 = _mm_add_ps(fix3,tx);
579 fiy3 = _mm_add_ps(fiy3,ty);
580 fiz3 = _mm_add_ps(fiz3,tz);
582 fjx3 = _mm_add_ps(fjx3,tx);
583 fjy3 = _mm_add_ps(fjy3,ty);
584 fjz3 = _mm_add_ps(fjz3,tz);
/* Hand the per-lane j force accumulators to the helper together with the four j force pointers; its name suggests they are subtracted from f (helper not visible here). */
586 fjptrA = f+j_coord_offsetA;
587 fjptrB = f+j_coord_offsetB;
588 fjptrC = f+j_coord_offsetC;
589 fjptrD = f+j_coord_offsetD;
591 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
592 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
593 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
595 /* Inner loop uses 323 flops */
/*
 * Masked copy of the loop body for a group that contains negative (dummy)
 * jjnr entries: such entries are replaced by index 0 for the coordinate
 * loads and their potential/force contributions are cleared with dummy_mask.
 * NOTE(review): no enclosing condition is visible in this view; presumably
 * this part runs only when jidx<j_index_end after the loop above - confirm.
 */
601 /* Get j neighbor index, and coordinate index */
602 jnrlistA = jjnr[jidx];
603 jnrlistB = jjnr[jidx+1];
604 jnrlistC = jjnr[jidx+2];
605 jnrlistD = jjnr[jidx+3];
606 /* Sign of each element will be negative for non-real atoms.
607 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
608 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
610 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
611 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
612 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
613 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
614 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
615 j_coord_offsetA = DIM*jnrA;
616 j_coord_offsetB = DIM*jnrB;
617 j_coord_offsetC = DIM*jnrC;
618 j_coord_offsetD = DIM*jnrD;
620 /* load j atom coordinates */
621 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
622 x+j_coord_offsetC,x+j_coord_offsetD,
623 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
624 &jy2,&jz2,&jx3,&jy3,&jz3);
626 /* Calculate displacement vector */
627 dx00 = _mm_sub_ps(ix0,jx0);
628 dy00 = _mm_sub_ps(iy0,jy0);
629 dz00 = _mm_sub_ps(iz0,jz0);
630 dx11 = _mm_sub_ps(ix1,jx1);
631 dy11 = _mm_sub_ps(iy1,jy1);
632 dz11 = _mm_sub_ps(iz1,jz1);
633 dx12 = _mm_sub_ps(ix1,jx2);
634 dy12 = _mm_sub_ps(iy1,jy2);
635 dz12 = _mm_sub_ps(iz1,jz2);
636 dx13 = _mm_sub_ps(ix1,jx3);
637 dy13 = _mm_sub_ps(iy1,jy3);
638 dz13 = _mm_sub_ps(iz1,jz3);
639 dx21 = _mm_sub_ps(ix2,jx1);
640 dy21 = _mm_sub_ps(iy2,jy1);
641 dz21 = _mm_sub_ps(iz2,jz1);
642 dx22 = _mm_sub_ps(ix2,jx2);
643 dy22 = _mm_sub_ps(iy2,jy2);
644 dz22 = _mm_sub_ps(iz2,jz2);
645 dx23 = _mm_sub_ps(ix2,jx3);
646 dy23 = _mm_sub_ps(iy2,jy3);
647 dz23 = _mm_sub_ps(iz2,jz3);
648 dx31 = _mm_sub_ps(ix3,jx1);
649 dy31 = _mm_sub_ps(iy3,jy1);
650 dz31 = _mm_sub_ps(iz3,jz1);
651 dx32 = _mm_sub_ps(ix3,jx2);
652 dy32 = _mm_sub_ps(iy3,jy2);
653 dz32 = _mm_sub_ps(iz3,jz2);
654 dx33 = _mm_sub_ps(ix3,jx3);
655 dy33 = _mm_sub_ps(iy3,jy3);
656 dz33 = _mm_sub_ps(iz3,jz3);
658 /* Calculate squared distance and things based on it */
659 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
660 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
661 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
662 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
663 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
664 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
665 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
666 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
667 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
668 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
670 rinv11 = gmx_mm_invsqrt_ps(rsq11);
671 rinv12 = gmx_mm_invsqrt_ps(rsq12);
672 rinv13 = gmx_mm_invsqrt_ps(rsq13);
673 rinv21 = gmx_mm_invsqrt_ps(rsq21);
674 rinv22 = gmx_mm_invsqrt_ps(rsq22);
675 rinv23 = gmx_mm_invsqrt_ps(rsq23);
676 rinv31 = gmx_mm_invsqrt_ps(rsq31);
677 rinv32 = gmx_mm_invsqrt_ps(rsq32);
678 rinv33 = gmx_mm_invsqrt_ps(rsq33);
680 rinvsq00 = gmx_mm_inv_ps(rsq00);
681 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
682 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
683 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
684 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
685 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
686 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
687 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
688 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
689 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
691 fjx0 = _mm_setzero_ps();
692 fjy0 = _mm_setzero_ps();
693 fjz0 = _mm_setzero_ps();
694 fjx1 = _mm_setzero_ps();
695 fjy1 = _mm_setzero_ps();
696 fjz1 = _mm_setzero_ps();
697 fjx2 = _mm_setzero_ps();
698 fjy2 = _mm_setzero_ps();
699 fjz2 = _mm_setzero_ps();
700 fjx3 = _mm_setzero_ps();
701 fjy3 = _mm_setzero_ps();
702 fjz3 = _mm_setzero_ps();
704 /**************************
705 * CALCULATE INTERACTIONS *
706 **************************/
708 /* LENNARD-JONES DISPERSION/REPULSION */
710 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
711 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
712 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
713 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
714 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
716 /* Update potential sum for this i atom from the interaction with this j atom. */
717 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
718 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
/* NOTE(review): as in the unmasked path, fscal is masked and used below without a visible assignment from fvdw or felec - confirm against the real file. */
722 fscal = _mm_andnot_ps(dummy_mask,fscal);
724 /* Calculate temporary vectorial force */
725 tx = _mm_mul_ps(fscal,dx00);
726 ty = _mm_mul_ps(fscal,dy00);
727 tz = _mm_mul_ps(fscal,dz00);
729 /* Update vectorial force */
730 fix0 = _mm_add_ps(fix0,tx);
731 fiy0 = _mm_add_ps(fiy0,ty);
732 fiz0 = _mm_add_ps(fiz0,tz);
734 fjx0 = _mm_add_ps(fjx0,tx);
735 fjy0 = _mm_add_ps(fjy0,ty);
736 fjz0 = _mm_add_ps(fjz0,tz);
738 /**************************
739 * CALCULATE INTERACTIONS *
740 **************************/
742 /* REACTION-FIELD ELECTROSTATICS */
743 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
744 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velec = _mm_andnot_ps(dummy_mask,velec);
748 velecsum = _mm_add_ps(velecsum,velec);
752 fscal = _mm_andnot_ps(dummy_mask,fscal);
754 /* Calculate temporary vectorial force */
755 tx = _mm_mul_ps(fscal,dx11);
756 ty = _mm_mul_ps(fscal,dy11);
757 tz = _mm_mul_ps(fscal,dz11);
759 /* Update vectorial force */
760 fix1 = _mm_add_ps(fix1,tx);
761 fiy1 = _mm_add_ps(fiy1,ty);
762 fiz1 = _mm_add_ps(fiz1,tz);
764 fjx1 = _mm_add_ps(fjx1,tx);
765 fjy1 = _mm_add_ps(fjy1,ty);
766 fjz1 = _mm_add_ps(fjz1,tz);
768 /**************************
769 * CALCULATE INTERACTIONS *
770 **************************/
772 /* REACTION-FIELD ELECTROSTATICS */
773 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
774 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
776 /* Update potential sum for this i atom from the interaction with this j atom. */
777 velec = _mm_andnot_ps(dummy_mask,velec);
778 velecsum = _mm_add_ps(velecsum,velec);
782 fscal = _mm_andnot_ps(dummy_mask,fscal);
784 /* Calculate temporary vectorial force */
785 tx = _mm_mul_ps(fscal,dx12);
786 ty = _mm_mul_ps(fscal,dy12);
787 tz = _mm_mul_ps(fscal,dz12);
789 /* Update vectorial force */
790 fix1 = _mm_add_ps(fix1,tx);
791 fiy1 = _mm_add_ps(fiy1,ty);
792 fiz1 = _mm_add_ps(fiz1,tz);
794 fjx2 = _mm_add_ps(fjx2,tx);
795 fjy2 = _mm_add_ps(fjy2,ty);
796 fjz2 = _mm_add_ps(fjz2,tz);
798 /**************************
799 * CALCULATE INTERACTIONS *
800 **************************/
802 /* REACTION-FIELD ELECTROSTATICS */
803 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
804 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
806 /* Update potential sum for this i atom from the interaction with this j atom. */
807 velec = _mm_andnot_ps(dummy_mask,velec);
808 velecsum = _mm_add_ps(velecsum,velec);
812 fscal = _mm_andnot_ps(dummy_mask,fscal);
814 /* Calculate temporary vectorial force */
815 tx = _mm_mul_ps(fscal,dx13);
816 ty = _mm_mul_ps(fscal,dy13);
817 tz = _mm_mul_ps(fscal,dz13);
819 /* Update vectorial force */
820 fix1 = _mm_add_ps(fix1,tx);
821 fiy1 = _mm_add_ps(fiy1,ty);
822 fiz1 = _mm_add_ps(fiz1,tz);
824 fjx3 = _mm_add_ps(fjx3,tx);
825 fjy3 = _mm_add_ps(fjy3,ty);
826 fjz3 = _mm_add_ps(fjz3,tz);
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 /* REACTION-FIELD ELECTROSTATICS */
833 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
834 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
836 /* Update potential sum for this i atom from the interaction with this j atom. */
837 velec = _mm_andnot_ps(dummy_mask,velec);
838 velecsum = _mm_add_ps(velecsum,velec);
842 fscal = _mm_andnot_ps(dummy_mask,fscal);
844 /* Calculate temporary vectorial force */
845 tx = _mm_mul_ps(fscal,dx21);
846 ty = _mm_mul_ps(fscal,dy21);
847 tz = _mm_mul_ps(fscal,dz21);
849 /* Update vectorial force */
850 fix2 = _mm_add_ps(fix2,tx);
851 fiy2 = _mm_add_ps(fiy2,ty);
852 fiz2 = _mm_add_ps(fiz2,tz);
854 fjx1 = _mm_add_ps(fjx1,tx);
855 fjy1 = _mm_add_ps(fjy1,ty);
856 fjz1 = _mm_add_ps(fjz1,tz);
858 /**************************
859 * CALCULATE INTERACTIONS *
860 **************************/
862 /* REACTION-FIELD ELECTROSTATICS */
863 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
864 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
866 /* Update potential sum for this i atom from the interaction with this j atom. */
867 velec = _mm_andnot_ps(dummy_mask,velec);
868 velecsum = _mm_add_ps(velecsum,velec);
872 fscal = _mm_andnot_ps(dummy_mask,fscal);
874 /* Calculate temporary vectorial force */
875 tx = _mm_mul_ps(fscal,dx22);
876 ty = _mm_mul_ps(fscal,dy22);
877 tz = _mm_mul_ps(fscal,dz22);
879 /* Update vectorial force */
880 fix2 = _mm_add_ps(fix2,tx);
881 fiy2 = _mm_add_ps(fiy2,ty);
882 fiz2 = _mm_add_ps(fiz2,tz);
884 fjx2 = _mm_add_ps(fjx2,tx);
885 fjy2 = _mm_add_ps(fjy2,ty);
886 fjz2 = _mm_add_ps(fjz2,tz);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 /* REACTION-FIELD ELECTROSTATICS */
893 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
894 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec = _mm_andnot_ps(dummy_mask,velec);
898 velecsum = _mm_add_ps(velecsum,velec);
902 fscal = _mm_andnot_ps(dummy_mask,fscal);
904 /* Calculate temporary vectorial force */
905 tx = _mm_mul_ps(fscal,dx23);
906 ty = _mm_mul_ps(fscal,dy23);
907 tz = _mm_mul_ps(fscal,dz23);
909 /* Update vectorial force */
910 fix2 = _mm_add_ps(fix2,tx);
911 fiy2 = _mm_add_ps(fiy2,ty);
912 fiz2 = _mm_add_ps(fiz2,tz);
914 fjx3 = _mm_add_ps(fjx3,tx);
915 fjy3 = _mm_add_ps(fjy3,ty);
916 fjz3 = _mm_add_ps(fjz3,tz);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 /* REACTION-FIELD ELECTROSTATICS */
923 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
924 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
926 /* Update potential sum for this i atom from the interaction with this j atom. */
927 velec = _mm_andnot_ps(dummy_mask,velec);
928 velecsum = _mm_add_ps(velecsum,velec);
932 fscal = _mm_andnot_ps(dummy_mask,fscal);
934 /* Calculate temporary vectorial force */
935 tx = _mm_mul_ps(fscal,dx31);
936 ty = _mm_mul_ps(fscal,dy31);
937 tz = _mm_mul_ps(fscal,dz31);
939 /* Update vectorial force */
940 fix3 = _mm_add_ps(fix3,tx);
941 fiy3 = _mm_add_ps(fiy3,ty);
942 fiz3 = _mm_add_ps(fiz3,tz);
944 fjx1 = _mm_add_ps(fjx1,tx);
945 fjy1 = _mm_add_ps(fjy1,ty);
946 fjz1 = _mm_add_ps(fjz1,tz);
948 /**************************
949 * CALCULATE INTERACTIONS *
950 **************************/
952 /* REACTION-FIELD ELECTROSTATICS */
953 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
954 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec = _mm_andnot_ps(dummy_mask,velec);
958 velecsum = _mm_add_ps(velecsum,velec);
962 fscal = _mm_andnot_ps(dummy_mask,fscal);
964 /* Calculate temporary vectorial force */
965 tx = _mm_mul_ps(fscal,dx32);
966 ty = _mm_mul_ps(fscal,dy32);
967 tz = _mm_mul_ps(fscal,dz32);
969 /* Update vectorial force */
970 fix3 = _mm_add_ps(fix3,tx);
971 fiy3 = _mm_add_ps(fiy3,ty);
972 fiz3 = _mm_add_ps(fiz3,tz);
974 fjx2 = _mm_add_ps(fjx2,tx);
975 fjy2 = _mm_add_ps(fjy2,ty);
976 fjz2 = _mm_add_ps(fjz2,tz);
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 /* REACTION-FIELD ELECTROSTATICS */
983 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
984 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
986 /* Update potential sum for this i atom from the interaction with this j atom. */
987 velec = _mm_andnot_ps(dummy_mask,velec);
988 velecsum = _mm_add_ps(velecsum,velec);
992 fscal = _mm_andnot_ps(dummy_mask,fscal);
994 /* Calculate temporary vectorial force */
995 tx = _mm_mul_ps(fscal,dx33);
996 ty = _mm_mul_ps(fscal,dy33);
997 tz = _mm_mul_ps(fscal,dz33);
999 /* Update vectorial force */
1000 fix3 = _mm_add_ps(fix3,tx);
1001 fiy3 = _mm_add_ps(fiy3,ty);
1002 fiz3 = _mm_add_ps(fiz3,tz);
1004 fjx3 = _mm_add_ps(fjx3,tx);
1005 fjy3 = _mm_add_ps(fjy3,ty);
1006 fjz3 = _mm_add_ps(fjz3,tz);
/* Lanes holding dummy entries point their force update at scratch instead of f, so the update lands outside the force array. */
1008 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1009 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1010 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1011 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1013 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1014 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1015 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1017 /* Inner loop uses 323 flops */
1020 /* End of innermost loop */
/* The twelve i force accumulators go to the helper with the i force slot and the shift-force slot; presumably it reduces the four lanes and updates both - helper not visible here. */
1022 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1023 f+i_coord_offset,fshift+i_shift_offset);
1026 /* Update potential energies */
1027 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1028 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1030 /* Increment number of inner iterations */
1031 inneriter += j_index_end - j_index_start;
1033 /* Outer loop uses 26 flops */
1036 /* Increment number of outer iterations */
1039 /* Update outer/inner flops */
/* Flop estimate handed to inc_nrnb: 26 per outer iteration plus 323 per inner iteration, matching the per-loop counts quoted in the comments above. */
1041 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323);
1044 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_single
1045 * Electrostatics interaction: ReactionField
1046 * VdW interaction: LennardJones
1047 * Geometry: Water4-Water4
1048 * Calculate force/pot: Force
/* Force-only kernel: reaction-field electrostatics plus Lennard-Jones for
 * Water4-Water4 geometry, with the j loop handled four entries at a time in
 * SSE registers. Per-list i forces are handed to the i-force update helper
 * together with f and fshift; kernel_data is marked gmx_unused and no
 * potential-energy sums are updated in the visible lines.
 * NOTE(review): some statements are not visible in this listing (for example
 * fscal is read but its assignment is not shown, and neither are the
 * assignments of x, f, nri, jjnr, outeriter and inneriter) - confirm against
 * the generator output before relying on the notes below.
 */
1051 nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_single
1052 (t_nblist * gmx_restrict nlist,
1053 rvec * gmx_restrict xx,
1054 rvec * gmx_restrict ff,
1055 t_forcerec * gmx_restrict fr,
1056 t_mdatoms * gmx_restrict mdatoms,
1057 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1058 t_nrnb * gmx_restrict nrnb)
1060 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1061 * just 0 for non-waters.
1062 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1063 * jnr indices corresponding to data put in the four positions in the SIMD register.
1065 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1066 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1067 int jnrA,jnrB,jnrC,jnrD;
1068 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1069 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1070 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1071 real rcutoff_scalar;
1072 real *shiftvec,*fshift,*x,*f;
1073 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1074 real scratch[4*DIM];
1075 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1077 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1079 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1081 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1083 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1084 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1085 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1086 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1087 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1088 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1089 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1090 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1091 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1092 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1093 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1094 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1095 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1096 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1097 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1098 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1099 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1100 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1101 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1102 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1105 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1108 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1109 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1110 __m128 dummy_mask,cutoff_mask;
1111 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1112 __m128 one = _mm_set1_ps(1.0);
1113 __m128 two = _mm_set1_ps(2.0);
1119 jindex = nlist->jindex;
1121 shiftidx = nlist->shift;
1123 shiftvec = fr->shift_vec[0];
1124 fshift = fr->fshift[0];
1125 facel = _mm_set1_ps(fr->epsfac);
1126 charge = mdatoms->chargeA;
1127 krf = _mm_set1_ps(fr->ic->k_rf);
1128 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1129 crf = _mm_set1_ps(fr->ic->c_rf);
1130 nvdwtype = fr->ntype;
1131 vdwparam = fr->nbfp;
1132 vdwtype = mdatoms->typeA;
1134 /* Setup water-specific parameters */
1135 inr = nlist->iinr[0];
/* Charges of i sites 1..3, broadcast and pre-multiplied by facel (fr->epsfac) */
1136 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1137 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1138 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1139 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* j-site charges are read from the same atoms (inr = nlist->iinr[0]) as the i-site charges */
1141 jq1 = _mm_set1_ps(charge[inr+1]);
1142 jq2 = _mm_set1_ps(charge[inr+2]);
1143 jq3 = _mm_set1_ps(charge[inr+3]);
/* Lennard-Jones parameters are set up for the 0-0 site pair only */
1144 vdwjidx0A = 2*vdwtype[inr+0];
1145 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1146 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
/* Charge products for the nine charged site pairs (i sites 1..3 x j sites 1..3) */
1147 qq11 = _mm_mul_ps(iq1,jq1);
1148 qq12 = _mm_mul_ps(iq1,jq2);
1149 qq13 = _mm_mul_ps(iq1,jq3);
1150 qq21 = _mm_mul_ps(iq2,jq1);
1151 qq22 = _mm_mul_ps(iq2,jq2);
1152 qq23 = _mm_mul_ps(iq2,jq3);
1153 qq31 = _mm_mul_ps(iq3,jq1);
1154 qq32 = _mm_mul_ps(iq3,jq2);
1155 qq33 = _mm_mul_ps(iq3,jq3);
1157 /* Avoid stupid compiler warnings */
1158 jnrA = jnrB = jnrC = jnrD = 0;
1159 j_coord_offsetA = 0;
1160 j_coord_offsetB = 0;
1161 j_coord_offsetC = 0;
1162 j_coord_offsetD = 0;
/* Zero the scratch buffer; the remainder step below uses it as the j-force
 * destination for dummy (negative) neighbor entries.
 */
1167 for(iidx=0;iidx<4*DIM;iidx++)
1169 scratch[iidx] = 0.0;
1172 /* Start outer loop over neighborlists */
1173 for(iidx=0; iidx<nri; iidx++)
1175 /* Load shift vector for this list */
1176 i_shift_offset = DIM*shiftidx[iidx];
1178 /* Load limits for loop over neighbors */
1179 j_index_start = jindex[iidx];
1180 j_index_end = jindex[iidx+1];
1182 /* Get outer coordinate index */
1184 i_coord_offset = DIM*inr;
1186 /* Load i particle coords and add shift vector */
1187 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Reset the i-force accumulators for this neighborlist entry */
1190 fix0 = _mm_setzero_ps();
1191 fiy0 = _mm_setzero_ps();
1192 fiz0 = _mm_setzero_ps();
1193 fix1 = _mm_setzero_ps();
1194 fiy1 = _mm_setzero_ps();
1195 fiz1 = _mm_setzero_ps();
1196 fix2 = _mm_setzero_ps();
1197 fiy2 = _mm_setzero_ps();
1198 fiz2 = _mm_setzero_ps();
1199 fix3 = _mm_setzero_ps();
1200 fiy3 = _mm_setzero_ps();
1201 fiz3 = _mm_setzero_ps();
1203 /* Start inner kernel loop */
/* Runs while the fourth entry of the current group, jjnr[jidx+3], is
 * non-negative, and advances four j entries per iteration.
 */
1204 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1207 /* Get j neighbor index, and coordinate index */
1209 jnrB = jjnr[jidx+1];
1210 jnrC = jjnr[jidx+2];
1211 jnrD = jjnr[jidx+3];
1212 j_coord_offsetA = DIM*jnrA;
1213 j_coord_offsetB = DIM*jnrB;
1214 j_coord_offsetC = DIM*jnrC;
1215 j_coord_offsetD = DIM*jnrD;
1217 /* load j atom coordinates */
1218 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1219 x+j_coord_offsetC,x+j_coord_offsetD,
1220 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1221 &jy2,&jz2,&jx3,&jy3,&jz3);
1223 /* Calculate displacement vector */
1224 dx00 = _mm_sub_ps(ix0,jx0);
1225 dy00 = _mm_sub_ps(iy0,jy0);
1226 dz00 = _mm_sub_ps(iz0,jz0);
1227 dx11 = _mm_sub_ps(ix1,jx1);
1228 dy11 = _mm_sub_ps(iy1,jy1);
1229 dz11 = _mm_sub_ps(iz1,jz1);
1230 dx12 = _mm_sub_ps(ix1,jx2);
1231 dy12 = _mm_sub_ps(iy1,jy2);
1232 dz12 = _mm_sub_ps(iz1,jz2);
1233 dx13 = _mm_sub_ps(ix1,jx3);
1234 dy13 = _mm_sub_ps(iy1,jy3);
1235 dz13 = _mm_sub_ps(iz1,jz3);
1236 dx21 = _mm_sub_ps(ix2,jx1);
1237 dy21 = _mm_sub_ps(iy2,jy1);
1238 dz21 = _mm_sub_ps(iz2,jz1);
1239 dx22 = _mm_sub_ps(ix2,jx2);
1240 dy22 = _mm_sub_ps(iy2,jy2);
1241 dz22 = _mm_sub_ps(iz2,jz2);
1242 dx23 = _mm_sub_ps(ix2,jx3);
1243 dy23 = _mm_sub_ps(iy2,jy3);
1244 dz23 = _mm_sub_ps(iz2,jz3);
1245 dx31 = _mm_sub_ps(ix3,jx1);
1246 dy31 = _mm_sub_ps(iy3,jy1);
1247 dz31 = _mm_sub_ps(iz3,jz1);
1248 dx32 = _mm_sub_ps(ix3,jx2);
1249 dy32 = _mm_sub_ps(iy3,jy2);
1250 dz32 = _mm_sub_ps(iz3,jz2);
1251 dx33 = _mm_sub_ps(ix3,jx3);
1252 dy33 = _mm_sub_ps(iy3,jy3);
1253 dz33 = _mm_sub_ps(iz3,jz3);
1255 /* Calculate squared distance and things based on it */
1256 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1257 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1258 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1259 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1260 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1261 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1262 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1263 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1264 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1265 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
/* rinvXY comes from gmx_mm_invsqrt_ps(rsqXY) for the nine charged pairs;
 * pair 00 gets rinvsq00 from gmx_mm_inv_ps(rsq00) instead.
 */
1267 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1268 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1269 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1270 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1271 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1272 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1273 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1274 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1275 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1277 rinvsq00 = gmx_mm_inv_ps(rsq00);
1278 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1279 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1280 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1281 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1282 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1283 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1284 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1285 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1286 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
/* Reset the j-force accumulators for this group of four j entries */
1288 fjx0 = _mm_setzero_ps();
1289 fjy0 = _mm_setzero_ps();
1290 fjz0 = _mm_setzero_ps();
1291 fjx1 = _mm_setzero_ps();
1292 fjy1 = _mm_setzero_ps();
1293 fjz1 = _mm_setzero_ps();
1294 fjx2 = _mm_setzero_ps();
1295 fjy2 = _mm_setzero_ps();
1296 fjz2 = _mm_setzero_ps();
1297 fjx3 = _mm_setzero_ps();
1298 fjy3 = _mm_setzero_ps();
1299 fjz3 = _mm_setzero_ps();
1301 /**************************
1302 * CALCULATE INTERACTIONS *
1303 **************************/
1305 /* LENNARD-JONES DISPERSION/REPULSION */
/* fvdw = (c12_00*rinvsix - c6_00) * rinvsix*rinvsq00, with rinvsix = rinvsq00^3 */
1307 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1308 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
/* NOTE(review): fscal is read below but its assignment is not visible in this
 * listing; presumably fscal = fvdw here - confirm.
 */
1312 /* Calculate temporary vectorial force */
1313 tx = _mm_mul_ps(fscal,dx00);
1314 ty = _mm_mul_ps(fscal,dy00);
1315 tz = _mm_mul_ps(fscal,dz00);
1317 /* Update vectorial force */
1318 fix0 = _mm_add_ps(fix0,tx);
1319 fiy0 = _mm_add_ps(fiy0,ty);
1320 fiz0 = _mm_add_ps(fiz0,tz);
1322 fjx0 = _mm_add_ps(fjx0,tx);
1323 fjy0 = _mm_add_ps(fjy0,ty);
1324 fjz0 = _mm_add_ps(fjz0,tz);
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 /* REACTION-FIELD ELECTROSTATICS */
/* felec = qq11 * (rinv11*rinvsq11 - krf2); the eight sections that follow use
 * the same expression for the other charged site pairs.
 * NOTE(review): the assignment of fscal is again not visible; presumably
 * fscal = felec in each of these sections - confirm.
 */
1331 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1335 /* Calculate temporary vectorial force */
1336 tx = _mm_mul_ps(fscal,dx11);
1337 ty = _mm_mul_ps(fscal,dy11);
1338 tz = _mm_mul_ps(fscal,dz11);
1340 /* Update vectorial force */
1341 fix1 = _mm_add_ps(fix1,tx);
1342 fiy1 = _mm_add_ps(fiy1,ty);
1343 fiz1 = _mm_add_ps(fiz1,tz);
1345 fjx1 = _mm_add_ps(fjx1,tx);
1346 fjy1 = _mm_add_ps(fjy1,ty);
1347 fjz1 = _mm_add_ps(fjz1,tz);
1349 /**************************
1350 * CALCULATE INTERACTIONS *
1351 **************************/
1353 /* REACTION-FIELD ELECTROSTATICS */
1354 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1358 /* Calculate temporary vectorial force */
1359 tx = _mm_mul_ps(fscal,dx12);
1360 ty = _mm_mul_ps(fscal,dy12);
1361 tz = _mm_mul_ps(fscal,dz12);
1363 /* Update vectorial force */
1364 fix1 = _mm_add_ps(fix1,tx);
1365 fiy1 = _mm_add_ps(fiy1,ty);
1366 fiz1 = _mm_add_ps(fiz1,tz);
1368 fjx2 = _mm_add_ps(fjx2,tx);
1369 fjy2 = _mm_add_ps(fjy2,ty);
1370 fjz2 = _mm_add_ps(fjz2,tz);
1372 /**************************
1373 * CALCULATE INTERACTIONS *
1374 **************************/
1376 /* REACTION-FIELD ELECTROSTATICS */
1377 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1381 /* Calculate temporary vectorial force */
1382 tx = _mm_mul_ps(fscal,dx13);
1383 ty = _mm_mul_ps(fscal,dy13);
1384 tz = _mm_mul_ps(fscal,dz13);
1386 /* Update vectorial force */
1387 fix1 = _mm_add_ps(fix1,tx);
1388 fiy1 = _mm_add_ps(fiy1,ty);
1389 fiz1 = _mm_add_ps(fiz1,tz);
1391 fjx3 = _mm_add_ps(fjx3,tx);
1392 fjy3 = _mm_add_ps(fjy3,ty);
1393 fjz3 = _mm_add_ps(fjz3,tz);
1395 /**************************
1396 * CALCULATE INTERACTIONS *
1397 **************************/
1399 /* REACTION-FIELD ELECTROSTATICS */
1400 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1404 /* Calculate temporary vectorial force */
1405 tx = _mm_mul_ps(fscal,dx21);
1406 ty = _mm_mul_ps(fscal,dy21);
1407 tz = _mm_mul_ps(fscal,dz21);
1409 /* Update vectorial force */
1410 fix2 = _mm_add_ps(fix2,tx);
1411 fiy2 = _mm_add_ps(fiy2,ty);
1412 fiz2 = _mm_add_ps(fiz2,tz);
1414 fjx1 = _mm_add_ps(fjx1,tx);
1415 fjy1 = _mm_add_ps(fjy1,ty);
1416 fjz1 = _mm_add_ps(fjz1,tz);
1418 /**************************
1419 * CALCULATE INTERACTIONS *
1420 **************************/
1422 /* REACTION-FIELD ELECTROSTATICS */
1423 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1427 /* Calculate temporary vectorial force */
1428 tx = _mm_mul_ps(fscal,dx22);
1429 ty = _mm_mul_ps(fscal,dy22);
1430 tz = _mm_mul_ps(fscal,dz22);
1432 /* Update vectorial force */
1433 fix2 = _mm_add_ps(fix2,tx);
1434 fiy2 = _mm_add_ps(fiy2,ty);
1435 fiz2 = _mm_add_ps(fiz2,tz);
1437 fjx2 = _mm_add_ps(fjx2,tx);
1438 fjy2 = _mm_add_ps(fjy2,ty);
1439 fjz2 = _mm_add_ps(fjz2,tz);
1441 /**************************
1442 * CALCULATE INTERACTIONS *
1443 **************************/
1445 /* REACTION-FIELD ELECTROSTATICS */
1446 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1450 /* Calculate temporary vectorial force */
1451 tx = _mm_mul_ps(fscal,dx23);
1452 ty = _mm_mul_ps(fscal,dy23);
1453 tz = _mm_mul_ps(fscal,dz23);
1455 /* Update vectorial force */
1456 fix2 = _mm_add_ps(fix2,tx);
1457 fiy2 = _mm_add_ps(fiy2,ty);
1458 fiz2 = _mm_add_ps(fiz2,tz);
1460 fjx3 = _mm_add_ps(fjx3,tx);
1461 fjy3 = _mm_add_ps(fjy3,ty);
1462 fjz3 = _mm_add_ps(fjz3,tz);
1464 /**************************
1465 * CALCULATE INTERACTIONS *
1466 **************************/
1468 /* REACTION-FIELD ELECTROSTATICS */
1469 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1473 /* Calculate temporary vectorial force */
1474 tx = _mm_mul_ps(fscal,dx31);
1475 ty = _mm_mul_ps(fscal,dy31);
1476 tz = _mm_mul_ps(fscal,dz31);
1478 /* Update vectorial force */
1479 fix3 = _mm_add_ps(fix3,tx);
1480 fiy3 = _mm_add_ps(fiy3,ty);
1481 fiz3 = _mm_add_ps(fiz3,tz);
1483 fjx1 = _mm_add_ps(fjx1,tx);
1484 fjy1 = _mm_add_ps(fjy1,ty);
1485 fjz1 = _mm_add_ps(fjz1,tz);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 /* REACTION-FIELD ELECTROSTATICS */
1492 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1496 /* Calculate temporary vectorial force */
1497 tx = _mm_mul_ps(fscal,dx32);
1498 ty = _mm_mul_ps(fscal,dy32);
1499 tz = _mm_mul_ps(fscal,dz32);
1501 /* Update vectorial force */
1502 fix3 = _mm_add_ps(fix3,tx);
1503 fiy3 = _mm_add_ps(fiy3,ty);
1504 fiz3 = _mm_add_ps(fiz3,tz);
1506 fjx2 = _mm_add_ps(fjx2,tx);
1507 fjy2 = _mm_add_ps(fjy2,ty);
1508 fjz2 = _mm_add_ps(fjz2,tz);
1510 /**************************
1511 * CALCULATE INTERACTIONS *
1512 **************************/
1514 /* REACTION-FIELD ELECTROSTATICS */
1515 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1519 /* Calculate temporary vectorial force */
1520 tx = _mm_mul_ps(fscal,dx33);
1521 ty = _mm_mul_ps(fscal,dy33);
1522 tz = _mm_mul_ps(fscal,dz33);
1524 /* Update vectorial force */
1525 fix3 = _mm_add_ps(fix3,tx);
1526 fiy3 = _mm_add_ps(fiy3,ty);
1527 fiz3 = _mm_add_ps(fiz3,tz);
1529 fjx3 = _mm_add_ps(fjx3,tx);
1530 fjy3 = _mm_add_ps(fjy3,ty);
1531 fjz3 = _mm_add_ps(fjz3,tz);
/* In this loop the j-force pointers always point into f (no scratch redirection) */
1533 fjptrA = f+j_coord_offsetA;
1534 fjptrB = f+j_coord_offsetB;
1535 fjptrC = f+j_coord_offsetC;
1536 fjptrD = f+j_coord_offsetD;
1538 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1539 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1540 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1542 /* Inner loop uses 273 flops */
/* Remainder step: taken when the loop above stopped before j_index_end, i.e.
 * the current group of four has a negative jjnr[jidx+3]. Negative entries are
 * treated as dummies: their index is replaced by 0 for the coordinate load,
 * dummy_mask clears their fscal, and their j forces are directed to scratch.
 */
1545 if(jidx<j_index_end)
1548 /* Get j neighbor index, and coordinate index */
1549 jnrlistA = jjnr[jidx];
1550 jnrlistB = jjnr[jidx+1];
1551 jnrlistC = jjnr[jidx+2];
1552 jnrlistD = jjnr[jidx+3];
1553 /* Sign of each element will be negative for non-real atoms.
1554 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1555 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1557 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1558 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1559 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1560 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1561 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1562 j_coord_offsetA = DIM*jnrA;
1563 j_coord_offsetB = DIM*jnrB;
1564 j_coord_offsetC = DIM*jnrC;
1565 j_coord_offsetD = DIM*jnrD;
1567 /* load j atom coordinates */
1568 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1569 x+j_coord_offsetC,x+j_coord_offsetD,
1570 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1571 &jy2,&jz2,&jx3,&jy3,&jz3);
1573 /* Calculate displacement vector */
1574 dx00 = _mm_sub_ps(ix0,jx0);
1575 dy00 = _mm_sub_ps(iy0,jy0);
1576 dz00 = _mm_sub_ps(iz0,jz0);
1577 dx11 = _mm_sub_ps(ix1,jx1);
1578 dy11 = _mm_sub_ps(iy1,jy1);
1579 dz11 = _mm_sub_ps(iz1,jz1);
1580 dx12 = _mm_sub_ps(ix1,jx2);
1581 dy12 = _mm_sub_ps(iy1,jy2);
1582 dz12 = _mm_sub_ps(iz1,jz2);
1583 dx13 = _mm_sub_ps(ix1,jx3);
1584 dy13 = _mm_sub_ps(iy1,jy3);
1585 dz13 = _mm_sub_ps(iz1,jz3);
1586 dx21 = _mm_sub_ps(ix2,jx1);
1587 dy21 = _mm_sub_ps(iy2,jy1);
1588 dz21 = _mm_sub_ps(iz2,jz1);
1589 dx22 = _mm_sub_ps(ix2,jx2);
1590 dy22 = _mm_sub_ps(iy2,jy2);
1591 dz22 = _mm_sub_ps(iz2,jz2);
1592 dx23 = _mm_sub_ps(ix2,jx3);
1593 dy23 = _mm_sub_ps(iy2,jy3);
1594 dz23 = _mm_sub_ps(iz2,jz3);
1595 dx31 = _mm_sub_ps(ix3,jx1);
1596 dy31 = _mm_sub_ps(iy3,jy1);
1597 dz31 = _mm_sub_ps(iz3,jz1);
1598 dx32 = _mm_sub_ps(ix3,jx2);
1599 dy32 = _mm_sub_ps(iy3,jy2);
1600 dz32 = _mm_sub_ps(iz3,jz2);
1601 dx33 = _mm_sub_ps(ix3,jx3);
1602 dy33 = _mm_sub_ps(iy3,jy3);
1603 dz33 = _mm_sub_ps(iz3,jz3);
1605 /* Calculate squared distance and things based on it */
1606 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1607 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1608 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1609 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1610 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1611 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1612 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1613 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1614 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1615 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1617 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1618 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1619 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1620 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1621 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1622 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1623 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1624 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1625 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1627 rinvsq00 = gmx_mm_inv_ps(rsq00);
1628 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1629 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1630 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1631 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1632 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1633 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1634 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1635 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1636 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1638 fjx0 = _mm_setzero_ps();
1639 fjy0 = _mm_setzero_ps();
1640 fjz0 = _mm_setzero_ps();
1641 fjx1 = _mm_setzero_ps();
1642 fjy1 = _mm_setzero_ps();
1643 fjz1 = _mm_setzero_ps();
1644 fjx2 = _mm_setzero_ps();
1645 fjy2 = _mm_setzero_ps();
1646 fjz2 = _mm_setzero_ps();
1647 fjx3 = _mm_setzero_ps();
1648 fjy3 = _mm_setzero_ps();
1649 fjz3 = _mm_setzero_ps();
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 /* LENNARD-JONES DISPERSION/REPULSION */
1657 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1658 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
/* Zero the scalar force in lanes that hold dummy entries; the same masking is
 * repeated in every interaction section of this remainder step.
 */
1662 fscal = _mm_andnot_ps(dummy_mask,fscal);
1664 /* Calculate temporary vectorial force */
1665 tx = _mm_mul_ps(fscal,dx00);
1666 ty = _mm_mul_ps(fscal,dy00);
1667 tz = _mm_mul_ps(fscal,dz00);
1669 /* Update vectorial force */
1670 fix0 = _mm_add_ps(fix0,tx);
1671 fiy0 = _mm_add_ps(fiy0,ty);
1672 fiz0 = _mm_add_ps(fiz0,tz);
1674 fjx0 = _mm_add_ps(fjx0,tx);
1675 fjy0 = _mm_add_ps(fjy0,ty);
1676 fjz0 = _mm_add_ps(fjz0,tz);
1678 /**************************
1679 * CALCULATE INTERACTIONS *
1680 **************************/
1682 /* REACTION-FIELD ELECTROSTATICS */
1683 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1687 fscal = _mm_andnot_ps(dummy_mask,fscal);
1689 /* Calculate temporary vectorial force */
1690 tx = _mm_mul_ps(fscal,dx11);
1691 ty = _mm_mul_ps(fscal,dy11);
1692 tz = _mm_mul_ps(fscal,dz11);
1694 /* Update vectorial force */
1695 fix1 = _mm_add_ps(fix1,tx);
1696 fiy1 = _mm_add_ps(fiy1,ty);
1697 fiz1 = _mm_add_ps(fiz1,tz);
1699 fjx1 = _mm_add_ps(fjx1,tx);
1700 fjy1 = _mm_add_ps(fjy1,ty);
1701 fjz1 = _mm_add_ps(fjz1,tz);
1703 /**************************
1704 * CALCULATE INTERACTIONS *
1705 **************************/
1707 /* REACTION-FIELD ELECTROSTATICS */
1708 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1712 fscal = _mm_andnot_ps(dummy_mask,fscal);
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_ps(fscal,dx12);
1716 ty = _mm_mul_ps(fscal,dy12);
1717 tz = _mm_mul_ps(fscal,dz12);
1719 /* Update vectorial force */
1720 fix1 = _mm_add_ps(fix1,tx);
1721 fiy1 = _mm_add_ps(fiy1,ty);
1722 fiz1 = _mm_add_ps(fiz1,tz);
1724 fjx2 = _mm_add_ps(fjx2,tx);
1725 fjy2 = _mm_add_ps(fjy2,ty);
1726 fjz2 = _mm_add_ps(fjz2,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 /* REACTION-FIELD ELECTROSTATICS */
1733 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1737 fscal = _mm_andnot_ps(dummy_mask,fscal);
1739 /* Calculate temporary vectorial force */
1740 tx = _mm_mul_ps(fscal,dx13);
1741 ty = _mm_mul_ps(fscal,dy13);
1742 tz = _mm_mul_ps(fscal,dz13);
1744 /* Update vectorial force */
1745 fix1 = _mm_add_ps(fix1,tx);
1746 fiy1 = _mm_add_ps(fiy1,ty);
1747 fiz1 = _mm_add_ps(fiz1,tz);
1749 fjx3 = _mm_add_ps(fjx3,tx);
1750 fjy3 = _mm_add_ps(fjy3,ty);
1751 fjz3 = _mm_add_ps(fjz3,tz);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 /* REACTION-FIELD ELECTROSTATICS */
1758 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1762 fscal = _mm_andnot_ps(dummy_mask,fscal);
1764 /* Calculate temporary vectorial force */
1765 tx = _mm_mul_ps(fscal,dx21);
1766 ty = _mm_mul_ps(fscal,dy21);
1767 tz = _mm_mul_ps(fscal,dz21);
1769 /* Update vectorial force */
1770 fix2 = _mm_add_ps(fix2,tx);
1771 fiy2 = _mm_add_ps(fiy2,ty);
1772 fiz2 = _mm_add_ps(fiz2,tz);
1774 fjx1 = _mm_add_ps(fjx1,tx);
1775 fjy1 = _mm_add_ps(fjy1,ty);
1776 fjz1 = _mm_add_ps(fjz1,tz);
1778 /**************************
1779 * CALCULATE INTERACTIONS *
1780 **************************/
1782 /* REACTION-FIELD ELECTROSTATICS */
1783 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1787 fscal = _mm_andnot_ps(dummy_mask,fscal);
1789 /* Calculate temporary vectorial force */
1790 tx = _mm_mul_ps(fscal,dx22);
1791 ty = _mm_mul_ps(fscal,dy22);
1792 tz = _mm_mul_ps(fscal,dz22);
1794 /* Update vectorial force */
1795 fix2 = _mm_add_ps(fix2,tx);
1796 fiy2 = _mm_add_ps(fiy2,ty);
1797 fiz2 = _mm_add_ps(fiz2,tz);
1799 fjx2 = _mm_add_ps(fjx2,tx);
1800 fjy2 = _mm_add_ps(fjy2,ty);
1801 fjz2 = _mm_add_ps(fjz2,tz);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 /* REACTION-FIELD ELECTROSTATICS */
1808 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1812 fscal = _mm_andnot_ps(dummy_mask,fscal);
1814 /* Calculate temporary vectorial force */
1815 tx = _mm_mul_ps(fscal,dx23);
1816 ty = _mm_mul_ps(fscal,dy23);
1817 tz = _mm_mul_ps(fscal,dz23);
1819 /* Update vectorial force */
1820 fix2 = _mm_add_ps(fix2,tx);
1821 fiy2 = _mm_add_ps(fiy2,ty);
1822 fiz2 = _mm_add_ps(fiz2,tz);
1824 fjx3 = _mm_add_ps(fjx3,tx);
1825 fjy3 = _mm_add_ps(fjy3,ty);
1826 fjz3 = _mm_add_ps(fjz3,tz);
1828 /**************************
1829 * CALCULATE INTERACTIONS *
1830 **************************/
1832 /* REACTION-FIELD ELECTROSTATICS */
1833 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1837 fscal = _mm_andnot_ps(dummy_mask,fscal);
1839 /* Calculate temporary vectorial force */
1840 tx = _mm_mul_ps(fscal,dx31);
1841 ty = _mm_mul_ps(fscal,dy31);
1842 tz = _mm_mul_ps(fscal,dz31);
1844 /* Update vectorial force */
1845 fix3 = _mm_add_ps(fix3,tx);
1846 fiy3 = _mm_add_ps(fiy3,ty);
1847 fiz3 = _mm_add_ps(fiz3,tz);
1849 fjx1 = _mm_add_ps(fjx1,tx);
1850 fjy1 = _mm_add_ps(fjy1,ty);
1851 fjz1 = _mm_add_ps(fjz1,tz);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 /* REACTION-FIELD ELECTROSTATICS */
1858 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1862 fscal = _mm_andnot_ps(dummy_mask,fscal);
1864 /* Calculate temporary vectorial force */
1865 tx = _mm_mul_ps(fscal,dx32);
1866 ty = _mm_mul_ps(fscal,dy32);
1867 tz = _mm_mul_ps(fscal,dz32);
1869 /* Update vectorial force */
1870 fix3 = _mm_add_ps(fix3,tx);
1871 fiy3 = _mm_add_ps(fiy3,ty);
1872 fiz3 = _mm_add_ps(fiz3,tz);
1874 fjx2 = _mm_add_ps(fjx2,tx);
1875 fjy2 = _mm_add_ps(fjy2,ty);
1876 fjz2 = _mm_add_ps(fjz2,tz);
1878 /**************************
1879 * CALCULATE INTERACTIONS *
1880 **************************/
1882 /* REACTION-FIELD ELECTROSTATICS */
1883 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1887 fscal = _mm_andnot_ps(dummy_mask,fscal);
1889 /* Calculate temporary vectorial force */
1890 tx = _mm_mul_ps(fscal,dx33);
1891 ty = _mm_mul_ps(fscal,dy33);
1892 tz = _mm_mul_ps(fscal,dz33);
1894 /* Update vectorial force */
1895 fix3 = _mm_add_ps(fix3,tx);
1896 fiy3 = _mm_add_ps(fiy3,ty);
1897 fiz3 = _mm_add_ps(fiz3,tz);
1899 fjx3 = _mm_add_ps(fjx3,tx);
1900 fjy3 = _mm_add_ps(fjy3,ty);
1901 fjz3 = _mm_add_ps(fjz3,tz);
/* Entries with a negative jnrlist value send their j forces to scratch instead of f */
1903 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1904 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1905 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1906 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1908 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1909 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1910 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1912 /* Inner loop uses 273 flops */
1915 /* End of innermost loop */
/* Hand the accumulated i forces for the four sites to the update helper,
 * together with the force and shift-force destinations for this list.
 */
1917 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1918 f+i_coord_offset,fshift+i_shift_offset);
1920 /* Increment number of inner iterations */
1921 inneriter += j_index_end - j_index_start;
1923 /* Outer loop uses 24 flops */
1926 /* Increment number of outer iterations */
1929 /* Update outer/inner flops */
/* Flop accounting uses 24 per outer iteration and 273 per inner iteration */
1931 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);