2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_single
52 * Electrostatics interaction: ReactionField (with cutoff)
53 * VdW interaction: LennardJones (with potential switch)
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
117 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118 real rswitch_scalar,d_scalar;
119 __m128 dummy_mask,cutoff_mask;
120 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
121 __m128 one = _mm_set1_ps(1.0);
122 __m128 two = _mm_set1_ps(2.0);
128 jindex = nlist->jindex;
130 shiftidx = nlist->shift;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = _mm_set1_ps(fr->epsfac);
135 charge = mdatoms->chargeA;
136 krf = _mm_set1_ps(fr->ic->k_rf);
137 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
138 crf = _mm_set1_ps(fr->ic->c_rf);
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
150 jq1 = _mm_set1_ps(charge[inr+1]);
151 jq2 = _mm_set1_ps(charge[inr+2]);
152 jq3 = _mm_set1_ps(charge[inr+3]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156 qq11 = _mm_mul_ps(iq1,jq1);
157 qq12 = _mm_mul_ps(iq1,jq2);
158 qq13 = _mm_mul_ps(iq1,jq3);
159 qq21 = _mm_mul_ps(iq2,jq1);
160 qq22 = _mm_mul_ps(iq2,jq2);
161 qq23 = _mm_mul_ps(iq2,jq3);
162 qq31 = _mm_mul_ps(iq3,jq1);
163 qq32 = _mm_mul_ps(iq3,jq2);
164 qq33 = _mm_mul_ps(iq3,jq3);
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar = fr->rcoulomb;
168 rcutoff = _mm_set1_ps(rcutoff_scalar);
169 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
171 rswitch_scalar = fr->rvdw_switch;
172 rswitch = _mm_set1_ps(rswitch_scalar);
173 /* Setup switch parameters */
174 d_scalar = rcutoff_scalar-rswitch_scalar;
175 d = _mm_set1_ps(d_scalar);
176 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
177 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
180 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
181 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183 /* Avoid stupid compiler warnings */
184 jnrA = jnrB = jnrC = jnrD = 0;
193 for(iidx=0;iidx<4*DIM;iidx++)
198 /* Start outer loop over neighborlists */
199 for(iidx=0; iidx<nri; iidx++)
201 /* Load shift vector for this list */
202 i_shift_offset = DIM*shiftidx[iidx];
204 /* Load limits for loop over neighbors */
205 j_index_start = jindex[iidx];
206 j_index_end = jindex[iidx+1];
208 /* Get outer coordinate index */
210 i_coord_offset = DIM*inr;
212 /* Load i particle coords and add shift vector */
213 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
214 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
216 fix0 = _mm_setzero_ps();
217 fiy0 = _mm_setzero_ps();
218 fiz0 = _mm_setzero_ps();
219 fix1 = _mm_setzero_ps();
220 fiy1 = _mm_setzero_ps();
221 fiz1 = _mm_setzero_ps();
222 fix2 = _mm_setzero_ps();
223 fiy2 = _mm_setzero_ps();
224 fiz2 = _mm_setzero_ps();
225 fix3 = _mm_setzero_ps();
226 fiy3 = _mm_setzero_ps();
227 fiz3 = _mm_setzero_ps();
229 /* Reset potential sums */
230 velecsum = _mm_setzero_ps();
231 vvdwsum = _mm_setzero_ps();
233 /* Start inner kernel loop */
234 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
237 /* Get j neighbor index, and coordinate index */
242 j_coord_offsetA = DIM*jnrA;
243 j_coord_offsetB = DIM*jnrB;
244 j_coord_offsetC = DIM*jnrC;
245 j_coord_offsetD = DIM*jnrD;
247 /* load j atom coordinates */
248 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
249 x+j_coord_offsetC,x+j_coord_offsetD,
250 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
251 &jy2,&jz2,&jx3,&jy3,&jz3);
253 /* Calculate displacement vector */
254 dx00 = _mm_sub_ps(ix0,jx0);
255 dy00 = _mm_sub_ps(iy0,jy0);
256 dz00 = _mm_sub_ps(iz0,jz0);
257 dx11 = _mm_sub_ps(ix1,jx1);
258 dy11 = _mm_sub_ps(iy1,jy1);
259 dz11 = _mm_sub_ps(iz1,jz1);
260 dx12 = _mm_sub_ps(ix1,jx2);
261 dy12 = _mm_sub_ps(iy1,jy2);
262 dz12 = _mm_sub_ps(iz1,jz2);
263 dx13 = _mm_sub_ps(ix1,jx3);
264 dy13 = _mm_sub_ps(iy1,jy3);
265 dz13 = _mm_sub_ps(iz1,jz3);
266 dx21 = _mm_sub_ps(ix2,jx1);
267 dy21 = _mm_sub_ps(iy2,jy1);
268 dz21 = _mm_sub_ps(iz2,jz1);
269 dx22 = _mm_sub_ps(ix2,jx2);
270 dy22 = _mm_sub_ps(iy2,jy2);
271 dz22 = _mm_sub_ps(iz2,jz2);
272 dx23 = _mm_sub_ps(ix2,jx3);
273 dy23 = _mm_sub_ps(iy2,jy3);
274 dz23 = _mm_sub_ps(iz2,jz3);
275 dx31 = _mm_sub_ps(ix3,jx1);
276 dy31 = _mm_sub_ps(iy3,jy1);
277 dz31 = _mm_sub_ps(iz3,jz1);
278 dx32 = _mm_sub_ps(ix3,jx2);
279 dy32 = _mm_sub_ps(iy3,jy2);
280 dz32 = _mm_sub_ps(iz3,jz2);
281 dx33 = _mm_sub_ps(ix3,jx3);
282 dy33 = _mm_sub_ps(iy3,jy3);
283 dz33 = _mm_sub_ps(iz3,jz3);
285 /* Calculate squared distance and things based on it */
286 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
287 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
288 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
289 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
290 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
291 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
292 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
293 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
294 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
295 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
297 rinv00 = gmx_mm_invsqrt_ps(rsq00);
298 rinv11 = gmx_mm_invsqrt_ps(rsq11);
299 rinv12 = gmx_mm_invsqrt_ps(rsq12);
300 rinv13 = gmx_mm_invsqrt_ps(rsq13);
301 rinv21 = gmx_mm_invsqrt_ps(rsq21);
302 rinv22 = gmx_mm_invsqrt_ps(rsq22);
303 rinv23 = gmx_mm_invsqrt_ps(rsq23);
304 rinv31 = gmx_mm_invsqrt_ps(rsq31);
305 rinv32 = gmx_mm_invsqrt_ps(rsq32);
306 rinv33 = gmx_mm_invsqrt_ps(rsq33);
308 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
309 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
310 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
311 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
312 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
313 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
314 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
315 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
316 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
317 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
319 fjx0 = _mm_setzero_ps();
320 fjy0 = _mm_setzero_ps();
321 fjz0 = _mm_setzero_ps();
322 fjx1 = _mm_setzero_ps();
323 fjy1 = _mm_setzero_ps();
324 fjz1 = _mm_setzero_ps();
325 fjx2 = _mm_setzero_ps();
326 fjy2 = _mm_setzero_ps();
327 fjz2 = _mm_setzero_ps();
328 fjx3 = _mm_setzero_ps();
329 fjy3 = _mm_setzero_ps();
330 fjz3 = _mm_setzero_ps();
332 /**************************
333 * CALCULATE INTERACTIONS *
334 **************************/
336 if (gmx_mm_any_lt(rsq00,rcutoff2))
339 r00 = _mm_mul_ps(rsq00,rinv00);
341 /* LENNARD-JONES DISPERSION/REPULSION */
343 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
344 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
345 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
346 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
347 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
349 d = _mm_sub_ps(r00,rswitch);
350 d = _mm_max_ps(d,_mm_setzero_ps());
351 d2 = _mm_mul_ps(d,d);
352 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
354 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
356 /* Evaluate switch function */
357 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
358 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
359 vvdw = _mm_mul_ps(vvdw,sw);
360 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 vvdw = _mm_and_ps(vvdw,cutoff_mask);
364 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
368 fscal = _mm_and_ps(fscal,cutoff_mask);
370 /* Calculate temporary vectorial force */
371 tx = _mm_mul_ps(fscal,dx00);
372 ty = _mm_mul_ps(fscal,dy00);
373 tz = _mm_mul_ps(fscal,dz00);
375 /* Update vectorial force */
376 fix0 = _mm_add_ps(fix0,tx);
377 fiy0 = _mm_add_ps(fiy0,ty);
378 fiz0 = _mm_add_ps(fiz0,tz);
380 fjx0 = _mm_add_ps(fjx0,tx);
381 fjy0 = _mm_add_ps(fjy0,ty);
382 fjz0 = _mm_add_ps(fjz0,tz);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm_any_lt(rsq11,rcutoff2))
393 /* REACTION-FIELD ELECTROSTATICS */
394 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
395 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
397 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velec = _mm_and_ps(velec,cutoff_mask);
401 velecsum = _mm_add_ps(velecsum,velec);
405 fscal = _mm_and_ps(fscal,cutoff_mask);
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_ps(fscal,dx11);
409 ty = _mm_mul_ps(fscal,dy11);
410 tz = _mm_mul_ps(fscal,dz11);
412 /* Update vectorial force */
413 fix1 = _mm_add_ps(fix1,tx);
414 fiy1 = _mm_add_ps(fiy1,ty);
415 fiz1 = _mm_add_ps(fiz1,tz);
417 fjx1 = _mm_add_ps(fjx1,tx);
418 fjy1 = _mm_add_ps(fjy1,ty);
419 fjz1 = _mm_add_ps(fjz1,tz);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 if (gmx_mm_any_lt(rsq12,rcutoff2))
430 /* REACTION-FIELD ELECTROSTATICS */
431 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
432 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
434 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velec = _mm_and_ps(velec,cutoff_mask);
438 velecsum = _mm_add_ps(velecsum,velec);
442 fscal = _mm_and_ps(fscal,cutoff_mask);
444 /* Calculate temporary vectorial force */
445 tx = _mm_mul_ps(fscal,dx12);
446 ty = _mm_mul_ps(fscal,dy12);
447 tz = _mm_mul_ps(fscal,dz12);
449 /* Update vectorial force */
450 fix1 = _mm_add_ps(fix1,tx);
451 fiy1 = _mm_add_ps(fiy1,ty);
452 fiz1 = _mm_add_ps(fiz1,tz);
454 fjx2 = _mm_add_ps(fjx2,tx);
455 fjy2 = _mm_add_ps(fjy2,ty);
456 fjz2 = _mm_add_ps(fjz2,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 if (gmx_mm_any_lt(rsq13,rcutoff2))
467 /* REACTION-FIELD ELECTROSTATICS */
468 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
469 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
471 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velec = _mm_and_ps(velec,cutoff_mask);
475 velecsum = _mm_add_ps(velecsum,velec);
479 fscal = _mm_and_ps(fscal,cutoff_mask);
481 /* Calculate temporary vectorial force */
482 tx = _mm_mul_ps(fscal,dx13);
483 ty = _mm_mul_ps(fscal,dy13);
484 tz = _mm_mul_ps(fscal,dz13);
486 /* Update vectorial force */
487 fix1 = _mm_add_ps(fix1,tx);
488 fiy1 = _mm_add_ps(fiy1,ty);
489 fiz1 = _mm_add_ps(fiz1,tz);
491 fjx3 = _mm_add_ps(fjx3,tx);
492 fjy3 = _mm_add_ps(fjy3,ty);
493 fjz3 = _mm_add_ps(fjz3,tz);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 if (gmx_mm_any_lt(rsq21,rcutoff2))
504 /* REACTION-FIELD ELECTROSTATICS */
505 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
506 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
508 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velec = _mm_and_ps(velec,cutoff_mask);
512 velecsum = _mm_add_ps(velecsum,velec);
516 fscal = _mm_and_ps(fscal,cutoff_mask);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_ps(fscal,dx21);
520 ty = _mm_mul_ps(fscal,dy21);
521 tz = _mm_mul_ps(fscal,dz21);
523 /* Update vectorial force */
524 fix2 = _mm_add_ps(fix2,tx);
525 fiy2 = _mm_add_ps(fiy2,ty);
526 fiz2 = _mm_add_ps(fiz2,tz);
528 fjx1 = _mm_add_ps(fjx1,tx);
529 fjy1 = _mm_add_ps(fjy1,ty);
530 fjz1 = _mm_add_ps(fjz1,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 if (gmx_mm_any_lt(rsq22,rcutoff2))
541 /* REACTION-FIELD ELECTROSTATICS */
542 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
543 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
545 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
547 /* Update potential sum for this i atom from the interaction with this j atom. */
548 velec = _mm_and_ps(velec,cutoff_mask);
549 velecsum = _mm_add_ps(velecsum,velec);
553 fscal = _mm_and_ps(fscal,cutoff_mask);
555 /* Calculate temporary vectorial force */
556 tx = _mm_mul_ps(fscal,dx22);
557 ty = _mm_mul_ps(fscal,dy22);
558 tz = _mm_mul_ps(fscal,dz22);
560 /* Update vectorial force */
561 fix2 = _mm_add_ps(fix2,tx);
562 fiy2 = _mm_add_ps(fiy2,ty);
563 fiz2 = _mm_add_ps(fiz2,tz);
565 fjx2 = _mm_add_ps(fjx2,tx);
566 fjy2 = _mm_add_ps(fjy2,ty);
567 fjz2 = _mm_add_ps(fjz2,tz);
571 /**************************
572 * CALCULATE INTERACTIONS *
573 **************************/
575 if (gmx_mm_any_lt(rsq23,rcutoff2))
578 /* REACTION-FIELD ELECTROSTATICS */
579 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
580 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
582 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velec = _mm_and_ps(velec,cutoff_mask);
586 velecsum = _mm_add_ps(velecsum,velec);
590 fscal = _mm_and_ps(fscal,cutoff_mask);
592 /* Calculate temporary vectorial force */
593 tx = _mm_mul_ps(fscal,dx23);
594 ty = _mm_mul_ps(fscal,dy23);
595 tz = _mm_mul_ps(fscal,dz23);
597 /* Update vectorial force */
598 fix2 = _mm_add_ps(fix2,tx);
599 fiy2 = _mm_add_ps(fiy2,ty);
600 fiz2 = _mm_add_ps(fiz2,tz);
602 fjx3 = _mm_add_ps(fjx3,tx);
603 fjy3 = _mm_add_ps(fjy3,ty);
604 fjz3 = _mm_add_ps(fjz3,tz);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 if (gmx_mm_any_lt(rsq31,rcutoff2))
615 /* REACTION-FIELD ELECTROSTATICS */
616 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
617 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
619 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velec = _mm_and_ps(velec,cutoff_mask);
623 velecsum = _mm_add_ps(velecsum,velec);
627 fscal = _mm_and_ps(fscal,cutoff_mask);
629 /* Calculate temporary vectorial force */
630 tx = _mm_mul_ps(fscal,dx31);
631 ty = _mm_mul_ps(fscal,dy31);
632 tz = _mm_mul_ps(fscal,dz31);
634 /* Update vectorial force */
635 fix3 = _mm_add_ps(fix3,tx);
636 fiy3 = _mm_add_ps(fiy3,ty);
637 fiz3 = _mm_add_ps(fiz3,tz);
639 fjx1 = _mm_add_ps(fjx1,tx);
640 fjy1 = _mm_add_ps(fjy1,ty);
641 fjz1 = _mm_add_ps(fjz1,tz);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 if (gmx_mm_any_lt(rsq32,rcutoff2))
652 /* REACTION-FIELD ELECTROSTATICS */
653 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
654 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
656 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velec = _mm_and_ps(velec,cutoff_mask);
660 velecsum = _mm_add_ps(velecsum,velec);
664 fscal = _mm_and_ps(fscal,cutoff_mask);
666 /* Calculate temporary vectorial force */
667 tx = _mm_mul_ps(fscal,dx32);
668 ty = _mm_mul_ps(fscal,dy32);
669 tz = _mm_mul_ps(fscal,dz32);
671 /* Update vectorial force */
672 fix3 = _mm_add_ps(fix3,tx);
673 fiy3 = _mm_add_ps(fiy3,ty);
674 fiz3 = _mm_add_ps(fiz3,tz);
676 fjx2 = _mm_add_ps(fjx2,tx);
677 fjy2 = _mm_add_ps(fjy2,ty);
678 fjz2 = _mm_add_ps(fjz2,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 if (gmx_mm_any_lt(rsq33,rcutoff2))
689 /* REACTION-FIELD ELECTROSTATICS */
690 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
691 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
693 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
695 /* Update potential sum for this i atom from the interaction with this j atom. */
696 velec = _mm_and_ps(velec,cutoff_mask);
697 velecsum = _mm_add_ps(velecsum,velec);
701 fscal = _mm_and_ps(fscal,cutoff_mask);
703 /* Calculate temporary vectorial force */
704 tx = _mm_mul_ps(fscal,dx33);
705 ty = _mm_mul_ps(fscal,dy33);
706 tz = _mm_mul_ps(fscal,dz33);
708 /* Update vectorial force */
709 fix3 = _mm_add_ps(fix3,tx);
710 fiy3 = _mm_add_ps(fiy3,ty);
711 fiz3 = _mm_add_ps(fiz3,tz);
713 fjx3 = _mm_add_ps(fjx3,tx);
714 fjy3 = _mm_add_ps(fjy3,ty);
715 fjz3 = _mm_add_ps(fjz3,tz);
719 fjptrA = f+j_coord_offsetA;
720 fjptrB = f+j_coord_offsetB;
721 fjptrC = f+j_coord_offsetC;
722 fjptrD = f+j_coord_offsetD;
724 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
725 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
726 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
728 /* Inner loop uses 386 flops */
734 /* Get j neighbor index, and coordinate index */
735 jnrlistA = jjnr[jidx];
736 jnrlistB = jjnr[jidx+1];
737 jnrlistC = jjnr[jidx+2];
738 jnrlistD = jjnr[jidx+3];
739 /* Sign of each element will be negative for non-real atoms.
740 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
741 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
743 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
744 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
745 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
746 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
747 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
748 j_coord_offsetA = DIM*jnrA;
749 j_coord_offsetB = DIM*jnrB;
750 j_coord_offsetC = DIM*jnrC;
751 j_coord_offsetD = DIM*jnrD;
753 /* load j atom coordinates */
754 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
755 x+j_coord_offsetC,x+j_coord_offsetD,
756 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
757 &jy2,&jz2,&jx3,&jy3,&jz3);
759 /* Calculate displacement vector */
760 dx00 = _mm_sub_ps(ix0,jx0);
761 dy00 = _mm_sub_ps(iy0,jy0);
762 dz00 = _mm_sub_ps(iz0,jz0);
763 dx11 = _mm_sub_ps(ix1,jx1);
764 dy11 = _mm_sub_ps(iy1,jy1);
765 dz11 = _mm_sub_ps(iz1,jz1);
766 dx12 = _mm_sub_ps(ix1,jx2);
767 dy12 = _mm_sub_ps(iy1,jy2);
768 dz12 = _mm_sub_ps(iz1,jz2);
769 dx13 = _mm_sub_ps(ix1,jx3);
770 dy13 = _mm_sub_ps(iy1,jy3);
771 dz13 = _mm_sub_ps(iz1,jz3);
772 dx21 = _mm_sub_ps(ix2,jx1);
773 dy21 = _mm_sub_ps(iy2,jy1);
774 dz21 = _mm_sub_ps(iz2,jz1);
775 dx22 = _mm_sub_ps(ix2,jx2);
776 dy22 = _mm_sub_ps(iy2,jy2);
777 dz22 = _mm_sub_ps(iz2,jz2);
778 dx23 = _mm_sub_ps(ix2,jx3);
779 dy23 = _mm_sub_ps(iy2,jy3);
780 dz23 = _mm_sub_ps(iz2,jz3);
781 dx31 = _mm_sub_ps(ix3,jx1);
782 dy31 = _mm_sub_ps(iy3,jy1);
783 dz31 = _mm_sub_ps(iz3,jz1);
784 dx32 = _mm_sub_ps(ix3,jx2);
785 dy32 = _mm_sub_ps(iy3,jy2);
786 dz32 = _mm_sub_ps(iz3,jz2);
787 dx33 = _mm_sub_ps(ix3,jx3);
788 dy33 = _mm_sub_ps(iy3,jy3);
789 dz33 = _mm_sub_ps(iz3,jz3);
791 /* Calculate squared distance and things based on it */
792 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
793 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
794 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
795 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
796 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
797 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
798 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
799 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
800 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
801 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
803 rinv00 = gmx_mm_invsqrt_ps(rsq00);
804 rinv11 = gmx_mm_invsqrt_ps(rsq11);
805 rinv12 = gmx_mm_invsqrt_ps(rsq12);
806 rinv13 = gmx_mm_invsqrt_ps(rsq13);
807 rinv21 = gmx_mm_invsqrt_ps(rsq21);
808 rinv22 = gmx_mm_invsqrt_ps(rsq22);
809 rinv23 = gmx_mm_invsqrt_ps(rsq23);
810 rinv31 = gmx_mm_invsqrt_ps(rsq31);
811 rinv32 = gmx_mm_invsqrt_ps(rsq32);
812 rinv33 = gmx_mm_invsqrt_ps(rsq33);
814 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
815 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
816 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
817 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
818 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
819 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
820 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
821 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
822 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
823 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
825 fjx0 = _mm_setzero_ps();
826 fjy0 = _mm_setzero_ps();
827 fjz0 = _mm_setzero_ps();
828 fjx1 = _mm_setzero_ps();
829 fjy1 = _mm_setzero_ps();
830 fjz1 = _mm_setzero_ps();
831 fjx2 = _mm_setzero_ps();
832 fjy2 = _mm_setzero_ps();
833 fjz2 = _mm_setzero_ps();
834 fjx3 = _mm_setzero_ps();
835 fjy3 = _mm_setzero_ps();
836 fjz3 = _mm_setzero_ps();
838 /**************************
839 * CALCULATE INTERACTIONS *
840 **************************/
842 if (gmx_mm_any_lt(rsq00,rcutoff2))
845 r00 = _mm_mul_ps(rsq00,rinv00);
846 r00 = _mm_andnot_ps(dummy_mask,r00);
848 /* LENNARD-JONES DISPERSION/REPULSION */
850 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
851 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
852 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
853 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
854 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
856 d = _mm_sub_ps(r00,rswitch);
857 d = _mm_max_ps(d,_mm_setzero_ps());
858 d2 = _mm_mul_ps(d,d);
859 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
861 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
863 /* Evaluate switch function */
864 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
865 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
866 vvdw = _mm_mul_ps(vvdw,sw);
867 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
869 /* Update potential sum for this i atom from the interaction with this j atom. */
870 vvdw = _mm_and_ps(vvdw,cutoff_mask);
871 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
872 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
876 fscal = _mm_and_ps(fscal,cutoff_mask);
878 fscal = _mm_andnot_ps(dummy_mask,fscal);
880 /* Calculate temporary vectorial force */
881 tx = _mm_mul_ps(fscal,dx00);
882 ty = _mm_mul_ps(fscal,dy00);
883 tz = _mm_mul_ps(fscal,dz00);
885 /* Update vectorial force */
886 fix0 = _mm_add_ps(fix0,tx);
887 fiy0 = _mm_add_ps(fiy0,ty);
888 fiz0 = _mm_add_ps(fiz0,tz);
890 fjx0 = _mm_add_ps(fjx0,tx);
891 fjy0 = _mm_add_ps(fjy0,ty);
892 fjz0 = _mm_add_ps(fjz0,tz);
896 /**************************
897 * CALCULATE INTERACTIONS *
898 **************************/
900 if (gmx_mm_any_lt(rsq11,rcutoff2))
903 /* REACTION-FIELD ELECTROSTATICS */
904 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
905 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
907 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
909 /* Update potential sum for this i atom from the interaction with this j atom. */
910 velec = _mm_and_ps(velec,cutoff_mask);
911 velec = _mm_andnot_ps(dummy_mask,velec);
912 velecsum = _mm_add_ps(velecsum,velec);
916 fscal = _mm_and_ps(fscal,cutoff_mask);
918 fscal = _mm_andnot_ps(dummy_mask,fscal);
920 /* Calculate temporary vectorial force */
921 tx = _mm_mul_ps(fscal,dx11);
922 ty = _mm_mul_ps(fscal,dy11);
923 tz = _mm_mul_ps(fscal,dz11);
925 /* Update vectorial force */
926 fix1 = _mm_add_ps(fix1,tx);
927 fiy1 = _mm_add_ps(fiy1,ty);
928 fiz1 = _mm_add_ps(fiz1,tz);
930 fjx1 = _mm_add_ps(fjx1,tx);
931 fjy1 = _mm_add_ps(fjy1,ty);
932 fjz1 = _mm_add_ps(fjz1,tz);
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 if (gmx_mm_any_lt(rsq12,rcutoff2))
943 /* REACTION-FIELD ELECTROSTATICS */
944 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
945 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
947 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm_and_ps(velec,cutoff_mask);
951 velec = _mm_andnot_ps(dummy_mask,velec);
952 velecsum = _mm_add_ps(velecsum,velec);
956 fscal = _mm_and_ps(fscal,cutoff_mask);
958 fscal = _mm_andnot_ps(dummy_mask,fscal);
960 /* Calculate temporary vectorial force */
961 tx = _mm_mul_ps(fscal,dx12);
962 ty = _mm_mul_ps(fscal,dy12);
963 tz = _mm_mul_ps(fscal,dz12);
965 /* Update vectorial force */
966 fix1 = _mm_add_ps(fix1,tx);
967 fiy1 = _mm_add_ps(fiy1,ty);
968 fiz1 = _mm_add_ps(fiz1,tz);
970 fjx2 = _mm_add_ps(fjx2,tx);
971 fjy2 = _mm_add_ps(fjy2,ty);
972 fjz2 = _mm_add_ps(fjz2,tz);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 if (gmx_mm_any_lt(rsq13,rcutoff2))
983 /* REACTION-FIELD ELECTROSTATICS */
984 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
985 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
987 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
989 /* Update potential sum for this i atom from the interaction with this j atom. */
990 velec = _mm_and_ps(velec,cutoff_mask);
991 velec = _mm_andnot_ps(dummy_mask,velec);
992 velecsum = _mm_add_ps(velecsum,velec);
996 fscal = _mm_and_ps(fscal,cutoff_mask);
998 fscal = _mm_andnot_ps(dummy_mask,fscal);
1000 /* Calculate temporary vectorial force */
1001 tx = _mm_mul_ps(fscal,dx13);
1002 ty = _mm_mul_ps(fscal,dy13);
1003 tz = _mm_mul_ps(fscal,dz13);
1005 /* Update vectorial force */
1006 fix1 = _mm_add_ps(fix1,tx);
1007 fiy1 = _mm_add_ps(fiy1,ty);
1008 fiz1 = _mm_add_ps(fiz1,tz);
1010 fjx3 = _mm_add_ps(fjx3,tx);
1011 fjy3 = _mm_add_ps(fjy3,ty);
1012 fjz3 = _mm_add_ps(fjz3,tz);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 if (gmx_mm_any_lt(rsq21,rcutoff2))
1023 /* REACTION-FIELD ELECTROSTATICS */
1024 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1025 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1027 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1029 /* Update potential sum for this i atom from the interaction with this j atom. */
1030 velec = _mm_and_ps(velec,cutoff_mask);
1031 velec = _mm_andnot_ps(dummy_mask,velec);
1032 velecsum = _mm_add_ps(velecsum,velec);
1036 fscal = _mm_and_ps(fscal,cutoff_mask);
1038 fscal = _mm_andnot_ps(dummy_mask,fscal);
1040 /* Calculate temporary vectorial force */
1041 tx = _mm_mul_ps(fscal,dx21);
1042 ty = _mm_mul_ps(fscal,dy21);
1043 tz = _mm_mul_ps(fscal,dz21);
1045 /* Update vectorial force */
1046 fix2 = _mm_add_ps(fix2,tx);
1047 fiy2 = _mm_add_ps(fiy2,ty);
1048 fiz2 = _mm_add_ps(fiz2,tz);
1050 fjx1 = _mm_add_ps(fjx1,tx);
1051 fjy1 = _mm_add_ps(fjy1,ty);
1052 fjz1 = _mm_add_ps(fjz1,tz);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 if (gmx_mm_any_lt(rsq22,rcutoff2))
1063 /* REACTION-FIELD ELECTROSTATICS */
1064 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1065 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1067 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1069 /* Update potential sum for this i atom from the interaction with this j atom. */
1070 velec = _mm_and_ps(velec,cutoff_mask);
1071 velec = _mm_andnot_ps(dummy_mask,velec);
1072 velecsum = _mm_add_ps(velecsum,velec);
1076 fscal = _mm_and_ps(fscal,cutoff_mask);
1078 fscal = _mm_andnot_ps(dummy_mask,fscal);
1080 /* Calculate temporary vectorial force */
1081 tx = _mm_mul_ps(fscal,dx22);
1082 ty = _mm_mul_ps(fscal,dy22);
1083 tz = _mm_mul_ps(fscal,dz22);
1085 /* Update vectorial force */
1086 fix2 = _mm_add_ps(fix2,tx);
1087 fiy2 = _mm_add_ps(fiy2,ty);
1088 fiz2 = _mm_add_ps(fiz2,tz);
1090 fjx2 = _mm_add_ps(fjx2,tx);
1091 fjy2 = _mm_add_ps(fjy2,ty);
1092 fjz2 = _mm_add_ps(fjz2,tz);
1096 /**************************
1097 * CALCULATE INTERACTIONS *
1098 **************************/
1100 if (gmx_mm_any_lt(rsq23,rcutoff2))
1103 /* REACTION-FIELD ELECTROSTATICS */
1104 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1105 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1107 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1109 /* Update potential sum for this i atom from the interaction with this j atom. */
1110 velec = _mm_and_ps(velec,cutoff_mask);
1111 velec = _mm_andnot_ps(dummy_mask,velec);
1112 velecsum = _mm_add_ps(velecsum,velec);
1116 fscal = _mm_and_ps(fscal,cutoff_mask);
1118 fscal = _mm_andnot_ps(dummy_mask,fscal);
1120 /* Calculate temporary vectorial force */
1121 tx = _mm_mul_ps(fscal,dx23);
1122 ty = _mm_mul_ps(fscal,dy23);
1123 tz = _mm_mul_ps(fscal,dz23);
1125 /* Update vectorial force */
1126 fix2 = _mm_add_ps(fix2,tx);
1127 fiy2 = _mm_add_ps(fiy2,ty);
1128 fiz2 = _mm_add_ps(fiz2,tz);
1130 fjx3 = _mm_add_ps(fjx3,tx);
1131 fjy3 = _mm_add_ps(fjy3,ty);
1132 fjz3 = _mm_add_ps(fjz3,tz);
1136 /**************************
1137 * CALCULATE INTERACTIONS *
1138 **************************/
1140 if (gmx_mm_any_lt(rsq31,rcutoff2))
1143 /* REACTION-FIELD ELECTROSTATICS */
1144 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1145 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1147 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm_and_ps(velec,cutoff_mask);
1151 velec = _mm_andnot_ps(dummy_mask,velec);
1152 velecsum = _mm_add_ps(velecsum,velec);
1156 fscal = _mm_and_ps(fscal,cutoff_mask);
1158 fscal = _mm_andnot_ps(dummy_mask,fscal);
1160 /* Calculate temporary vectorial force */
1161 tx = _mm_mul_ps(fscal,dx31);
1162 ty = _mm_mul_ps(fscal,dy31);
1163 tz = _mm_mul_ps(fscal,dz31);
1165 /* Update vectorial force */
1166 fix3 = _mm_add_ps(fix3,tx);
1167 fiy3 = _mm_add_ps(fiy3,ty);
1168 fiz3 = _mm_add_ps(fiz3,tz);
1170 fjx1 = _mm_add_ps(fjx1,tx);
1171 fjy1 = _mm_add_ps(fjy1,ty);
1172 fjz1 = _mm_add_ps(fjz1,tz);
1176 /**************************
1177 * CALCULATE INTERACTIONS *
1178 **************************/
1180 if (gmx_mm_any_lt(rsq32,rcutoff2))
1183 /* REACTION-FIELD ELECTROSTATICS */
1184 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1185 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1187 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec = _mm_and_ps(velec,cutoff_mask);
1191 velec = _mm_andnot_ps(dummy_mask,velec);
1192 velecsum = _mm_add_ps(velecsum,velec);
1196 fscal = _mm_and_ps(fscal,cutoff_mask);
1198 fscal = _mm_andnot_ps(dummy_mask,fscal);
1200 /* Calculate temporary vectorial force */
1201 tx = _mm_mul_ps(fscal,dx32);
1202 ty = _mm_mul_ps(fscal,dy32);
1203 tz = _mm_mul_ps(fscal,dz32);
1205 /* Update vectorial force */
1206 fix3 = _mm_add_ps(fix3,tx);
1207 fiy3 = _mm_add_ps(fiy3,ty);
1208 fiz3 = _mm_add_ps(fiz3,tz);
1210 fjx2 = _mm_add_ps(fjx2,tx);
1211 fjy2 = _mm_add_ps(fjy2,ty);
1212 fjz2 = _mm_add_ps(fjz2,tz);
1216 /**************************
1217 * CALCULATE INTERACTIONS *
1218 **************************/
1220 if (gmx_mm_any_lt(rsq33,rcutoff2))
1223 /* REACTION-FIELD ELECTROSTATICS */
1224 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1225 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1227 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1229 /* Update potential sum for this i atom from the interaction with this j atom. */
1230 velec = _mm_and_ps(velec,cutoff_mask);
1231 velec = _mm_andnot_ps(dummy_mask,velec);
1232 velecsum = _mm_add_ps(velecsum,velec);
1236 fscal = _mm_and_ps(fscal,cutoff_mask);
1238 fscal = _mm_andnot_ps(dummy_mask,fscal);
1240 /* Calculate temporary vectorial force */
1241 tx = _mm_mul_ps(fscal,dx33);
1242 ty = _mm_mul_ps(fscal,dy33);
1243 tz = _mm_mul_ps(fscal,dz33);
1245 /* Update vectorial force */
1246 fix3 = _mm_add_ps(fix3,tx);
1247 fiy3 = _mm_add_ps(fiy3,ty);
1248 fiz3 = _mm_add_ps(fiz3,tz);
1250 fjx3 = _mm_add_ps(fjx3,tx);
1251 fjy3 = _mm_add_ps(fjy3,ty);
1252 fjz3 = _mm_add_ps(fjz3,tz);
1256 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1257 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1258 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1259 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1261 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1262 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1263 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1265 /* Inner loop uses 387 flops */
1268 /* End of innermost loop */
1270 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1271 f+i_coord_offset,fshift+i_shift_offset);
1274 /* Update potential energies */
1275 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1276 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1278 /* Increment number of inner iterations */
1279 inneriter += j_index_end - j_index_start;
1281 /* Outer loop uses 26 flops */
1284 /* Increment number of outer iterations */
1287 /* Update outer/inner flops */
1289 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
1292 * GROMACS nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_single
1293 * Electrostatics interaction: ReactionField, with an explicit cutoff at fr->rcoulomb
1294 * VdW interaction: LennardJones, switched between fr->rvdw_switch and the cutoff
1295 * Geometry: Water4-Water4
1296 * Calculate force/pot: Force
1299 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_single
1300 (t_nblist * gmx_restrict nlist,
1301 rvec * gmx_restrict xx,
1302 rvec * gmx_restrict ff,
1303 t_forcerec * gmx_restrict fr,
1304 t_mdatoms * gmx_restrict mdatoms,
1305 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1306 t_nrnb * gmx_restrict nrnb)
1308 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1309 * just 0 for non-waters.
1310 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1311 * jnr indices corresponding to data put in the four positions in the SIMD register.
1313 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1314 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1315 int jnrA,jnrB,jnrC,jnrD;
1316 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1317 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1318 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1319 real rcutoff_scalar;
1320 real *shiftvec,*fshift,*x,*f;
1321 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1322 real scratch[4*DIM];
1323 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1325 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1327 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1329 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1331 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1332 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1333 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1334 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1335 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1336 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1337 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1338 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1339 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1340 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1341 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1342 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1343 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1344 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1345 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1346 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1347 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1348 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1349 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1350 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1353 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1356 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1357 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1358 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1359 real rswitch_scalar,d_scalar;
1360 __m128 dummy_mask,cutoff_mask;
1361 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1362 __m128 one = _mm_set1_ps(1.0);
1363 __m128 two = _mm_set1_ps(2.0);
1369 jindex = nlist->jindex;
1371 shiftidx = nlist->shift;
1373 shiftvec = fr->shift_vec[0];
1374 fshift = fr->fshift[0];
1375 facel = _mm_set1_ps(fr->epsfac);
1376 charge = mdatoms->chargeA;
1377 krf = _mm_set1_ps(fr->ic->k_rf);
1378 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1379 crf = _mm_set1_ps(fr->ic->c_rf);
1380 nvdwtype = fr->ntype;
1381 vdwparam = fr->nbfp;
1382 vdwtype = mdatoms->typeA;
1384 /* Setup water-specific parameters */
1385 inr = nlist->iinr[0];
1386 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1387 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1388 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1389 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1391 jq1 = _mm_set1_ps(charge[inr+1]);
1392 jq2 = _mm_set1_ps(charge[inr+2]);
1393 jq3 = _mm_set1_ps(charge[inr+3]);
1394 vdwjidx0A = 2*vdwtype[inr+0];
1395 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1396 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1397 qq11 = _mm_mul_ps(iq1,jq1);
1398 qq12 = _mm_mul_ps(iq1,jq2);
1399 qq13 = _mm_mul_ps(iq1,jq3);
1400 qq21 = _mm_mul_ps(iq2,jq1);
1401 qq22 = _mm_mul_ps(iq2,jq2);
1402 qq23 = _mm_mul_ps(iq2,jq3);
1403 qq31 = _mm_mul_ps(iq3,jq1);
1404 qq32 = _mm_mul_ps(iq3,jq2);
1405 qq33 = _mm_mul_ps(iq3,jq3);
1407 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1408 rcutoff_scalar = fr->rcoulomb;
1409 rcutoff = _mm_set1_ps(rcutoff_scalar);
1410 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1412 rswitch_scalar = fr->rvdw_switch;
1413 rswitch = _mm_set1_ps(rswitch_scalar);
1414 /* Setup switch parameters */
1415 d_scalar = rcutoff_scalar-rswitch_scalar;
1416 d = _mm_set1_ps(d_scalar);
1417 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1418 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1419 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1420 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1421 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1422 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1424 /* Avoid stupid compiler warnings */
1425 jnrA = jnrB = jnrC = jnrD = 0;
1426 j_coord_offsetA = 0;
1427 j_coord_offsetB = 0;
1428 j_coord_offsetC = 0;
1429 j_coord_offsetD = 0;
1434 for(iidx=0;iidx<4*DIM;iidx++)
1436 scratch[iidx] = 0.0;
1439 /* Start outer loop over neighborlists */
1440 for(iidx=0; iidx<nri; iidx++)
1442 /* Load shift vector for this list */
1443 i_shift_offset = DIM*shiftidx[iidx];
1445 /* Load limits for loop over neighbors */
1446 j_index_start = jindex[iidx];
1447 j_index_end = jindex[iidx+1];
1449 /* Get outer coordinate index */
1451 i_coord_offset = DIM*inr;
1453 /* Load i particle coords and add shift vector */
1454 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1455 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1457 fix0 = _mm_setzero_ps();
1458 fiy0 = _mm_setzero_ps();
1459 fiz0 = _mm_setzero_ps();
1460 fix1 = _mm_setzero_ps();
1461 fiy1 = _mm_setzero_ps();
1462 fiz1 = _mm_setzero_ps();
1463 fix2 = _mm_setzero_ps();
1464 fiy2 = _mm_setzero_ps();
1465 fiz2 = _mm_setzero_ps();
1466 fix3 = _mm_setzero_ps();
1467 fiy3 = _mm_setzero_ps();
1468 fiz3 = _mm_setzero_ps();
1470 /* Start inner kernel loop */
1471 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1474 /* Get j neighbor index, and coordinate index */
1476 jnrB = jjnr[jidx+1];
1477 jnrC = jjnr[jidx+2];
1478 jnrD = jjnr[jidx+3];
1479 j_coord_offsetA = DIM*jnrA;
1480 j_coord_offsetB = DIM*jnrB;
1481 j_coord_offsetC = DIM*jnrC;
1482 j_coord_offsetD = DIM*jnrD;
1484 /* load j atom coordinates */
1485 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1486 x+j_coord_offsetC,x+j_coord_offsetD,
1487 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1488 &jy2,&jz2,&jx3,&jy3,&jz3);
1490 /* Calculate displacement vector */
1491 dx00 = _mm_sub_ps(ix0,jx0);
1492 dy00 = _mm_sub_ps(iy0,jy0);
1493 dz00 = _mm_sub_ps(iz0,jz0);
1494 dx11 = _mm_sub_ps(ix1,jx1);
1495 dy11 = _mm_sub_ps(iy1,jy1);
1496 dz11 = _mm_sub_ps(iz1,jz1);
1497 dx12 = _mm_sub_ps(ix1,jx2);
1498 dy12 = _mm_sub_ps(iy1,jy2);
1499 dz12 = _mm_sub_ps(iz1,jz2);
1500 dx13 = _mm_sub_ps(ix1,jx3);
1501 dy13 = _mm_sub_ps(iy1,jy3);
1502 dz13 = _mm_sub_ps(iz1,jz3);
1503 dx21 = _mm_sub_ps(ix2,jx1);
1504 dy21 = _mm_sub_ps(iy2,jy1);
1505 dz21 = _mm_sub_ps(iz2,jz1);
1506 dx22 = _mm_sub_ps(ix2,jx2);
1507 dy22 = _mm_sub_ps(iy2,jy2);
1508 dz22 = _mm_sub_ps(iz2,jz2);
1509 dx23 = _mm_sub_ps(ix2,jx3);
1510 dy23 = _mm_sub_ps(iy2,jy3);
1511 dz23 = _mm_sub_ps(iz2,jz3);
1512 dx31 = _mm_sub_ps(ix3,jx1);
1513 dy31 = _mm_sub_ps(iy3,jy1);
1514 dz31 = _mm_sub_ps(iz3,jz1);
1515 dx32 = _mm_sub_ps(ix3,jx2);
1516 dy32 = _mm_sub_ps(iy3,jy2);
1517 dz32 = _mm_sub_ps(iz3,jz2);
1518 dx33 = _mm_sub_ps(ix3,jx3);
1519 dy33 = _mm_sub_ps(iy3,jy3);
1520 dz33 = _mm_sub_ps(iz3,jz3);
1522 /* Calculate squared distance and things based on it */
1523 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1524 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1525 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1526 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1527 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1528 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1529 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1530 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1531 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1532 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1534 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1535 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1536 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1537 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1538 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1539 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1540 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1541 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1542 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1543 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1545 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1546 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1547 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1548 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1549 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1550 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1551 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1552 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1553 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1554 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1556 fjx0 = _mm_setzero_ps();
1557 fjy0 = _mm_setzero_ps();
1558 fjz0 = _mm_setzero_ps();
1559 fjx1 = _mm_setzero_ps();
1560 fjy1 = _mm_setzero_ps();
1561 fjz1 = _mm_setzero_ps();
1562 fjx2 = _mm_setzero_ps();
1563 fjy2 = _mm_setzero_ps();
1564 fjz2 = _mm_setzero_ps();
1565 fjx3 = _mm_setzero_ps();
1566 fjy3 = _mm_setzero_ps();
1567 fjz3 = _mm_setzero_ps();
1569 /**************************
1570 * CALCULATE INTERACTIONS *
1571 **************************/
1573 if (gmx_mm_any_lt(rsq00,rcutoff2))
1576 r00 = _mm_mul_ps(rsq00,rinv00);
1578 /* LENNARD-JONES DISPERSION/REPULSION */
1580 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1581 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1582 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1583 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1584 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1586 d = _mm_sub_ps(r00,rswitch);
1587 d = _mm_max_ps(d,_mm_setzero_ps());
1588 d2 = _mm_mul_ps(d,d);
1589 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1591 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1593 /* Evaluate switch function */
1594 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1595 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1596 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1600 fscal = _mm_and_ps(fscal,cutoff_mask);
1602 /* Calculate temporary vectorial force */
1603 tx = _mm_mul_ps(fscal,dx00);
1604 ty = _mm_mul_ps(fscal,dy00);
1605 tz = _mm_mul_ps(fscal,dz00);
1607 /* Update vectorial force */
1608 fix0 = _mm_add_ps(fix0,tx);
1609 fiy0 = _mm_add_ps(fiy0,ty);
1610 fiz0 = _mm_add_ps(fiz0,tz);
1612 fjx0 = _mm_add_ps(fjx0,tx);
1613 fjy0 = _mm_add_ps(fjy0,ty);
1614 fjz0 = _mm_add_ps(fjz0,tz);
1618 /**************************
1619 * CALCULATE INTERACTIONS *
1620 **************************/
1622 if (gmx_mm_any_lt(rsq11,rcutoff2))
1625 /* REACTION-FIELD ELECTROSTATICS */
1626 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1628 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1632 fscal = _mm_and_ps(fscal,cutoff_mask);
1634 /* Calculate temporary vectorial force */
1635 tx = _mm_mul_ps(fscal,dx11);
1636 ty = _mm_mul_ps(fscal,dy11);
1637 tz = _mm_mul_ps(fscal,dz11);
1639 /* Update vectorial force */
1640 fix1 = _mm_add_ps(fix1,tx);
1641 fiy1 = _mm_add_ps(fiy1,ty);
1642 fiz1 = _mm_add_ps(fiz1,tz);
1644 fjx1 = _mm_add_ps(fjx1,tx);
1645 fjy1 = _mm_add_ps(fjy1,ty);
1646 fjz1 = _mm_add_ps(fjz1,tz);
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 if (gmx_mm_any_lt(rsq12,rcutoff2))
1657 /* REACTION-FIELD ELECTROSTATICS */
1658 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1660 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1664 fscal = _mm_and_ps(fscal,cutoff_mask);
1666 /* Calculate temporary vectorial force */
1667 tx = _mm_mul_ps(fscal,dx12);
1668 ty = _mm_mul_ps(fscal,dy12);
1669 tz = _mm_mul_ps(fscal,dz12);
1671 /* Update vectorial force */
1672 fix1 = _mm_add_ps(fix1,tx);
1673 fiy1 = _mm_add_ps(fiy1,ty);
1674 fiz1 = _mm_add_ps(fiz1,tz);
1676 fjx2 = _mm_add_ps(fjx2,tx);
1677 fjy2 = _mm_add_ps(fjy2,ty);
1678 fjz2 = _mm_add_ps(fjz2,tz);
1682 /**************************
1683 * CALCULATE INTERACTIONS *
1684 **************************/
1686 if (gmx_mm_any_lt(rsq13,rcutoff2))
1689 /* REACTION-FIELD ELECTROSTATICS */
1690 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1692 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1696 fscal = _mm_and_ps(fscal,cutoff_mask);
1698 /* Calculate temporary vectorial force */
1699 tx = _mm_mul_ps(fscal,dx13);
1700 ty = _mm_mul_ps(fscal,dy13);
1701 tz = _mm_mul_ps(fscal,dz13);
1703 /* Update vectorial force */
1704 fix1 = _mm_add_ps(fix1,tx);
1705 fiy1 = _mm_add_ps(fiy1,ty);
1706 fiz1 = _mm_add_ps(fiz1,tz);
1708 fjx3 = _mm_add_ps(fjx3,tx);
1709 fjy3 = _mm_add_ps(fjy3,ty);
1710 fjz3 = _mm_add_ps(fjz3,tz);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 if (gmx_mm_any_lt(rsq21,rcutoff2))
1721 /* REACTION-FIELD ELECTROSTATICS */
1722 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1724 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1728 fscal = _mm_and_ps(fscal,cutoff_mask);
1730 /* Calculate temporary vectorial force */
1731 tx = _mm_mul_ps(fscal,dx21);
1732 ty = _mm_mul_ps(fscal,dy21);
1733 tz = _mm_mul_ps(fscal,dz21);
1735 /* Update vectorial force */
1736 fix2 = _mm_add_ps(fix2,tx);
1737 fiy2 = _mm_add_ps(fiy2,ty);
1738 fiz2 = _mm_add_ps(fiz2,tz);
1740 fjx1 = _mm_add_ps(fjx1,tx);
1741 fjy1 = _mm_add_ps(fjy1,ty);
1742 fjz1 = _mm_add_ps(fjz1,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 if (gmx_mm_any_lt(rsq22,rcutoff2))
1753 /* REACTION-FIELD ELECTROSTATICS */
1754 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1756 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1760 fscal = _mm_and_ps(fscal,cutoff_mask);
1762 /* Calculate temporary vectorial force */
1763 tx = _mm_mul_ps(fscal,dx22);
1764 ty = _mm_mul_ps(fscal,dy22);
1765 tz = _mm_mul_ps(fscal,dz22);
1767 /* Update vectorial force */
1768 fix2 = _mm_add_ps(fix2,tx);
1769 fiy2 = _mm_add_ps(fiy2,ty);
1770 fiz2 = _mm_add_ps(fiz2,tz);
1772 fjx2 = _mm_add_ps(fjx2,tx);
1773 fjy2 = _mm_add_ps(fjy2,ty);
1774 fjz2 = _mm_add_ps(fjz2,tz);
1778 /**************************
1779 * CALCULATE INTERACTIONS *
1780 **************************/
1782 if (gmx_mm_any_lt(rsq23,rcutoff2))
1785 /* REACTION-FIELD ELECTROSTATICS */
1786 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1788 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1792 fscal = _mm_and_ps(fscal,cutoff_mask);
1794 /* Calculate temporary vectorial force */
1795 tx = _mm_mul_ps(fscal,dx23);
1796 ty = _mm_mul_ps(fscal,dy23);
1797 tz = _mm_mul_ps(fscal,dz23);
1799 /* Update vectorial force */
1800 fix2 = _mm_add_ps(fix2,tx);
1801 fiy2 = _mm_add_ps(fiy2,ty);
1802 fiz2 = _mm_add_ps(fiz2,tz);
1804 fjx3 = _mm_add_ps(fjx3,tx);
1805 fjy3 = _mm_add_ps(fjy3,ty);
1806 fjz3 = _mm_add_ps(fjz3,tz);
1810 /**************************
1811 * CALCULATE INTERACTIONS *
1812 **************************/
1814 if (gmx_mm_any_lt(rsq31,rcutoff2))
1817 /* REACTION-FIELD ELECTROSTATICS */
1818 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1820 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1824 fscal = _mm_and_ps(fscal,cutoff_mask);
1826 /* Calculate temporary vectorial force */
1827 tx = _mm_mul_ps(fscal,dx31);
1828 ty = _mm_mul_ps(fscal,dy31);
1829 tz = _mm_mul_ps(fscal,dz31);
1831 /* Update vectorial force */
1832 fix3 = _mm_add_ps(fix3,tx);
1833 fiy3 = _mm_add_ps(fiy3,ty);
1834 fiz3 = _mm_add_ps(fiz3,tz);
1836 fjx1 = _mm_add_ps(fjx1,tx);
1837 fjy1 = _mm_add_ps(fjy1,ty);
1838 fjz1 = _mm_add_ps(fjz1,tz);
1842 /**************************
1843 * CALCULATE INTERACTIONS *
1844 **************************/
1846 if (gmx_mm_any_lt(rsq32,rcutoff2))
1849 /* REACTION-FIELD ELECTROSTATICS */
1850 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1852 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1856 fscal = _mm_and_ps(fscal,cutoff_mask);
1858 /* Calculate temporary vectorial force */
1859 tx = _mm_mul_ps(fscal,dx32);
1860 ty = _mm_mul_ps(fscal,dy32);
1861 tz = _mm_mul_ps(fscal,dz32);
1863 /* Update vectorial force */
1864 fix3 = _mm_add_ps(fix3,tx);
1865 fiy3 = _mm_add_ps(fiy3,ty);
1866 fiz3 = _mm_add_ps(fiz3,tz);
1868 fjx2 = _mm_add_ps(fjx2,tx);
1869 fjy2 = _mm_add_ps(fjy2,ty);
1870 fjz2 = _mm_add_ps(fjz2,tz);
1874 /**************************
1875 * CALCULATE INTERACTIONS *
1876 **************************/
1878 if (gmx_mm_any_lt(rsq33,rcutoff2))
1881 /* REACTION-FIELD ELECTROSTATICS */
1882 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1884 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1888 fscal = _mm_and_ps(fscal,cutoff_mask);
1890 /* Calculate temporary vectorial force */
1891 tx = _mm_mul_ps(fscal,dx33);
1892 ty = _mm_mul_ps(fscal,dy33);
1893 tz = _mm_mul_ps(fscal,dz33);
1895 /* Update vectorial force */
1896 fix3 = _mm_add_ps(fix3,tx);
1897 fiy3 = _mm_add_ps(fiy3,ty);
1898 fiz3 = _mm_add_ps(fiz3,tz);
1900 fjx3 = _mm_add_ps(fjx3,tx);
1901 fjy3 = _mm_add_ps(fjy3,ty);
1902 fjz3 = _mm_add_ps(fjz3,tz);
1906 fjptrA = f+j_coord_offsetA;
1907 fjptrB = f+j_coord_offsetB;
1908 fjptrC = f+j_coord_offsetC;
1909 fjptrD = f+j_coord_offsetD;
1911 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1912 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1913 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1915 /* Inner loop uses 329 flops */
1918 if(jidx<j_index_end)
1921 /* Get j neighbor index, and coordinate index */
1922 jnrlistA = jjnr[jidx];
1923 jnrlistB = jjnr[jidx+1];
1924 jnrlistC = jjnr[jidx+2];
1925 jnrlistD = jjnr[jidx+3];
1926 /* Sign of each element will be negative for non-real atoms.
1927 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1928 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1930 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1931 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1932 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1933 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1934 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1935 j_coord_offsetA = DIM*jnrA;
1936 j_coord_offsetB = DIM*jnrB;
1937 j_coord_offsetC = DIM*jnrC;
1938 j_coord_offsetD = DIM*jnrD;
1940 /* load j atom coordinates */
1941 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1942 x+j_coord_offsetC,x+j_coord_offsetD,
1943 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1944 &jy2,&jz2,&jx3,&jy3,&jz3);
1946 /* Calculate displacement vector */
1947 dx00 = _mm_sub_ps(ix0,jx0);
1948 dy00 = _mm_sub_ps(iy0,jy0);
1949 dz00 = _mm_sub_ps(iz0,jz0);
1950 dx11 = _mm_sub_ps(ix1,jx1);
1951 dy11 = _mm_sub_ps(iy1,jy1);
1952 dz11 = _mm_sub_ps(iz1,jz1);
1953 dx12 = _mm_sub_ps(ix1,jx2);
1954 dy12 = _mm_sub_ps(iy1,jy2);
1955 dz12 = _mm_sub_ps(iz1,jz2);
1956 dx13 = _mm_sub_ps(ix1,jx3);
1957 dy13 = _mm_sub_ps(iy1,jy3);
1958 dz13 = _mm_sub_ps(iz1,jz3);
1959 dx21 = _mm_sub_ps(ix2,jx1);
1960 dy21 = _mm_sub_ps(iy2,jy1);
1961 dz21 = _mm_sub_ps(iz2,jz1);
1962 dx22 = _mm_sub_ps(ix2,jx2);
1963 dy22 = _mm_sub_ps(iy2,jy2);
1964 dz22 = _mm_sub_ps(iz2,jz2);
1965 dx23 = _mm_sub_ps(ix2,jx3);
1966 dy23 = _mm_sub_ps(iy2,jy3);
1967 dz23 = _mm_sub_ps(iz2,jz3);
1968 dx31 = _mm_sub_ps(ix3,jx1);
1969 dy31 = _mm_sub_ps(iy3,jy1);
1970 dz31 = _mm_sub_ps(iz3,jz1);
1971 dx32 = _mm_sub_ps(ix3,jx2);
1972 dy32 = _mm_sub_ps(iy3,jy2);
1973 dz32 = _mm_sub_ps(iz3,jz2);
1974 dx33 = _mm_sub_ps(ix3,jx3);
1975 dy33 = _mm_sub_ps(iy3,jy3);
1976 dz33 = _mm_sub_ps(iz3,jz3);
1978 /* Calculate squared distance and things based on it */
1979 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1980 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1981 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1982 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1983 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1984 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1985 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1986 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1987 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1988 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1990 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1991 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1992 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1993 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1994 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1995 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1996 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1997 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1998 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1999 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2001 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2002 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2003 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2004 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2005 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2006 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2007 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2008 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2009 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2010 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2012 fjx0 = _mm_setzero_ps();
2013 fjy0 = _mm_setzero_ps();
2014 fjz0 = _mm_setzero_ps();
2015 fjx1 = _mm_setzero_ps();
2016 fjy1 = _mm_setzero_ps();
2017 fjz1 = _mm_setzero_ps();
2018 fjx2 = _mm_setzero_ps();
2019 fjy2 = _mm_setzero_ps();
2020 fjz2 = _mm_setzero_ps();
2021 fjx3 = _mm_setzero_ps();
2022 fjy3 = _mm_setzero_ps();
2023 fjz3 = _mm_setzero_ps();
2025 /**************************
2026 * CALCULATE INTERACTIONS *
2027 **************************/
2029 if (gmx_mm_any_lt(rsq00,rcutoff2))
2032 r00 = _mm_mul_ps(rsq00,rinv00);
2033 r00 = _mm_andnot_ps(dummy_mask,r00);
2035 /* LENNARD-JONES DISPERSION/REPULSION */
2037 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2038 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2039 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2040 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2041 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2043 d = _mm_sub_ps(r00,rswitch);
2044 d = _mm_max_ps(d,_mm_setzero_ps());
2045 d2 = _mm_mul_ps(d,d);
2046 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2048 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2050 /* Evaluate switch function */
2051 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2052 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2053 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2057 fscal = _mm_and_ps(fscal,cutoff_mask);
2059 fscal = _mm_andnot_ps(dummy_mask,fscal);
2061 /* Calculate temporary vectorial force */
2062 tx = _mm_mul_ps(fscal,dx00);
2063 ty = _mm_mul_ps(fscal,dy00);
2064 tz = _mm_mul_ps(fscal,dz00);
2066 /* Update vectorial force */
2067 fix0 = _mm_add_ps(fix0,tx);
2068 fiy0 = _mm_add_ps(fiy0,ty);
2069 fiz0 = _mm_add_ps(fiz0,tz);
2071 fjx0 = _mm_add_ps(fjx0,tx);
2072 fjy0 = _mm_add_ps(fjy0,ty);
2073 fjz0 = _mm_add_ps(fjz0,tz);
2077 /**************************
2078 * CALCULATE INTERACTIONS *
2079 **************************/
2081 if (gmx_mm_any_lt(rsq11,rcutoff2))
2084 /* REACTION-FIELD ELECTROSTATICS */
2085 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2087 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2091 fscal = _mm_and_ps(fscal,cutoff_mask);
2093 fscal = _mm_andnot_ps(dummy_mask,fscal);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm_mul_ps(fscal,dx11);
2097 ty = _mm_mul_ps(fscal,dy11);
2098 tz = _mm_mul_ps(fscal,dz11);
2100 /* Update vectorial force */
2101 fix1 = _mm_add_ps(fix1,tx);
2102 fiy1 = _mm_add_ps(fiy1,ty);
2103 fiz1 = _mm_add_ps(fiz1,tz);
2105 fjx1 = _mm_add_ps(fjx1,tx);
2106 fjy1 = _mm_add_ps(fjy1,ty);
2107 fjz1 = _mm_add_ps(fjz1,tz);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 if (gmx_mm_any_lt(rsq12,rcutoff2))
2118 /* REACTION-FIELD ELECTROSTATICS */
2119 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2121 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2125 fscal = _mm_and_ps(fscal,cutoff_mask);
2127 fscal = _mm_andnot_ps(dummy_mask,fscal);
2129 /* Calculate temporary vectorial force */
2130 tx = _mm_mul_ps(fscal,dx12);
2131 ty = _mm_mul_ps(fscal,dy12);
2132 tz = _mm_mul_ps(fscal,dz12);
2134 /* Update vectorial force */
2135 fix1 = _mm_add_ps(fix1,tx);
2136 fiy1 = _mm_add_ps(fiy1,ty);
2137 fiz1 = _mm_add_ps(fiz1,tz);
2139 fjx2 = _mm_add_ps(fjx2,tx);
2140 fjy2 = _mm_add_ps(fjy2,ty);
2141 fjz2 = _mm_add_ps(fjz2,tz);
2145 /**************************
2146 * CALCULATE INTERACTIONS *
2147 **************************/
2149 if (gmx_mm_any_lt(rsq13,rcutoff2))
2152 /* REACTION-FIELD ELECTROSTATICS */
2153 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2155 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2159 fscal = _mm_and_ps(fscal,cutoff_mask);
2161 fscal = _mm_andnot_ps(dummy_mask,fscal);
2163 /* Calculate temporary vectorial force */
2164 tx = _mm_mul_ps(fscal,dx13);
2165 ty = _mm_mul_ps(fscal,dy13);
2166 tz = _mm_mul_ps(fscal,dz13);
2168 /* Update vectorial force */
2169 fix1 = _mm_add_ps(fix1,tx);
2170 fiy1 = _mm_add_ps(fiy1,ty);
2171 fiz1 = _mm_add_ps(fiz1,tz);
2173 fjx3 = _mm_add_ps(fjx3,tx);
2174 fjy3 = _mm_add_ps(fjy3,ty);
2175 fjz3 = _mm_add_ps(fjz3,tz);
2179 /**************************
2180 * CALCULATE INTERACTIONS *
2181 **************************/
2183 if (gmx_mm_any_lt(rsq21,rcutoff2))
2186 /* REACTION-FIELD ELECTROSTATICS */
2187 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2189 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2193 fscal = _mm_and_ps(fscal,cutoff_mask);
2195 fscal = _mm_andnot_ps(dummy_mask,fscal);
2197 /* Calculate temporary vectorial force */
2198 tx = _mm_mul_ps(fscal,dx21);
2199 ty = _mm_mul_ps(fscal,dy21);
2200 tz = _mm_mul_ps(fscal,dz21);
2202 /* Update vectorial force */
2203 fix2 = _mm_add_ps(fix2,tx);
2204 fiy2 = _mm_add_ps(fiy2,ty);
2205 fiz2 = _mm_add_ps(fiz2,tz);
2207 fjx1 = _mm_add_ps(fjx1,tx);
2208 fjy1 = _mm_add_ps(fjy1,ty);
2209 fjz1 = _mm_add_ps(fjz1,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 if (gmx_mm_any_lt(rsq22,rcutoff2))
2220 /* REACTION-FIELD ELECTROSTATICS */
2221 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2223 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2227 fscal = _mm_and_ps(fscal,cutoff_mask);
2229 fscal = _mm_andnot_ps(dummy_mask,fscal);
2231 /* Calculate temporary vectorial force */
2232 tx = _mm_mul_ps(fscal,dx22);
2233 ty = _mm_mul_ps(fscal,dy22);
2234 tz = _mm_mul_ps(fscal,dz22);
2236 /* Update vectorial force */
2237 fix2 = _mm_add_ps(fix2,tx);
2238 fiy2 = _mm_add_ps(fiy2,ty);
2239 fiz2 = _mm_add_ps(fiz2,tz);
2241 fjx2 = _mm_add_ps(fjx2,tx);
2242 fjy2 = _mm_add_ps(fjy2,ty);
2243 fjz2 = _mm_add_ps(fjz2,tz);
2247 /**************************
2248 * CALCULATE INTERACTIONS *
2249 **************************/
2251 if (gmx_mm_any_lt(rsq23,rcutoff2))
2254 /* REACTION-FIELD ELECTROSTATICS */
2255 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2257 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2261 fscal = _mm_and_ps(fscal,cutoff_mask);
2263 fscal = _mm_andnot_ps(dummy_mask,fscal);
2265 /* Calculate temporary vectorial force */
2266 tx = _mm_mul_ps(fscal,dx23);
2267 ty = _mm_mul_ps(fscal,dy23);
2268 tz = _mm_mul_ps(fscal,dz23);
2270 /* Update vectorial force */
2271 fix2 = _mm_add_ps(fix2,tx);
2272 fiy2 = _mm_add_ps(fiy2,ty);
2273 fiz2 = _mm_add_ps(fiz2,tz);
2275 fjx3 = _mm_add_ps(fjx3,tx);
2276 fjy3 = _mm_add_ps(fjy3,ty);
2277 fjz3 = _mm_add_ps(fjz3,tz);
2281 /**************************
2282 * CALCULATE INTERACTIONS *
2283 **************************/
2285 if (gmx_mm_any_lt(rsq31,rcutoff2))
2288 /* REACTION-FIELD ELECTROSTATICS */
2289 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2291 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2295 fscal = _mm_and_ps(fscal,cutoff_mask);
2297 fscal = _mm_andnot_ps(dummy_mask,fscal);
2299 /* Calculate temporary vectorial force */
2300 tx = _mm_mul_ps(fscal,dx31);
2301 ty = _mm_mul_ps(fscal,dy31);
2302 tz = _mm_mul_ps(fscal,dz31);
2304 /* Update vectorial force */
2305 fix3 = _mm_add_ps(fix3,tx);
2306 fiy3 = _mm_add_ps(fiy3,ty);
2307 fiz3 = _mm_add_ps(fiz3,tz);
2309 fjx1 = _mm_add_ps(fjx1,tx);
2310 fjy1 = _mm_add_ps(fjy1,ty);
2311 fjz1 = _mm_add_ps(fjz1,tz);
2315 /**************************
2316 * CALCULATE INTERACTIONS *
2317 **************************/
2319 if (gmx_mm_any_lt(rsq32,rcutoff2))
2322 /* REACTION-FIELD ELECTROSTATICS */
2323 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2325 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2329 fscal = _mm_and_ps(fscal,cutoff_mask);
2331 fscal = _mm_andnot_ps(dummy_mask,fscal);
2333 /* Calculate temporary vectorial force */
2334 tx = _mm_mul_ps(fscal,dx32);
2335 ty = _mm_mul_ps(fscal,dy32);
2336 tz = _mm_mul_ps(fscal,dz32);
2338 /* Update vectorial force */
2339 fix3 = _mm_add_ps(fix3,tx);
2340 fiy3 = _mm_add_ps(fiy3,ty);
2341 fiz3 = _mm_add_ps(fiz3,tz);
2343 fjx2 = _mm_add_ps(fjx2,tx);
2344 fjy2 = _mm_add_ps(fjy2,ty);
2345 fjz2 = _mm_add_ps(fjz2,tz);
2349 /**************************
2350 * CALCULATE INTERACTIONS *
2351 **************************/
2353 if (gmx_mm_any_lt(rsq33,rcutoff2))
2356 /* REACTION-FIELD ELECTROSTATICS */
2357 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2359 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2363 fscal = _mm_and_ps(fscal,cutoff_mask);
2365 fscal = _mm_andnot_ps(dummy_mask,fscal);
2367 /* Calculate temporary vectorial force */
2368 tx = _mm_mul_ps(fscal,dx33);
2369 ty = _mm_mul_ps(fscal,dy33);
2370 tz = _mm_mul_ps(fscal,dz33);
2372 /* Update vectorial force */
2373 fix3 = _mm_add_ps(fix3,tx);
2374 fiy3 = _mm_add_ps(fiy3,ty);
2375 fiz3 = _mm_add_ps(fiz3,tz);
2377 fjx3 = _mm_add_ps(fjx3,tx);
2378 fjy3 = _mm_add_ps(fjy3,ty);
2379 fjz3 = _mm_add_ps(fjz3,tz);
2383 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2384 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2385 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2386 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2388 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2389 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2390 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2392 /* Inner loop uses 330 flops */
2395 /* End of innermost loop */
2397 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2398 f+i_coord_offset,fshift+i_shift_offset);
2400 /* Increment number of inner iterations */
2401 inneriter += j_index_end - j_index_start;
2403 /* Outer loop uses 24 flops */
2406 /* Increment number of outer iterations */
2409 /* Update outer/inner flops */
2411 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);