2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
112 __m128i ifour = _mm_set1_epi32(4);
113 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115 __m128 dummy_mask,cutoff_mask;
116 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one = _mm_set1_ps(1.0);
118 __m128 two = _mm_set1_ps(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_ps(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
132 krf = _mm_set1_ps(fr->ic->k_rf);
133 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
134 crf = _mm_set1_ps(fr->ic->c_rf);
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 vftab = kernel_data->table_vdw->data;
140 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
145 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149 jq0 = _mm_set1_ps(charge[inr+0]);
150 jq1 = _mm_set1_ps(charge[inr+1]);
151 jq2 = _mm_set1_ps(charge[inr+2]);
152 vdwjidx0A = 2*vdwtype[inr+0];
153 qq00 = _mm_mul_ps(iq0,jq0);
154 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156 qq01 = _mm_mul_ps(iq0,jq1);
157 qq02 = _mm_mul_ps(iq0,jq2);
158 qq10 = _mm_mul_ps(iq1,jq0);
159 qq11 = _mm_mul_ps(iq1,jq1);
160 qq12 = _mm_mul_ps(iq1,jq2);
161 qq20 = _mm_mul_ps(iq2,jq0);
162 qq21 = _mm_mul_ps(iq2,jq1);
163 qq22 = _mm_mul_ps(iq2,jq2);
165 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166 rcutoff_scalar = fr->ic->rcoulomb;
167 rcutoff = _mm_set1_ps(rcutoff_scalar);
168 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
170 /* Avoid stupid compiler warnings */
171 jnrA = jnrB = jnrC = jnrD = 0;
180 for(iidx=0;iidx<4*DIM;iidx++)
185 /* Start outer loop over neighborlists */
186 for(iidx=0; iidx<nri; iidx++)
188 /* Load shift vector for this list */
189 i_shift_offset = DIM*shiftidx[iidx];
191 /* Load limits for loop over neighbors */
192 j_index_start = jindex[iidx];
193 j_index_end = jindex[iidx+1];
195 /* Get outer coordinate index */
197 i_coord_offset = DIM*inr;
199 /* Load i particle coords and add shift vector */
200 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
201 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
203 fix0 = _mm_setzero_ps();
204 fiy0 = _mm_setzero_ps();
205 fiz0 = _mm_setzero_ps();
206 fix1 = _mm_setzero_ps();
207 fiy1 = _mm_setzero_ps();
208 fiz1 = _mm_setzero_ps();
209 fix2 = _mm_setzero_ps();
210 fiy2 = _mm_setzero_ps();
211 fiz2 = _mm_setzero_ps();
213 /* Reset potential sums */
214 velecsum = _mm_setzero_ps();
215 vvdwsum = _mm_setzero_ps();
217 /* Start inner kernel loop */
218 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
221 /* Get j neighbor index, and coordinate index */
226 j_coord_offsetA = DIM*jnrA;
227 j_coord_offsetB = DIM*jnrB;
228 j_coord_offsetC = DIM*jnrC;
229 j_coord_offsetD = DIM*jnrD;
231 /* load j atom coordinates */
232 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
233 x+j_coord_offsetC,x+j_coord_offsetD,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
236 /* Calculate displacement vector */
237 dx00 = _mm_sub_ps(ix0,jx0);
238 dy00 = _mm_sub_ps(iy0,jy0);
239 dz00 = _mm_sub_ps(iz0,jz0);
240 dx01 = _mm_sub_ps(ix0,jx1);
241 dy01 = _mm_sub_ps(iy0,jy1);
242 dz01 = _mm_sub_ps(iz0,jz1);
243 dx02 = _mm_sub_ps(ix0,jx2);
244 dy02 = _mm_sub_ps(iy0,jy2);
245 dz02 = _mm_sub_ps(iz0,jz2);
246 dx10 = _mm_sub_ps(ix1,jx0);
247 dy10 = _mm_sub_ps(iy1,jy0);
248 dz10 = _mm_sub_ps(iz1,jz0);
249 dx11 = _mm_sub_ps(ix1,jx1);
250 dy11 = _mm_sub_ps(iy1,jy1);
251 dz11 = _mm_sub_ps(iz1,jz1);
252 dx12 = _mm_sub_ps(ix1,jx2);
253 dy12 = _mm_sub_ps(iy1,jy2);
254 dz12 = _mm_sub_ps(iz1,jz2);
255 dx20 = _mm_sub_ps(ix2,jx0);
256 dy20 = _mm_sub_ps(iy2,jy0);
257 dz20 = _mm_sub_ps(iz2,jz0);
258 dx21 = _mm_sub_ps(ix2,jx1);
259 dy21 = _mm_sub_ps(iy2,jy1);
260 dz21 = _mm_sub_ps(iz2,jz1);
261 dx22 = _mm_sub_ps(ix2,jx2);
262 dy22 = _mm_sub_ps(iy2,jy2);
263 dz22 = _mm_sub_ps(iz2,jz2);
265 /* Calculate squared distance and things based on it */
266 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
267 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
268 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
269 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
270 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
271 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
272 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
273 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
274 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
276 rinv00 = sse2_invsqrt_f(rsq00);
277 rinv01 = sse2_invsqrt_f(rsq01);
278 rinv02 = sse2_invsqrt_f(rsq02);
279 rinv10 = sse2_invsqrt_f(rsq10);
280 rinv11 = sse2_invsqrt_f(rsq11);
281 rinv12 = sse2_invsqrt_f(rsq12);
282 rinv20 = sse2_invsqrt_f(rsq20);
283 rinv21 = sse2_invsqrt_f(rsq21);
284 rinv22 = sse2_invsqrt_f(rsq22);
286 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
287 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
288 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
289 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
290 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
291 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
292 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
293 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
294 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
296 fjx0 = _mm_setzero_ps();
297 fjy0 = _mm_setzero_ps();
298 fjz0 = _mm_setzero_ps();
299 fjx1 = _mm_setzero_ps();
300 fjy1 = _mm_setzero_ps();
301 fjz1 = _mm_setzero_ps();
302 fjx2 = _mm_setzero_ps();
303 fjy2 = _mm_setzero_ps();
304 fjz2 = _mm_setzero_ps();
306 /**************************
307 * CALCULATE INTERACTIONS *
308 **************************/
310 if (gmx_mm_any_lt(rsq00,rcutoff2))
313 r00 = _mm_mul_ps(rsq00,rinv00);
315 /* Calculate table index by multiplying r with table scale and truncate to integer */
316 rt = _mm_mul_ps(r00,vftabscale);
317 vfitab = _mm_cvttps_epi32(rt);
318 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
319 vfitab = _mm_slli_epi32(vfitab,3);
321 /* REACTION-FIELD ELECTROSTATICS */
322 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
323 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
325 /* CUBIC SPLINE TABLE DISPERSION */
326 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
327 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
328 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
329 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
330 _MM_TRANSPOSE4_PS(Y,F,G,H);
331 Heps = _mm_mul_ps(vfeps,H);
332 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
333 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
334 vvdw6 = _mm_mul_ps(c6_00,VV);
335 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
336 fvdw6 = _mm_mul_ps(c6_00,FF);
338 /* CUBIC SPLINE TABLE REPULSION */
339 vfitab = _mm_add_epi32(vfitab,ifour);
340 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
341 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
342 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
343 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
344 _MM_TRANSPOSE4_PS(Y,F,G,H);
345 Heps = _mm_mul_ps(vfeps,H);
346 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
347 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
348 vvdw12 = _mm_mul_ps(c12_00,VV);
349 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
350 fvdw12 = _mm_mul_ps(c12_00,FF);
351 vvdw = _mm_add_ps(vvdw12,vvdw6);
352 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
354 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 velec = _mm_and_ps(velec,cutoff_mask);
358 velecsum = _mm_add_ps(velecsum,velec);
359 vvdw = _mm_and_ps(vvdw,cutoff_mask);
360 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
362 fscal = _mm_add_ps(felec,fvdw);
364 fscal = _mm_and_ps(fscal,cutoff_mask);
366 /* Calculate temporary vectorial force */
367 tx = _mm_mul_ps(fscal,dx00);
368 ty = _mm_mul_ps(fscal,dy00);
369 tz = _mm_mul_ps(fscal,dz00);
371 /* Update vectorial force */
372 fix0 = _mm_add_ps(fix0,tx);
373 fiy0 = _mm_add_ps(fiy0,ty);
374 fiz0 = _mm_add_ps(fiz0,tz);
376 fjx0 = _mm_add_ps(fjx0,tx);
377 fjy0 = _mm_add_ps(fjy0,ty);
378 fjz0 = _mm_add_ps(fjz0,tz);
382 /**************************
383 * CALCULATE INTERACTIONS *
384 **************************/
386 if (gmx_mm_any_lt(rsq01,rcutoff2))
389 /* REACTION-FIELD ELECTROSTATICS */
390 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
391 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
393 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velec = _mm_and_ps(velec,cutoff_mask);
397 velecsum = _mm_add_ps(velecsum,velec);
401 fscal = _mm_and_ps(fscal,cutoff_mask);
403 /* Calculate temporary vectorial force */
404 tx = _mm_mul_ps(fscal,dx01);
405 ty = _mm_mul_ps(fscal,dy01);
406 tz = _mm_mul_ps(fscal,dz01);
408 /* Update vectorial force */
409 fix0 = _mm_add_ps(fix0,tx);
410 fiy0 = _mm_add_ps(fiy0,ty);
411 fiz0 = _mm_add_ps(fiz0,tz);
413 fjx1 = _mm_add_ps(fjx1,tx);
414 fjy1 = _mm_add_ps(fjy1,ty);
415 fjz1 = _mm_add_ps(fjz1,tz);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 if (gmx_mm_any_lt(rsq02,rcutoff2))
426 /* REACTION-FIELD ELECTROSTATICS */
427 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
428 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
430 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm_and_ps(velec,cutoff_mask);
434 velecsum = _mm_add_ps(velecsum,velec);
438 fscal = _mm_and_ps(fscal,cutoff_mask);
440 /* Calculate temporary vectorial force */
441 tx = _mm_mul_ps(fscal,dx02);
442 ty = _mm_mul_ps(fscal,dy02);
443 tz = _mm_mul_ps(fscal,dz02);
445 /* Update vectorial force */
446 fix0 = _mm_add_ps(fix0,tx);
447 fiy0 = _mm_add_ps(fiy0,ty);
448 fiz0 = _mm_add_ps(fiz0,tz);
450 fjx2 = _mm_add_ps(fjx2,tx);
451 fjy2 = _mm_add_ps(fjy2,ty);
452 fjz2 = _mm_add_ps(fjz2,tz);
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
460 if (gmx_mm_any_lt(rsq10,rcutoff2))
463 /* REACTION-FIELD ELECTROSTATICS */
464 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
465 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
467 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velec = _mm_and_ps(velec,cutoff_mask);
471 velecsum = _mm_add_ps(velecsum,velec);
475 fscal = _mm_and_ps(fscal,cutoff_mask);
477 /* Calculate temporary vectorial force */
478 tx = _mm_mul_ps(fscal,dx10);
479 ty = _mm_mul_ps(fscal,dy10);
480 tz = _mm_mul_ps(fscal,dz10);
482 /* Update vectorial force */
483 fix1 = _mm_add_ps(fix1,tx);
484 fiy1 = _mm_add_ps(fiy1,ty);
485 fiz1 = _mm_add_ps(fiz1,tz);
487 fjx0 = _mm_add_ps(fjx0,tx);
488 fjy0 = _mm_add_ps(fjy0,ty);
489 fjz0 = _mm_add_ps(fjz0,tz);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 if (gmx_mm_any_lt(rsq11,rcutoff2))
500 /* REACTION-FIELD ELECTROSTATICS */
501 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
502 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
504 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
506 /* Update potential sum for this i atom from the interaction with this j atom. */
507 velec = _mm_and_ps(velec,cutoff_mask);
508 velecsum = _mm_add_ps(velecsum,velec);
512 fscal = _mm_and_ps(fscal,cutoff_mask);
514 /* Calculate temporary vectorial force */
515 tx = _mm_mul_ps(fscal,dx11);
516 ty = _mm_mul_ps(fscal,dy11);
517 tz = _mm_mul_ps(fscal,dz11);
519 /* Update vectorial force */
520 fix1 = _mm_add_ps(fix1,tx);
521 fiy1 = _mm_add_ps(fiy1,ty);
522 fiz1 = _mm_add_ps(fiz1,tz);
524 fjx1 = _mm_add_ps(fjx1,tx);
525 fjy1 = _mm_add_ps(fjy1,ty);
526 fjz1 = _mm_add_ps(fjz1,tz);
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
534 if (gmx_mm_any_lt(rsq12,rcutoff2))
537 /* REACTION-FIELD ELECTROSTATICS */
538 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
539 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
541 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
543 /* Update potential sum for this i atom from the interaction with this j atom. */
544 velec = _mm_and_ps(velec,cutoff_mask);
545 velecsum = _mm_add_ps(velecsum,velec);
549 fscal = _mm_and_ps(fscal,cutoff_mask);
551 /* Calculate temporary vectorial force */
552 tx = _mm_mul_ps(fscal,dx12);
553 ty = _mm_mul_ps(fscal,dy12);
554 tz = _mm_mul_ps(fscal,dz12);
556 /* Update vectorial force */
557 fix1 = _mm_add_ps(fix1,tx);
558 fiy1 = _mm_add_ps(fiy1,ty);
559 fiz1 = _mm_add_ps(fiz1,tz);
561 fjx2 = _mm_add_ps(fjx2,tx);
562 fjy2 = _mm_add_ps(fjy2,ty);
563 fjz2 = _mm_add_ps(fjz2,tz);
567 /**************************
568 * CALCULATE INTERACTIONS *
569 **************************/
571 if (gmx_mm_any_lt(rsq20,rcutoff2))
574 /* REACTION-FIELD ELECTROSTATICS */
575 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
576 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
578 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
580 /* Update potential sum for this i atom from the interaction with this j atom. */
581 velec = _mm_and_ps(velec,cutoff_mask);
582 velecsum = _mm_add_ps(velecsum,velec);
586 fscal = _mm_and_ps(fscal,cutoff_mask);
588 /* Calculate temporary vectorial force */
589 tx = _mm_mul_ps(fscal,dx20);
590 ty = _mm_mul_ps(fscal,dy20);
591 tz = _mm_mul_ps(fscal,dz20);
593 /* Update vectorial force */
594 fix2 = _mm_add_ps(fix2,tx);
595 fiy2 = _mm_add_ps(fiy2,ty);
596 fiz2 = _mm_add_ps(fiz2,tz);
598 fjx0 = _mm_add_ps(fjx0,tx);
599 fjy0 = _mm_add_ps(fjy0,ty);
600 fjz0 = _mm_add_ps(fjz0,tz);
604 /**************************
605 * CALCULATE INTERACTIONS *
606 **************************/
608 if (gmx_mm_any_lt(rsq21,rcutoff2))
611 /* REACTION-FIELD ELECTROSTATICS */
612 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
613 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
615 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
617 /* Update potential sum for this i atom from the interaction with this j atom. */
618 velec = _mm_and_ps(velec,cutoff_mask);
619 velecsum = _mm_add_ps(velecsum,velec);
623 fscal = _mm_and_ps(fscal,cutoff_mask);
625 /* Calculate temporary vectorial force */
626 tx = _mm_mul_ps(fscal,dx21);
627 ty = _mm_mul_ps(fscal,dy21);
628 tz = _mm_mul_ps(fscal,dz21);
630 /* Update vectorial force */
631 fix2 = _mm_add_ps(fix2,tx);
632 fiy2 = _mm_add_ps(fiy2,ty);
633 fiz2 = _mm_add_ps(fiz2,tz);
635 fjx1 = _mm_add_ps(fjx1,tx);
636 fjy1 = _mm_add_ps(fjy1,ty);
637 fjz1 = _mm_add_ps(fjz1,tz);
641 /**************************
642 * CALCULATE INTERACTIONS *
643 **************************/
645 if (gmx_mm_any_lt(rsq22,rcutoff2))
648 /* REACTION-FIELD ELECTROSTATICS */
649 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
650 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
652 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velec = _mm_and_ps(velec,cutoff_mask);
656 velecsum = _mm_add_ps(velecsum,velec);
660 fscal = _mm_and_ps(fscal,cutoff_mask);
662 /* Calculate temporary vectorial force */
663 tx = _mm_mul_ps(fscal,dx22);
664 ty = _mm_mul_ps(fscal,dy22);
665 tz = _mm_mul_ps(fscal,dz22);
667 /* Update vectorial force */
668 fix2 = _mm_add_ps(fix2,tx);
669 fiy2 = _mm_add_ps(fiy2,ty);
670 fiz2 = _mm_add_ps(fiz2,tz);
672 fjx2 = _mm_add_ps(fjx2,tx);
673 fjy2 = _mm_add_ps(fjy2,ty);
674 fjz2 = _mm_add_ps(fjz2,tz);
678 fjptrA = f+j_coord_offsetA;
679 fjptrB = f+j_coord_offsetB;
680 fjptrC = f+j_coord_offsetC;
681 fjptrD = f+j_coord_offsetD;
683 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
684 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
686 /* Inner loop uses 360 flops */
692 /* Get j neighbor index, and coordinate index */
693 jnrlistA = jjnr[jidx];
694 jnrlistB = jjnr[jidx+1];
695 jnrlistC = jjnr[jidx+2];
696 jnrlistD = jjnr[jidx+3];
697 /* Sign of each element will be negative for non-real atoms.
698 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
699 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
701 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
702 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
703 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
704 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
705 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
706 j_coord_offsetA = DIM*jnrA;
707 j_coord_offsetB = DIM*jnrB;
708 j_coord_offsetC = DIM*jnrC;
709 j_coord_offsetD = DIM*jnrD;
711 /* load j atom coordinates */
712 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
713 x+j_coord_offsetC,x+j_coord_offsetD,
714 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
716 /* Calculate displacement vector */
717 dx00 = _mm_sub_ps(ix0,jx0);
718 dy00 = _mm_sub_ps(iy0,jy0);
719 dz00 = _mm_sub_ps(iz0,jz0);
720 dx01 = _mm_sub_ps(ix0,jx1);
721 dy01 = _mm_sub_ps(iy0,jy1);
722 dz01 = _mm_sub_ps(iz0,jz1);
723 dx02 = _mm_sub_ps(ix0,jx2);
724 dy02 = _mm_sub_ps(iy0,jy2);
725 dz02 = _mm_sub_ps(iz0,jz2);
726 dx10 = _mm_sub_ps(ix1,jx0);
727 dy10 = _mm_sub_ps(iy1,jy0);
728 dz10 = _mm_sub_ps(iz1,jz0);
729 dx11 = _mm_sub_ps(ix1,jx1);
730 dy11 = _mm_sub_ps(iy1,jy1);
731 dz11 = _mm_sub_ps(iz1,jz1);
732 dx12 = _mm_sub_ps(ix1,jx2);
733 dy12 = _mm_sub_ps(iy1,jy2);
734 dz12 = _mm_sub_ps(iz1,jz2);
735 dx20 = _mm_sub_ps(ix2,jx0);
736 dy20 = _mm_sub_ps(iy2,jy0);
737 dz20 = _mm_sub_ps(iz2,jz0);
738 dx21 = _mm_sub_ps(ix2,jx1);
739 dy21 = _mm_sub_ps(iy2,jy1);
740 dz21 = _mm_sub_ps(iz2,jz1);
741 dx22 = _mm_sub_ps(ix2,jx2);
742 dy22 = _mm_sub_ps(iy2,jy2);
743 dz22 = _mm_sub_ps(iz2,jz2);
745 /* Calculate squared distance and things based on it */
746 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
747 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
748 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
749 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
750 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
751 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
752 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
753 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
754 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
756 rinv00 = sse2_invsqrt_f(rsq00);
757 rinv01 = sse2_invsqrt_f(rsq01);
758 rinv02 = sse2_invsqrt_f(rsq02);
759 rinv10 = sse2_invsqrt_f(rsq10);
760 rinv11 = sse2_invsqrt_f(rsq11);
761 rinv12 = sse2_invsqrt_f(rsq12);
762 rinv20 = sse2_invsqrt_f(rsq20);
763 rinv21 = sse2_invsqrt_f(rsq21);
764 rinv22 = sse2_invsqrt_f(rsq22);
766 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
767 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
768 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
769 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
770 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
771 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
772 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
773 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
774 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
776 fjx0 = _mm_setzero_ps();
777 fjy0 = _mm_setzero_ps();
778 fjz0 = _mm_setzero_ps();
779 fjx1 = _mm_setzero_ps();
780 fjy1 = _mm_setzero_ps();
781 fjz1 = _mm_setzero_ps();
782 fjx2 = _mm_setzero_ps();
783 fjy2 = _mm_setzero_ps();
784 fjz2 = _mm_setzero_ps();
786 /**************************
787 * CALCULATE INTERACTIONS *
788 **************************/
790 if (gmx_mm_any_lt(rsq00,rcutoff2))
793 r00 = _mm_mul_ps(rsq00,rinv00);
794 r00 = _mm_andnot_ps(dummy_mask,r00);
796 /* Calculate table index by multiplying r with table scale and truncate to integer */
797 rt = _mm_mul_ps(r00,vftabscale);
798 vfitab = _mm_cvttps_epi32(rt);
799 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
800 vfitab = _mm_slli_epi32(vfitab,3);
802 /* REACTION-FIELD ELECTROSTATICS */
803 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
804 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
806 /* CUBIC SPLINE TABLE DISPERSION */
807 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
808 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
809 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
810 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
811 _MM_TRANSPOSE4_PS(Y,F,G,H);
812 Heps = _mm_mul_ps(vfeps,H);
813 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
814 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
815 vvdw6 = _mm_mul_ps(c6_00,VV);
816 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
817 fvdw6 = _mm_mul_ps(c6_00,FF);
819 /* CUBIC SPLINE TABLE REPULSION */
820 vfitab = _mm_add_epi32(vfitab,ifour);
821 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
822 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
823 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
824 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
825 _MM_TRANSPOSE4_PS(Y,F,G,H);
826 Heps = _mm_mul_ps(vfeps,H);
827 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
828 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
829 vvdw12 = _mm_mul_ps(c12_00,VV);
830 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
831 fvdw12 = _mm_mul_ps(c12_00,FF);
832 vvdw = _mm_add_ps(vvdw12,vvdw6);
833 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
835 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _mm_and_ps(velec,cutoff_mask);
839 velec = _mm_andnot_ps(dummy_mask,velec);
840 velecsum = _mm_add_ps(velecsum,velec);
841 vvdw = _mm_and_ps(vvdw,cutoff_mask);
842 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
843 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
845 fscal = _mm_add_ps(felec,fvdw);
847 fscal = _mm_and_ps(fscal,cutoff_mask);
849 fscal = _mm_andnot_ps(dummy_mask,fscal);
851 /* Calculate temporary vectorial force */
852 tx = _mm_mul_ps(fscal,dx00);
853 ty = _mm_mul_ps(fscal,dy00);
854 tz = _mm_mul_ps(fscal,dz00);
856 /* Update vectorial force */
857 fix0 = _mm_add_ps(fix0,tx);
858 fiy0 = _mm_add_ps(fiy0,ty);
859 fiz0 = _mm_add_ps(fiz0,tz);
861 fjx0 = _mm_add_ps(fjx0,tx);
862 fjy0 = _mm_add_ps(fjy0,ty);
863 fjz0 = _mm_add_ps(fjz0,tz);
867 /**************************
868 * CALCULATE INTERACTIONS *
869 **************************/
871 if (gmx_mm_any_lt(rsq01,rcutoff2))
874 /* REACTION-FIELD ELECTROSTATICS */
875 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
876 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
878 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
880 /* Update potential sum for this i atom from the interaction with this j atom. */
881 velec = _mm_and_ps(velec,cutoff_mask);
882 velec = _mm_andnot_ps(dummy_mask,velec);
883 velecsum = _mm_add_ps(velecsum,velec);
887 fscal = _mm_and_ps(fscal,cutoff_mask);
889 fscal = _mm_andnot_ps(dummy_mask,fscal);
891 /* Calculate temporary vectorial force */
892 tx = _mm_mul_ps(fscal,dx01);
893 ty = _mm_mul_ps(fscal,dy01);
894 tz = _mm_mul_ps(fscal,dz01);
896 /* Update vectorial force */
897 fix0 = _mm_add_ps(fix0,tx);
898 fiy0 = _mm_add_ps(fiy0,ty);
899 fiz0 = _mm_add_ps(fiz0,tz);
901 fjx1 = _mm_add_ps(fjx1,tx);
902 fjy1 = _mm_add_ps(fjy1,ty);
903 fjz1 = _mm_add_ps(fjz1,tz);
907 /**************************
908 * CALCULATE INTERACTIONS *
909 **************************/
911 if (gmx_mm_any_lt(rsq02,rcutoff2))
914 /* REACTION-FIELD ELECTROSTATICS */
915 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
916 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
918 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
920 /* Update potential sum for this i atom from the interaction with this j atom. */
921 velec = _mm_and_ps(velec,cutoff_mask);
922 velec = _mm_andnot_ps(dummy_mask,velec);
923 velecsum = _mm_add_ps(velecsum,velec);
927 fscal = _mm_and_ps(fscal,cutoff_mask);
929 fscal = _mm_andnot_ps(dummy_mask,fscal);
931 /* Calculate temporary vectorial force */
932 tx = _mm_mul_ps(fscal,dx02);
933 ty = _mm_mul_ps(fscal,dy02);
934 tz = _mm_mul_ps(fscal,dz02);
936 /* Update vectorial force */
937 fix0 = _mm_add_ps(fix0,tx);
938 fiy0 = _mm_add_ps(fiy0,ty);
939 fiz0 = _mm_add_ps(fiz0,tz);
941 fjx2 = _mm_add_ps(fjx2,tx);
942 fjy2 = _mm_add_ps(fjy2,ty);
943 fjz2 = _mm_add_ps(fjz2,tz);
947 /**************************
948 * CALCULATE INTERACTIONS *
949 **************************/
951 if (gmx_mm_any_lt(rsq10,rcutoff2))
954 /* REACTION-FIELD ELECTROSTATICS */
955 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
956 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
958 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
960 /* Update potential sum for this i atom from the interaction with this j atom. */
961 velec = _mm_and_ps(velec,cutoff_mask);
962 velec = _mm_andnot_ps(dummy_mask,velec);
963 velecsum = _mm_add_ps(velecsum,velec);
967 fscal = _mm_and_ps(fscal,cutoff_mask);
969 fscal = _mm_andnot_ps(dummy_mask,fscal);
971 /* Calculate temporary vectorial force */
972 tx = _mm_mul_ps(fscal,dx10);
973 ty = _mm_mul_ps(fscal,dy10);
974 tz = _mm_mul_ps(fscal,dz10);
976 /* Update vectorial force */
977 fix1 = _mm_add_ps(fix1,tx);
978 fiy1 = _mm_add_ps(fiy1,ty);
979 fiz1 = _mm_add_ps(fiz1,tz);
981 fjx0 = _mm_add_ps(fjx0,tx);
982 fjy0 = _mm_add_ps(fjy0,ty);
983 fjz0 = _mm_add_ps(fjz0,tz);
987 /**************************
988 * CALCULATE INTERACTIONS *
989 **************************/
991 if (gmx_mm_any_lt(rsq11,rcutoff2))
994 /* REACTION-FIELD ELECTROSTATICS */
995 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
996 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
998 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1000 /* Update potential sum for this i atom from the interaction with this j atom. */
1001 velec = _mm_and_ps(velec,cutoff_mask);
1002 velec = _mm_andnot_ps(dummy_mask,velec);
1003 velecsum = _mm_add_ps(velecsum,velec);
1007 fscal = _mm_and_ps(fscal,cutoff_mask);
1009 fscal = _mm_andnot_ps(dummy_mask,fscal);
1011 /* Calculate temporary vectorial force */
1012 tx = _mm_mul_ps(fscal,dx11);
1013 ty = _mm_mul_ps(fscal,dy11);
1014 tz = _mm_mul_ps(fscal,dz11);
1016 /* Update vectorial force */
1017 fix1 = _mm_add_ps(fix1,tx);
1018 fiy1 = _mm_add_ps(fiy1,ty);
1019 fiz1 = _mm_add_ps(fiz1,tz);
1021 fjx1 = _mm_add_ps(fjx1,tx);
1022 fjy1 = _mm_add_ps(fjy1,ty);
1023 fjz1 = _mm_add_ps(fjz1,tz);
1027 /**************************
1028 * CALCULATE INTERACTIONS *
1029 **************************/
1031 if (gmx_mm_any_lt(rsq12,rcutoff2))
1034 /* REACTION-FIELD ELECTROSTATICS */
1035 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
1036 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1038 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec = _mm_and_ps(velec,cutoff_mask);
1042 velec = _mm_andnot_ps(dummy_mask,velec);
1043 velecsum = _mm_add_ps(velecsum,velec);
1047 fscal = _mm_and_ps(fscal,cutoff_mask);
1049 fscal = _mm_andnot_ps(dummy_mask,fscal);
1051 /* Calculate temporary vectorial force */
1052 tx = _mm_mul_ps(fscal,dx12);
1053 ty = _mm_mul_ps(fscal,dy12);
1054 tz = _mm_mul_ps(fscal,dz12);
1056 /* Update vectorial force */
1057 fix1 = _mm_add_ps(fix1,tx);
1058 fiy1 = _mm_add_ps(fiy1,ty);
1059 fiz1 = _mm_add_ps(fiz1,tz);
1061 fjx2 = _mm_add_ps(fjx2,tx);
1062 fjy2 = _mm_add_ps(fjy2,ty);
1063 fjz2 = _mm_add_ps(fjz2,tz);
1067 /**************************
1068 * CALCULATE INTERACTIONS *
1069 **************************/
1071 if (gmx_mm_any_lt(rsq20,rcutoff2))
1074 /* REACTION-FIELD ELECTROSTATICS */
1075 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1076 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1078 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1080 /* Update potential sum for this i atom from the interaction with this j atom. */
1081 velec = _mm_and_ps(velec,cutoff_mask);
1082 velec = _mm_andnot_ps(dummy_mask,velec);
1083 velecsum = _mm_add_ps(velecsum,velec);
1087 fscal = _mm_and_ps(fscal,cutoff_mask);
1089 fscal = _mm_andnot_ps(dummy_mask,fscal);
1091 /* Calculate temporary vectorial force */
1092 tx = _mm_mul_ps(fscal,dx20);
1093 ty = _mm_mul_ps(fscal,dy20);
1094 tz = _mm_mul_ps(fscal,dz20);
1096 /* Update vectorial force */
1097 fix2 = _mm_add_ps(fix2,tx);
1098 fiy2 = _mm_add_ps(fiy2,ty);
1099 fiz2 = _mm_add_ps(fiz2,tz);
1101 fjx0 = _mm_add_ps(fjx0,tx);
1102 fjy0 = _mm_add_ps(fjy0,ty);
1103 fjz0 = _mm_add_ps(fjz0,tz);
1107 /**************************
1108 * CALCULATE INTERACTIONS *
1109 **************************/
1111 if (gmx_mm_any_lt(rsq21,rcutoff2))
1114 /* REACTION-FIELD ELECTROSTATICS */
1115 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1116 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1118 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1120 /* Update potential sum for this i atom from the interaction with this j atom. */
1121 velec = _mm_and_ps(velec,cutoff_mask);
1122 velec = _mm_andnot_ps(dummy_mask,velec);
1123 velecsum = _mm_add_ps(velecsum,velec);
1127 fscal = _mm_and_ps(fscal,cutoff_mask);
1129 fscal = _mm_andnot_ps(dummy_mask,fscal);
1131 /* Calculate temporary vectorial force */
1132 tx = _mm_mul_ps(fscal,dx21);
1133 ty = _mm_mul_ps(fscal,dy21);
1134 tz = _mm_mul_ps(fscal,dz21);
1136 /* Update vectorial force */
1137 fix2 = _mm_add_ps(fix2,tx);
1138 fiy2 = _mm_add_ps(fiy2,ty);
1139 fiz2 = _mm_add_ps(fiz2,tz);
1141 fjx1 = _mm_add_ps(fjx1,tx);
1142 fjy1 = _mm_add_ps(fjy1,ty);
1143 fjz1 = _mm_add_ps(fjz1,tz);
1147 /**************************
1148 * CALCULATE INTERACTIONS *
1149 **************************/
1151 if (gmx_mm_any_lt(rsq22,rcutoff2))
1154 /* REACTION-FIELD ELECTROSTATICS */
1155 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1156 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1158 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1160 /* Update potential sum for this i atom from the interaction with this j atom. */
1161 velec = _mm_and_ps(velec,cutoff_mask);
1162 velec = _mm_andnot_ps(dummy_mask,velec);
1163 velecsum = _mm_add_ps(velecsum,velec);
1167 fscal = _mm_and_ps(fscal,cutoff_mask);
1169 fscal = _mm_andnot_ps(dummy_mask,fscal);
1171 /* Calculate temporary vectorial force */
1172 tx = _mm_mul_ps(fscal,dx22);
1173 ty = _mm_mul_ps(fscal,dy22);
1174 tz = _mm_mul_ps(fscal,dz22);
1176 /* Update vectorial force */
1177 fix2 = _mm_add_ps(fix2,tx);
1178 fiy2 = _mm_add_ps(fiy2,ty);
1179 fiz2 = _mm_add_ps(fiz2,tz);
1181 fjx2 = _mm_add_ps(fjx2,tx);
1182 fjy2 = _mm_add_ps(fjy2,ty);
1183 fjz2 = _mm_add_ps(fjz2,tz);
1187 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1188 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1189 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1190 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1192 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1193 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1195 /* Inner loop uses 361 flops */
1198 /* End of innermost loop */
1200 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1201 f+i_coord_offset,fshift+i_shift_offset);
1204 /* Update potential energies */
1205 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1206 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1208 /* Increment number of inner iterations */
1209 inneriter += j_index_end - j_index_start;
1211 /* Outer loop uses 20 flops */
1214 /* Increment number of outer iterations */
1217 /* Update outer/inner flops */
1219 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*361);
1222 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single
1223 * Electrostatics interaction: ReactionField
1224 * VdW interaction: CubicSplineTable
1225 * Geometry: Water3-Water3
1226 * Calculate force/pot: Force
1229 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single
1230 (t_nblist * gmx_restrict nlist,
1231 rvec * gmx_restrict xx,
1232 rvec * gmx_restrict ff,
1233 struct t_forcerec * gmx_restrict fr,
1234 t_mdatoms * gmx_restrict mdatoms,
1235 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1236 t_nrnb * gmx_restrict nrnb)
1238 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1239 * just 0 for non-waters.
1240 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1241 * jnr indices corresponding to data put in the four positions in the SIMD register.
1243 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1244 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1245 int jnrA,jnrB,jnrC,jnrD;
1246 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1247 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1248 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1249 real rcutoff_scalar;
1250 real *shiftvec,*fshift,*x,*f;
1251 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1252 real scratch[4*DIM];
1253 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1255 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1257 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1259 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1260 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1261 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1262 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1263 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1264 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1265 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1266 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1267 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1268 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1269 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1270 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1271 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1272 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1273 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1274 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1275 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1278 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1281 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1282 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1284 __m128i ifour = _mm_set1_epi32(4);
1285 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1287 __m128 dummy_mask,cutoff_mask;
1288 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1289 __m128 one = _mm_set1_ps(1.0);
1290 __m128 two = _mm_set1_ps(2.0);
1296 jindex = nlist->jindex;
1298 shiftidx = nlist->shift;
1300 shiftvec = fr->shift_vec[0];
1301 fshift = fr->fshift[0];
1302 facel = _mm_set1_ps(fr->ic->epsfac);
1303 charge = mdatoms->chargeA;
1304 krf = _mm_set1_ps(fr->ic->k_rf);
1305 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1306 crf = _mm_set1_ps(fr->ic->c_rf);
1307 nvdwtype = fr->ntype;
1308 vdwparam = fr->nbfp;
1309 vdwtype = mdatoms->typeA;
1311 vftab = kernel_data->table_vdw->data;
1312 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1314 /* Setup water-specific parameters */
1315 inr = nlist->iinr[0];
1316 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1317 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1318 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1319 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1321 jq0 = _mm_set1_ps(charge[inr+0]);
1322 jq1 = _mm_set1_ps(charge[inr+1]);
1323 jq2 = _mm_set1_ps(charge[inr+2]);
1324 vdwjidx0A = 2*vdwtype[inr+0];
1325 qq00 = _mm_mul_ps(iq0,jq0);
1326 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1327 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1328 qq01 = _mm_mul_ps(iq0,jq1);
1329 qq02 = _mm_mul_ps(iq0,jq2);
1330 qq10 = _mm_mul_ps(iq1,jq0);
1331 qq11 = _mm_mul_ps(iq1,jq1);
1332 qq12 = _mm_mul_ps(iq1,jq2);
1333 qq20 = _mm_mul_ps(iq2,jq0);
1334 qq21 = _mm_mul_ps(iq2,jq1);
1335 qq22 = _mm_mul_ps(iq2,jq2);
1337 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1338 rcutoff_scalar = fr->ic->rcoulomb;
1339 rcutoff = _mm_set1_ps(rcutoff_scalar);
1340 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1342 /* Avoid stupid compiler warnings */
1343 jnrA = jnrB = jnrC = jnrD = 0;
1344 j_coord_offsetA = 0;
1345 j_coord_offsetB = 0;
1346 j_coord_offsetC = 0;
1347 j_coord_offsetD = 0;
1352 for(iidx=0;iidx<4*DIM;iidx++)
1354 scratch[iidx] = 0.0;
1357 /* Start outer loop over neighborlists */
1358 for(iidx=0; iidx<nri; iidx++)
1360 /* Load shift vector for this list */
1361 i_shift_offset = DIM*shiftidx[iidx];
1363 /* Load limits for loop over neighbors */
1364 j_index_start = jindex[iidx];
1365 j_index_end = jindex[iidx+1];
1367 /* Get outer coordinate index */
1369 i_coord_offset = DIM*inr;
1371 /* Load i particle coords and add shift vector */
1372 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1373 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1375 fix0 = _mm_setzero_ps();
1376 fiy0 = _mm_setzero_ps();
1377 fiz0 = _mm_setzero_ps();
1378 fix1 = _mm_setzero_ps();
1379 fiy1 = _mm_setzero_ps();
1380 fiz1 = _mm_setzero_ps();
1381 fix2 = _mm_setzero_ps();
1382 fiy2 = _mm_setzero_ps();
1383 fiz2 = _mm_setzero_ps();
1385 /* Start inner kernel loop */
1386 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1389 /* Get j neighbor index, and coordinate index */
1391 jnrB = jjnr[jidx+1];
1392 jnrC = jjnr[jidx+2];
1393 jnrD = jjnr[jidx+3];
1394 j_coord_offsetA = DIM*jnrA;
1395 j_coord_offsetB = DIM*jnrB;
1396 j_coord_offsetC = DIM*jnrC;
1397 j_coord_offsetD = DIM*jnrD;
1399 /* load j atom coordinates */
1400 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1401 x+j_coord_offsetC,x+j_coord_offsetD,
1402 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1404 /* Calculate displacement vector */
1405 dx00 = _mm_sub_ps(ix0,jx0);
1406 dy00 = _mm_sub_ps(iy0,jy0);
1407 dz00 = _mm_sub_ps(iz0,jz0);
1408 dx01 = _mm_sub_ps(ix0,jx1);
1409 dy01 = _mm_sub_ps(iy0,jy1);
1410 dz01 = _mm_sub_ps(iz0,jz1);
1411 dx02 = _mm_sub_ps(ix0,jx2);
1412 dy02 = _mm_sub_ps(iy0,jy2);
1413 dz02 = _mm_sub_ps(iz0,jz2);
1414 dx10 = _mm_sub_ps(ix1,jx0);
1415 dy10 = _mm_sub_ps(iy1,jy0);
1416 dz10 = _mm_sub_ps(iz1,jz0);
1417 dx11 = _mm_sub_ps(ix1,jx1);
1418 dy11 = _mm_sub_ps(iy1,jy1);
1419 dz11 = _mm_sub_ps(iz1,jz1);
1420 dx12 = _mm_sub_ps(ix1,jx2);
1421 dy12 = _mm_sub_ps(iy1,jy2);
1422 dz12 = _mm_sub_ps(iz1,jz2);
1423 dx20 = _mm_sub_ps(ix2,jx0);
1424 dy20 = _mm_sub_ps(iy2,jy0);
1425 dz20 = _mm_sub_ps(iz2,jz0);
1426 dx21 = _mm_sub_ps(ix2,jx1);
1427 dy21 = _mm_sub_ps(iy2,jy1);
1428 dz21 = _mm_sub_ps(iz2,jz1);
1429 dx22 = _mm_sub_ps(ix2,jx2);
1430 dy22 = _mm_sub_ps(iy2,jy2);
1431 dz22 = _mm_sub_ps(iz2,jz2);
1433 /* Calculate squared distance and things based on it */
1434 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1435 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1436 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1437 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1438 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1439 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1440 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1441 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1442 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1444 rinv00 = sse2_invsqrt_f(rsq00);
1445 rinv01 = sse2_invsqrt_f(rsq01);
1446 rinv02 = sse2_invsqrt_f(rsq02);
1447 rinv10 = sse2_invsqrt_f(rsq10);
1448 rinv11 = sse2_invsqrt_f(rsq11);
1449 rinv12 = sse2_invsqrt_f(rsq12);
1450 rinv20 = sse2_invsqrt_f(rsq20);
1451 rinv21 = sse2_invsqrt_f(rsq21);
1452 rinv22 = sse2_invsqrt_f(rsq22);
1454 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1455 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1456 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1457 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1458 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1459 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1460 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1461 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1462 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1464 fjx0 = _mm_setzero_ps();
1465 fjy0 = _mm_setzero_ps();
1466 fjz0 = _mm_setzero_ps();
1467 fjx1 = _mm_setzero_ps();
1468 fjy1 = _mm_setzero_ps();
1469 fjz1 = _mm_setzero_ps();
1470 fjx2 = _mm_setzero_ps();
1471 fjy2 = _mm_setzero_ps();
1472 fjz2 = _mm_setzero_ps();
1474 /**************************
1475 * CALCULATE INTERACTIONS *
1476 **************************/
1478 if (gmx_mm_any_lt(rsq00,rcutoff2))
1481 r00 = _mm_mul_ps(rsq00,rinv00);
1483 /* Calculate table index by multiplying r with table scale and truncate to integer */
1484 rt = _mm_mul_ps(r00,vftabscale);
1485 vfitab = _mm_cvttps_epi32(rt);
1486 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1487 vfitab = _mm_slli_epi32(vfitab,3);
1489 /* REACTION-FIELD ELECTROSTATICS */
1490 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1492 /* CUBIC SPLINE TABLE DISPERSION */
1493 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1494 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1495 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1496 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1497 _MM_TRANSPOSE4_PS(Y,F,G,H);
1498 Heps = _mm_mul_ps(vfeps,H);
1499 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1500 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1501 fvdw6 = _mm_mul_ps(c6_00,FF);
1503 /* CUBIC SPLINE TABLE REPULSION */
1504 vfitab = _mm_add_epi32(vfitab,ifour);
1505 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1506 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1507 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1508 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1509 _MM_TRANSPOSE4_PS(Y,F,G,H);
1510 Heps = _mm_mul_ps(vfeps,H);
1511 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1512 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1513 fvdw12 = _mm_mul_ps(c12_00,FF);
1514 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1516 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1518 fscal = _mm_add_ps(felec,fvdw);
1520 fscal = _mm_and_ps(fscal,cutoff_mask);
1522 /* Calculate temporary vectorial force */
1523 tx = _mm_mul_ps(fscal,dx00);
1524 ty = _mm_mul_ps(fscal,dy00);
1525 tz = _mm_mul_ps(fscal,dz00);
1527 /* Update vectorial force */
1528 fix0 = _mm_add_ps(fix0,tx);
1529 fiy0 = _mm_add_ps(fiy0,ty);
1530 fiz0 = _mm_add_ps(fiz0,tz);
1532 fjx0 = _mm_add_ps(fjx0,tx);
1533 fjy0 = _mm_add_ps(fjy0,ty);
1534 fjz0 = _mm_add_ps(fjz0,tz);
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 if (gmx_mm_any_lt(rsq01,rcutoff2))
1545 /* REACTION-FIELD ELECTROSTATICS */
1546 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1548 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1552 fscal = _mm_and_ps(fscal,cutoff_mask);
1554 /* Calculate temporary vectorial force */
1555 tx = _mm_mul_ps(fscal,dx01);
1556 ty = _mm_mul_ps(fscal,dy01);
1557 tz = _mm_mul_ps(fscal,dz01);
1559 /* Update vectorial force */
1560 fix0 = _mm_add_ps(fix0,tx);
1561 fiy0 = _mm_add_ps(fiy0,ty);
1562 fiz0 = _mm_add_ps(fiz0,tz);
1564 fjx1 = _mm_add_ps(fjx1,tx);
1565 fjy1 = _mm_add_ps(fjy1,ty);
1566 fjz1 = _mm_add_ps(fjz1,tz);
1570 /**************************
1571 * CALCULATE INTERACTIONS *
1572 **************************/
1574 if (gmx_mm_any_lt(rsq02,rcutoff2))
1577 /* REACTION-FIELD ELECTROSTATICS */
1578 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1580 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1584 fscal = _mm_and_ps(fscal,cutoff_mask);
1586 /* Calculate temporary vectorial force */
1587 tx = _mm_mul_ps(fscal,dx02);
1588 ty = _mm_mul_ps(fscal,dy02);
1589 tz = _mm_mul_ps(fscal,dz02);
1591 /* Update vectorial force */
1592 fix0 = _mm_add_ps(fix0,tx);
1593 fiy0 = _mm_add_ps(fiy0,ty);
1594 fiz0 = _mm_add_ps(fiz0,tz);
1596 fjx2 = _mm_add_ps(fjx2,tx);
1597 fjy2 = _mm_add_ps(fjy2,ty);
1598 fjz2 = _mm_add_ps(fjz2,tz);
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 if (gmx_mm_any_lt(rsq10,rcutoff2))
1609 /* REACTION-FIELD ELECTROSTATICS */
1610 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1612 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1616 fscal = _mm_and_ps(fscal,cutoff_mask);
1618 /* Calculate temporary vectorial force */
1619 tx = _mm_mul_ps(fscal,dx10);
1620 ty = _mm_mul_ps(fscal,dy10);
1621 tz = _mm_mul_ps(fscal,dz10);
1623 /* Update vectorial force */
1624 fix1 = _mm_add_ps(fix1,tx);
1625 fiy1 = _mm_add_ps(fiy1,ty);
1626 fiz1 = _mm_add_ps(fiz1,tz);
1628 fjx0 = _mm_add_ps(fjx0,tx);
1629 fjy0 = _mm_add_ps(fjy0,ty);
1630 fjz0 = _mm_add_ps(fjz0,tz);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 if (gmx_mm_any_lt(rsq11,rcutoff2))
1641 /* REACTION-FIELD ELECTROSTATICS */
1642 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1644 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1648 fscal = _mm_and_ps(fscal,cutoff_mask);
1650 /* Calculate temporary vectorial force */
1651 tx = _mm_mul_ps(fscal,dx11);
1652 ty = _mm_mul_ps(fscal,dy11);
1653 tz = _mm_mul_ps(fscal,dz11);
1655 /* Update vectorial force */
1656 fix1 = _mm_add_ps(fix1,tx);
1657 fiy1 = _mm_add_ps(fiy1,ty);
1658 fiz1 = _mm_add_ps(fiz1,tz);
1660 fjx1 = _mm_add_ps(fjx1,tx);
1661 fjy1 = _mm_add_ps(fjy1,ty);
1662 fjz1 = _mm_add_ps(fjz1,tz);
1666 /**************************
1667 * CALCULATE INTERACTIONS *
1668 **************************/
1670 if (gmx_mm_any_lt(rsq12,rcutoff2))
1673 /* REACTION-FIELD ELECTROSTATICS */
1674 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1676 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1680 fscal = _mm_and_ps(fscal,cutoff_mask);
1682 /* Calculate temporary vectorial force */
1683 tx = _mm_mul_ps(fscal,dx12);
1684 ty = _mm_mul_ps(fscal,dy12);
1685 tz = _mm_mul_ps(fscal,dz12);
1687 /* Update vectorial force */
1688 fix1 = _mm_add_ps(fix1,tx);
1689 fiy1 = _mm_add_ps(fiy1,ty);
1690 fiz1 = _mm_add_ps(fiz1,tz);
1692 fjx2 = _mm_add_ps(fjx2,tx);
1693 fjy2 = _mm_add_ps(fjy2,ty);
1694 fjz2 = _mm_add_ps(fjz2,tz);
1698 /**************************
1699 * CALCULATE INTERACTIONS *
1700 **************************/
1702 if (gmx_mm_any_lt(rsq20,rcutoff2))
1705 /* REACTION-FIELD ELECTROSTATICS */
1706 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1708 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1712 fscal = _mm_and_ps(fscal,cutoff_mask);
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_ps(fscal,dx20);
1716 ty = _mm_mul_ps(fscal,dy20);
1717 tz = _mm_mul_ps(fscal,dz20);
1719 /* Update vectorial force */
1720 fix2 = _mm_add_ps(fix2,tx);
1721 fiy2 = _mm_add_ps(fiy2,ty);
1722 fiz2 = _mm_add_ps(fiz2,tz);
1724 fjx0 = _mm_add_ps(fjx0,tx);
1725 fjy0 = _mm_add_ps(fjy0,ty);
1726 fjz0 = _mm_add_ps(fjz0,tz);
1730 /**************************
1731 * CALCULATE INTERACTIONS *
1732 **************************/
1734 if (gmx_mm_any_lt(rsq21,rcutoff2))
1737 /* REACTION-FIELD ELECTROSTATICS */
1738 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1740 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1744 fscal = _mm_and_ps(fscal,cutoff_mask);
1746 /* Calculate temporary vectorial force */
1747 tx = _mm_mul_ps(fscal,dx21);
1748 ty = _mm_mul_ps(fscal,dy21);
1749 tz = _mm_mul_ps(fscal,dz21);
1751 /* Update vectorial force */
1752 fix2 = _mm_add_ps(fix2,tx);
1753 fiy2 = _mm_add_ps(fiy2,ty);
1754 fiz2 = _mm_add_ps(fiz2,tz);
1756 fjx1 = _mm_add_ps(fjx1,tx);
1757 fjy1 = _mm_add_ps(fjy1,ty);
1758 fjz1 = _mm_add_ps(fjz1,tz);
1762 /**************************
1763 * CALCULATE INTERACTIONS *
1764 **************************/
1766 if (gmx_mm_any_lt(rsq22,rcutoff2))
1769 /* REACTION-FIELD ELECTROSTATICS */
1770 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1772 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1776 fscal = _mm_and_ps(fscal,cutoff_mask);
1778 /* Calculate temporary vectorial force */
1779 tx = _mm_mul_ps(fscal,dx22);
1780 ty = _mm_mul_ps(fscal,dy22);
1781 tz = _mm_mul_ps(fscal,dz22);
1783 /* Update vectorial force */
1784 fix2 = _mm_add_ps(fix2,tx);
1785 fiy2 = _mm_add_ps(fiy2,ty);
1786 fiz2 = _mm_add_ps(fiz2,tz);
1788 fjx2 = _mm_add_ps(fjx2,tx);
1789 fjy2 = _mm_add_ps(fjy2,ty);
1790 fjz2 = _mm_add_ps(fjz2,tz);
1794 fjptrA = f+j_coord_offsetA;
1795 fjptrB = f+j_coord_offsetB;
1796 fjptrC = f+j_coord_offsetC;
1797 fjptrD = f+j_coord_offsetD;
1799 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1800 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1802 /* Inner loop uses 297 flops */
1805 if(jidx<j_index_end)
1808 /* Get j neighbor index, and coordinate index */
1809 jnrlistA = jjnr[jidx];
1810 jnrlistB = jjnr[jidx+1];
1811 jnrlistC = jjnr[jidx+2];
1812 jnrlistD = jjnr[jidx+3];
1813 /* Sign of each element will be negative for non-real atoms.
1814 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1815 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1817 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1818 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1819 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1820 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1821 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1822 j_coord_offsetA = DIM*jnrA;
1823 j_coord_offsetB = DIM*jnrB;
1824 j_coord_offsetC = DIM*jnrC;
1825 j_coord_offsetD = DIM*jnrD;
1827 /* load j atom coordinates */
1828 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1829 x+j_coord_offsetC,x+j_coord_offsetD,
1830 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1832 /* Calculate displacement vector */
1833 dx00 = _mm_sub_ps(ix0,jx0);
1834 dy00 = _mm_sub_ps(iy0,jy0);
1835 dz00 = _mm_sub_ps(iz0,jz0);
1836 dx01 = _mm_sub_ps(ix0,jx1);
1837 dy01 = _mm_sub_ps(iy0,jy1);
1838 dz01 = _mm_sub_ps(iz0,jz1);
1839 dx02 = _mm_sub_ps(ix0,jx2);
1840 dy02 = _mm_sub_ps(iy0,jy2);
1841 dz02 = _mm_sub_ps(iz0,jz2);
1842 dx10 = _mm_sub_ps(ix1,jx0);
1843 dy10 = _mm_sub_ps(iy1,jy0);
1844 dz10 = _mm_sub_ps(iz1,jz0);
1845 dx11 = _mm_sub_ps(ix1,jx1);
1846 dy11 = _mm_sub_ps(iy1,jy1);
1847 dz11 = _mm_sub_ps(iz1,jz1);
1848 dx12 = _mm_sub_ps(ix1,jx2);
1849 dy12 = _mm_sub_ps(iy1,jy2);
1850 dz12 = _mm_sub_ps(iz1,jz2);
1851 dx20 = _mm_sub_ps(ix2,jx0);
1852 dy20 = _mm_sub_ps(iy2,jy0);
1853 dz20 = _mm_sub_ps(iz2,jz0);
1854 dx21 = _mm_sub_ps(ix2,jx1);
1855 dy21 = _mm_sub_ps(iy2,jy1);
1856 dz21 = _mm_sub_ps(iz2,jz1);
1857 dx22 = _mm_sub_ps(ix2,jx2);
1858 dy22 = _mm_sub_ps(iy2,jy2);
1859 dz22 = _mm_sub_ps(iz2,jz2);
1861 /* Calculate squared distance and things based on it */
1862 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1863 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1864 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1865 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1866 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1867 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1868 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1869 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1870 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1872 rinv00 = sse2_invsqrt_f(rsq00);
1873 rinv01 = sse2_invsqrt_f(rsq01);
1874 rinv02 = sse2_invsqrt_f(rsq02);
1875 rinv10 = sse2_invsqrt_f(rsq10);
1876 rinv11 = sse2_invsqrt_f(rsq11);
1877 rinv12 = sse2_invsqrt_f(rsq12);
1878 rinv20 = sse2_invsqrt_f(rsq20);
1879 rinv21 = sse2_invsqrt_f(rsq21);
1880 rinv22 = sse2_invsqrt_f(rsq22);
1882 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1883 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1884 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1885 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1886 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1887 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1888 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1889 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1890 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1892 fjx0 = _mm_setzero_ps();
1893 fjy0 = _mm_setzero_ps();
1894 fjz0 = _mm_setzero_ps();
1895 fjx1 = _mm_setzero_ps();
1896 fjy1 = _mm_setzero_ps();
1897 fjz1 = _mm_setzero_ps();
1898 fjx2 = _mm_setzero_ps();
1899 fjy2 = _mm_setzero_ps();
1900 fjz2 = _mm_setzero_ps();
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 if (gmx_mm_any_lt(rsq00,rcutoff2))
1909 r00 = _mm_mul_ps(rsq00,rinv00);
1910 r00 = _mm_andnot_ps(dummy_mask,r00);
1912 /* Calculate table index by multiplying r with table scale and truncate to integer */
1913 rt = _mm_mul_ps(r00,vftabscale);
1914 vfitab = _mm_cvttps_epi32(rt);
1915 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1916 vfitab = _mm_slli_epi32(vfitab,3);
1918 /* REACTION-FIELD ELECTROSTATICS */
1919 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1921 /* CUBIC SPLINE TABLE DISPERSION */
1922 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1923 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1924 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1925 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1926 _MM_TRANSPOSE4_PS(Y,F,G,H);
1927 Heps = _mm_mul_ps(vfeps,H);
1928 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1929 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1930 fvdw6 = _mm_mul_ps(c6_00,FF);
1932 /* CUBIC SPLINE TABLE REPULSION */
1933 vfitab = _mm_add_epi32(vfitab,ifour);
1934 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1935 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1936 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1937 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1938 _MM_TRANSPOSE4_PS(Y,F,G,H);
1939 Heps = _mm_mul_ps(vfeps,H);
1940 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1941 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1942 fvdw12 = _mm_mul_ps(c12_00,FF);
1943 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1945 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1947 fscal = _mm_add_ps(felec,fvdw);
1949 fscal = _mm_and_ps(fscal,cutoff_mask);
1951 fscal = _mm_andnot_ps(dummy_mask,fscal);
1953 /* Calculate temporary vectorial force */
1954 tx = _mm_mul_ps(fscal,dx00);
1955 ty = _mm_mul_ps(fscal,dy00);
1956 tz = _mm_mul_ps(fscal,dz00);
1958 /* Update vectorial force */
1959 fix0 = _mm_add_ps(fix0,tx);
1960 fiy0 = _mm_add_ps(fiy0,ty);
1961 fiz0 = _mm_add_ps(fiz0,tz);
1963 fjx0 = _mm_add_ps(fjx0,tx);
1964 fjy0 = _mm_add_ps(fjy0,ty);
1965 fjz0 = _mm_add_ps(fjz0,tz);
1969 /**************************
1970 * CALCULATE INTERACTIONS *
1971 **************************/
1973 if (gmx_mm_any_lt(rsq01,rcutoff2))
1976 /* REACTION-FIELD ELECTROSTATICS */
1977 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1979 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1983 fscal = _mm_and_ps(fscal,cutoff_mask);
1985 fscal = _mm_andnot_ps(dummy_mask,fscal);
1987 /* Calculate temporary vectorial force */
1988 tx = _mm_mul_ps(fscal,dx01);
1989 ty = _mm_mul_ps(fscal,dy01);
1990 tz = _mm_mul_ps(fscal,dz01);
1992 /* Update vectorial force */
1993 fix0 = _mm_add_ps(fix0,tx);
1994 fiy0 = _mm_add_ps(fiy0,ty);
1995 fiz0 = _mm_add_ps(fiz0,tz);
1997 fjx1 = _mm_add_ps(fjx1,tx);
1998 fjy1 = _mm_add_ps(fjy1,ty);
1999 fjz1 = _mm_add_ps(fjz1,tz);
2003 /**************************
2004 * CALCULATE INTERACTIONS *
2005 **************************/
2007 if (gmx_mm_any_lt(rsq02,rcutoff2))
2010 /* REACTION-FIELD ELECTROSTATICS */
2011 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
2013 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2017 fscal = _mm_and_ps(fscal,cutoff_mask);
2019 fscal = _mm_andnot_ps(dummy_mask,fscal);
2021 /* Calculate temporary vectorial force */
2022 tx = _mm_mul_ps(fscal,dx02);
2023 ty = _mm_mul_ps(fscal,dy02);
2024 tz = _mm_mul_ps(fscal,dz02);
2026 /* Update vectorial force */
2027 fix0 = _mm_add_ps(fix0,tx);
2028 fiy0 = _mm_add_ps(fiy0,ty);
2029 fiz0 = _mm_add_ps(fiz0,tz);
2031 fjx2 = _mm_add_ps(fjx2,tx);
2032 fjy2 = _mm_add_ps(fjy2,ty);
2033 fjz2 = _mm_add_ps(fjz2,tz);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 if (gmx_mm_any_lt(rsq10,rcutoff2))
2044 /* REACTION-FIELD ELECTROSTATICS */
2045 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
2047 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2051 fscal = _mm_and_ps(fscal,cutoff_mask);
2053 fscal = _mm_andnot_ps(dummy_mask,fscal);
2055 /* Calculate temporary vectorial force */
2056 tx = _mm_mul_ps(fscal,dx10);
2057 ty = _mm_mul_ps(fscal,dy10);
2058 tz = _mm_mul_ps(fscal,dz10);
2060 /* Update vectorial force */
2061 fix1 = _mm_add_ps(fix1,tx);
2062 fiy1 = _mm_add_ps(fiy1,ty);
2063 fiz1 = _mm_add_ps(fiz1,tz);
2065 fjx0 = _mm_add_ps(fjx0,tx);
2066 fjy0 = _mm_add_ps(fjy0,ty);
2067 fjz0 = _mm_add_ps(fjz0,tz);
2071 /**************************
2072 * CALCULATE INTERACTIONS *
2073 **************************/
2075 if (gmx_mm_any_lt(rsq11,rcutoff2))
2078 /* REACTION-FIELD ELECTROSTATICS */
2079 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2081 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2085 fscal = _mm_and_ps(fscal,cutoff_mask);
2087 fscal = _mm_andnot_ps(dummy_mask,fscal);
2089 /* Calculate temporary vectorial force */
2090 tx = _mm_mul_ps(fscal,dx11);
2091 ty = _mm_mul_ps(fscal,dy11);
2092 tz = _mm_mul_ps(fscal,dz11);
2094 /* Update vectorial force */
2095 fix1 = _mm_add_ps(fix1,tx);
2096 fiy1 = _mm_add_ps(fiy1,ty);
2097 fiz1 = _mm_add_ps(fiz1,tz);
2099 fjx1 = _mm_add_ps(fjx1,tx);
2100 fjy1 = _mm_add_ps(fjy1,ty);
2101 fjz1 = _mm_add_ps(fjz1,tz);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 if (gmx_mm_any_lt(rsq12,rcutoff2))
2112 /* REACTION-FIELD ELECTROSTATICS */
2113 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2115 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2119 fscal = _mm_and_ps(fscal,cutoff_mask);
2121 fscal = _mm_andnot_ps(dummy_mask,fscal);
2123 /* Calculate temporary vectorial force */
2124 tx = _mm_mul_ps(fscal,dx12);
2125 ty = _mm_mul_ps(fscal,dy12);
2126 tz = _mm_mul_ps(fscal,dz12);
2128 /* Update vectorial force */
2129 fix1 = _mm_add_ps(fix1,tx);
2130 fiy1 = _mm_add_ps(fiy1,ty);
2131 fiz1 = _mm_add_ps(fiz1,tz);
2133 fjx2 = _mm_add_ps(fjx2,tx);
2134 fjy2 = _mm_add_ps(fjy2,ty);
2135 fjz2 = _mm_add_ps(fjz2,tz);
2139 /**************************
2140 * CALCULATE INTERACTIONS *
2141 **************************/
2143 if (gmx_mm_any_lt(rsq20,rcutoff2))
2146 /* REACTION-FIELD ELECTROSTATICS */
2147 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2149 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2153 fscal = _mm_and_ps(fscal,cutoff_mask);
2155 fscal = _mm_andnot_ps(dummy_mask,fscal);
2157 /* Calculate temporary vectorial force */
2158 tx = _mm_mul_ps(fscal,dx20);
2159 ty = _mm_mul_ps(fscal,dy20);
2160 tz = _mm_mul_ps(fscal,dz20);
2162 /* Update vectorial force */
2163 fix2 = _mm_add_ps(fix2,tx);
2164 fiy2 = _mm_add_ps(fiy2,ty);
2165 fiz2 = _mm_add_ps(fiz2,tz);
2167 fjx0 = _mm_add_ps(fjx0,tx);
2168 fjy0 = _mm_add_ps(fjy0,ty);
2169 fjz0 = _mm_add_ps(fjz0,tz);
2173 /**************************
2174 * CALCULATE INTERACTIONS *
2175 **************************/
2177 if (gmx_mm_any_lt(rsq21,rcutoff2))
2180 /* REACTION-FIELD ELECTROSTATICS */
2181 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2183 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2187 fscal = _mm_and_ps(fscal,cutoff_mask);
2189 fscal = _mm_andnot_ps(dummy_mask,fscal);
2191 /* Calculate temporary vectorial force */
2192 tx = _mm_mul_ps(fscal,dx21);
2193 ty = _mm_mul_ps(fscal,dy21);
2194 tz = _mm_mul_ps(fscal,dz21);
2196 /* Update vectorial force */
2197 fix2 = _mm_add_ps(fix2,tx);
2198 fiy2 = _mm_add_ps(fiy2,ty);
2199 fiz2 = _mm_add_ps(fiz2,tz);
2201 fjx1 = _mm_add_ps(fjx1,tx);
2202 fjy1 = _mm_add_ps(fjy1,ty);
2203 fjz1 = _mm_add_ps(fjz1,tz);
2207 /**************************
2208 * CALCULATE INTERACTIONS *
2209 **************************/
2211 if (gmx_mm_any_lt(rsq22,rcutoff2))
2214 /* REACTION-FIELD ELECTROSTATICS */
2215 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2217 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2221 fscal = _mm_and_ps(fscal,cutoff_mask);
2223 fscal = _mm_andnot_ps(dummy_mask,fscal);
2225 /* Calculate temporary vectorial force */
2226 tx = _mm_mul_ps(fscal,dx22);
2227 ty = _mm_mul_ps(fscal,dy22);
2228 tz = _mm_mul_ps(fscal,dz22);
2230 /* Update vectorial force */
2231 fix2 = _mm_add_ps(fix2,tx);
2232 fiy2 = _mm_add_ps(fiy2,ty);
2233 fiz2 = _mm_add_ps(fiz2,tz);
2235 fjx2 = _mm_add_ps(fjx2,tx);
2236 fjy2 = _mm_add_ps(fjy2,ty);
2237 fjz2 = _mm_add_ps(fjz2,tz);
2241 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2242 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2243 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2244 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2246 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2247 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2249 /* Inner loop uses 298 flops */
2252 /* End of innermost loop */
2254 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2255 f+i_coord_offset,fshift+i_shift_offset);
2257 /* Increment number of inner iterations */
2258 inneriter += j_index_end - j_index_start;
2260 /* Outer loop uses 18 flops */
2263 /* Increment number of outer iterations */
2266 /* Update outer/inner flops */
2268 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);