2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
112 __m128 dummy_mask,cutoff_mask;
113 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114 __m128 one = _mm_set1_ps(1.0);
115 __m128 two = _mm_set1_ps(2.0);
121 jindex = nlist->jindex;
123 shiftidx = nlist->shift;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm_set1_ps(fr->epsfac);
128 charge = mdatoms->chargeA;
129 krf = _mm_set1_ps(fr->ic->k_rf);
130 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
131 crf = _mm_set1_ps(fr->ic->c_rf);
132 nvdwtype = fr->ntype;
134 vdwtype = mdatoms->typeA;
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
138 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
139 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
140 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
141 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
143 jq0 = _mm_set1_ps(charge[inr+0]);
144 jq1 = _mm_set1_ps(charge[inr+1]);
145 jq2 = _mm_set1_ps(charge[inr+2]);
146 vdwjidx0A = 2*vdwtype[inr+0];
147 qq00 = _mm_mul_ps(iq0,jq0);
148 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
149 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
150 qq01 = _mm_mul_ps(iq0,jq1);
151 qq02 = _mm_mul_ps(iq0,jq2);
152 qq10 = _mm_mul_ps(iq1,jq0);
153 qq11 = _mm_mul_ps(iq1,jq1);
154 qq12 = _mm_mul_ps(iq1,jq2);
155 qq20 = _mm_mul_ps(iq2,jq0);
156 qq21 = _mm_mul_ps(iq2,jq1);
157 qq22 = _mm_mul_ps(iq2,jq2);
159 /* Avoid stupid compiler warnings */
160 jnrA = jnrB = jnrC = jnrD = 0;
169 for(iidx=0;iidx<4*DIM;iidx++)
174 /* Start outer loop over neighborlists */
175 for(iidx=0; iidx<nri; iidx++)
177 /* Load shift vector for this list */
178 i_shift_offset = DIM*shiftidx[iidx];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
190 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
192 fix0 = _mm_setzero_ps();
193 fiy0 = _mm_setzero_ps();
194 fiz0 = _mm_setzero_ps();
195 fix1 = _mm_setzero_ps();
196 fiy1 = _mm_setzero_ps();
197 fiz1 = _mm_setzero_ps();
198 fix2 = _mm_setzero_ps();
199 fiy2 = _mm_setzero_ps();
200 fiz2 = _mm_setzero_ps();
202 /* Reset potential sums */
203 velecsum = _mm_setzero_ps();
204 vvdwsum = _mm_setzero_ps();
206 /* Start inner kernel loop */
207 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
210 /* Get j neighbor index, and coordinate index */
215 j_coord_offsetA = DIM*jnrA;
216 j_coord_offsetB = DIM*jnrB;
217 j_coord_offsetC = DIM*jnrC;
218 j_coord_offsetD = DIM*jnrD;
220 /* load j atom coordinates */
221 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
222 x+j_coord_offsetC,x+j_coord_offsetD,
223 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
225 /* Calculate displacement vector */
226 dx00 = _mm_sub_ps(ix0,jx0);
227 dy00 = _mm_sub_ps(iy0,jy0);
228 dz00 = _mm_sub_ps(iz0,jz0);
229 dx01 = _mm_sub_ps(ix0,jx1);
230 dy01 = _mm_sub_ps(iy0,jy1);
231 dz01 = _mm_sub_ps(iz0,jz1);
232 dx02 = _mm_sub_ps(ix0,jx2);
233 dy02 = _mm_sub_ps(iy0,jy2);
234 dz02 = _mm_sub_ps(iz0,jz2);
235 dx10 = _mm_sub_ps(ix1,jx0);
236 dy10 = _mm_sub_ps(iy1,jy0);
237 dz10 = _mm_sub_ps(iz1,jz0);
238 dx11 = _mm_sub_ps(ix1,jx1);
239 dy11 = _mm_sub_ps(iy1,jy1);
240 dz11 = _mm_sub_ps(iz1,jz1);
241 dx12 = _mm_sub_ps(ix1,jx2);
242 dy12 = _mm_sub_ps(iy1,jy2);
243 dz12 = _mm_sub_ps(iz1,jz2);
244 dx20 = _mm_sub_ps(ix2,jx0);
245 dy20 = _mm_sub_ps(iy2,jy0);
246 dz20 = _mm_sub_ps(iz2,jz0);
247 dx21 = _mm_sub_ps(ix2,jx1);
248 dy21 = _mm_sub_ps(iy2,jy1);
249 dz21 = _mm_sub_ps(iz2,jz1);
250 dx22 = _mm_sub_ps(ix2,jx2);
251 dy22 = _mm_sub_ps(iy2,jy2);
252 dz22 = _mm_sub_ps(iz2,jz2);
254 /* Calculate squared distance and things based on it */
255 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
256 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
257 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
258 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
259 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
260 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
261 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
262 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
263 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
265 rinv00 = gmx_mm_invsqrt_ps(rsq00);
266 rinv01 = gmx_mm_invsqrt_ps(rsq01);
267 rinv02 = gmx_mm_invsqrt_ps(rsq02);
268 rinv10 = gmx_mm_invsqrt_ps(rsq10);
269 rinv11 = gmx_mm_invsqrt_ps(rsq11);
270 rinv12 = gmx_mm_invsqrt_ps(rsq12);
271 rinv20 = gmx_mm_invsqrt_ps(rsq20);
272 rinv21 = gmx_mm_invsqrt_ps(rsq21);
273 rinv22 = gmx_mm_invsqrt_ps(rsq22);
275 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
276 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
277 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
278 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
279 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
280 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
281 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
282 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
283 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
285 fjx0 = _mm_setzero_ps();
286 fjy0 = _mm_setzero_ps();
287 fjz0 = _mm_setzero_ps();
288 fjx1 = _mm_setzero_ps();
289 fjy1 = _mm_setzero_ps();
290 fjz1 = _mm_setzero_ps();
291 fjx2 = _mm_setzero_ps();
292 fjy2 = _mm_setzero_ps();
293 fjz2 = _mm_setzero_ps();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 /* REACTION-FIELD ELECTROSTATICS */
300 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
301 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
303 /* LENNARD-JONES DISPERSION/REPULSION */
305 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
306 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
307 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
308 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
309 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
311 /* Update potential sum for this i atom from the interaction with this j atom. */
312 velecsum = _mm_add_ps(velecsum,velec);
313 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
315 fscal = _mm_add_ps(felec,fvdw);
317 /* Calculate temporary vectorial force */
318 tx = _mm_mul_ps(fscal,dx00);
319 ty = _mm_mul_ps(fscal,dy00);
320 tz = _mm_mul_ps(fscal,dz00);
322 /* Update vectorial force */
323 fix0 = _mm_add_ps(fix0,tx);
324 fiy0 = _mm_add_ps(fiy0,ty);
325 fiz0 = _mm_add_ps(fiz0,tz);
327 fjx0 = _mm_add_ps(fjx0,tx);
328 fjy0 = _mm_add_ps(fjy0,ty);
329 fjz0 = _mm_add_ps(fjz0,tz);
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 /* REACTION-FIELD ELECTROSTATICS */
336 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
337 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
339 /* Update potential sum for this i atom from the interaction with this j atom. */
340 velecsum = _mm_add_ps(velecsum,velec);
344 /* Calculate temporary vectorial force */
345 tx = _mm_mul_ps(fscal,dx01);
346 ty = _mm_mul_ps(fscal,dy01);
347 tz = _mm_mul_ps(fscal,dz01);
349 /* Update vectorial force */
350 fix0 = _mm_add_ps(fix0,tx);
351 fiy0 = _mm_add_ps(fiy0,ty);
352 fiz0 = _mm_add_ps(fiz0,tz);
354 fjx1 = _mm_add_ps(fjx1,tx);
355 fjy1 = _mm_add_ps(fjy1,ty);
356 fjz1 = _mm_add_ps(fjz1,tz);
358 /**************************
359 * CALCULATE INTERACTIONS *
360 **************************/
362 /* REACTION-FIELD ELECTROSTATICS */
363 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
364 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 velecsum = _mm_add_ps(velecsum,velec);
371 /* Calculate temporary vectorial force */
372 tx = _mm_mul_ps(fscal,dx02);
373 ty = _mm_mul_ps(fscal,dy02);
374 tz = _mm_mul_ps(fscal,dz02);
376 /* Update vectorial force */
377 fix0 = _mm_add_ps(fix0,tx);
378 fiy0 = _mm_add_ps(fiy0,ty);
379 fiz0 = _mm_add_ps(fiz0,tz);
381 fjx2 = _mm_add_ps(fjx2,tx);
382 fjy2 = _mm_add_ps(fjy2,ty);
383 fjz2 = _mm_add_ps(fjz2,tz);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 /* REACTION-FIELD ELECTROSTATICS */
390 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
391 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum = _mm_add_ps(velecsum,velec);
398 /* Calculate temporary vectorial force */
399 tx = _mm_mul_ps(fscal,dx10);
400 ty = _mm_mul_ps(fscal,dy10);
401 tz = _mm_mul_ps(fscal,dz10);
403 /* Update vectorial force */
404 fix1 = _mm_add_ps(fix1,tx);
405 fiy1 = _mm_add_ps(fiy1,ty);
406 fiz1 = _mm_add_ps(fiz1,tz);
408 fjx0 = _mm_add_ps(fjx0,tx);
409 fjy0 = _mm_add_ps(fjy0,ty);
410 fjz0 = _mm_add_ps(fjz0,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 /* REACTION-FIELD ELECTROSTATICS */
417 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
418 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velecsum = _mm_add_ps(velecsum,velec);
425 /* Calculate temporary vectorial force */
426 tx = _mm_mul_ps(fscal,dx11);
427 ty = _mm_mul_ps(fscal,dy11);
428 tz = _mm_mul_ps(fscal,dz11);
430 /* Update vectorial force */
431 fix1 = _mm_add_ps(fix1,tx);
432 fiy1 = _mm_add_ps(fiy1,ty);
433 fiz1 = _mm_add_ps(fiz1,tz);
435 fjx1 = _mm_add_ps(fjx1,tx);
436 fjy1 = _mm_add_ps(fjy1,ty);
437 fjz1 = _mm_add_ps(fjz1,tz);
439 /**************************
440 * CALCULATE INTERACTIONS *
441 **************************/
443 /* REACTION-FIELD ELECTROSTATICS */
444 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
445 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velecsum = _mm_add_ps(velecsum,velec);
452 /* Calculate temporary vectorial force */
453 tx = _mm_mul_ps(fscal,dx12);
454 ty = _mm_mul_ps(fscal,dy12);
455 tz = _mm_mul_ps(fscal,dz12);
457 /* Update vectorial force */
458 fix1 = _mm_add_ps(fix1,tx);
459 fiy1 = _mm_add_ps(fiy1,ty);
460 fiz1 = _mm_add_ps(fiz1,tz);
462 fjx2 = _mm_add_ps(fjx2,tx);
463 fjy2 = _mm_add_ps(fjy2,ty);
464 fjz2 = _mm_add_ps(fjz2,tz);
466 /**************************
467 * CALCULATE INTERACTIONS *
468 **************************/
470 /* REACTION-FIELD ELECTROSTATICS */
471 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
472 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velecsum = _mm_add_ps(velecsum,velec);
479 /* Calculate temporary vectorial force */
480 tx = _mm_mul_ps(fscal,dx20);
481 ty = _mm_mul_ps(fscal,dy20);
482 tz = _mm_mul_ps(fscal,dz20);
484 /* Update vectorial force */
485 fix2 = _mm_add_ps(fix2,tx);
486 fiy2 = _mm_add_ps(fiy2,ty);
487 fiz2 = _mm_add_ps(fiz2,tz);
489 fjx0 = _mm_add_ps(fjx0,tx);
490 fjy0 = _mm_add_ps(fjy0,ty);
491 fjz0 = _mm_add_ps(fjz0,tz);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 /* REACTION-FIELD ELECTROSTATICS */
498 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
499 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velecsum = _mm_add_ps(velecsum,velec);
506 /* Calculate temporary vectorial force */
507 tx = _mm_mul_ps(fscal,dx21);
508 ty = _mm_mul_ps(fscal,dy21);
509 tz = _mm_mul_ps(fscal,dz21);
511 /* Update vectorial force */
512 fix2 = _mm_add_ps(fix2,tx);
513 fiy2 = _mm_add_ps(fiy2,ty);
514 fiz2 = _mm_add_ps(fiz2,tz);
516 fjx1 = _mm_add_ps(fjx1,tx);
517 fjy1 = _mm_add_ps(fjy1,ty);
518 fjz1 = _mm_add_ps(fjz1,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 /* REACTION-FIELD ELECTROSTATICS */
525 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
526 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velecsum = _mm_add_ps(velecsum,velec);
533 /* Calculate temporary vectorial force */
534 tx = _mm_mul_ps(fscal,dx22);
535 ty = _mm_mul_ps(fscal,dy22);
536 tz = _mm_mul_ps(fscal,dz22);
538 /* Update vectorial force */
539 fix2 = _mm_add_ps(fix2,tx);
540 fiy2 = _mm_add_ps(fiy2,ty);
541 fiz2 = _mm_add_ps(fiz2,tz);
543 fjx2 = _mm_add_ps(fjx2,tx);
544 fjy2 = _mm_add_ps(fjy2,ty);
545 fjz2 = _mm_add_ps(fjz2,tz);
547 fjptrA = f+j_coord_offsetA;
548 fjptrB = f+j_coord_offsetB;
549 fjptrC = f+j_coord_offsetC;
550 fjptrD = f+j_coord_offsetD;
552 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
553 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
555 /* Inner loop uses 300 flops */
561 /* Get j neighbor index, and coordinate index */
562 jnrlistA = jjnr[jidx];
563 jnrlistB = jjnr[jidx+1];
564 jnrlistC = jjnr[jidx+2];
565 jnrlistD = jjnr[jidx+3];
566 /* Sign of each element will be negative for non-real atoms.
567 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
568 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
570 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
571 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
572 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
573 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
574 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
575 j_coord_offsetA = DIM*jnrA;
576 j_coord_offsetB = DIM*jnrB;
577 j_coord_offsetC = DIM*jnrC;
578 j_coord_offsetD = DIM*jnrD;
580 /* load j atom coordinates */
581 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
582 x+j_coord_offsetC,x+j_coord_offsetD,
583 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
585 /* Calculate displacement vector */
586 dx00 = _mm_sub_ps(ix0,jx0);
587 dy00 = _mm_sub_ps(iy0,jy0);
588 dz00 = _mm_sub_ps(iz0,jz0);
589 dx01 = _mm_sub_ps(ix0,jx1);
590 dy01 = _mm_sub_ps(iy0,jy1);
591 dz01 = _mm_sub_ps(iz0,jz1);
592 dx02 = _mm_sub_ps(ix0,jx2);
593 dy02 = _mm_sub_ps(iy0,jy2);
594 dz02 = _mm_sub_ps(iz0,jz2);
595 dx10 = _mm_sub_ps(ix1,jx0);
596 dy10 = _mm_sub_ps(iy1,jy0);
597 dz10 = _mm_sub_ps(iz1,jz0);
598 dx11 = _mm_sub_ps(ix1,jx1);
599 dy11 = _mm_sub_ps(iy1,jy1);
600 dz11 = _mm_sub_ps(iz1,jz1);
601 dx12 = _mm_sub_ps(ix1,jx2);
602 dy12 = _mm_sub_ps(iy1,jy2);
603 dz12 = _mm_sub_ps(iz1,jz2);
604 dx20 = _mm_sub_ps(ix2,jx0);
605 dy20 = _mm_sub_ps(iy2,jy0);
606 dz20 = _mm_sub_ps(iz2,jz0);
607 dx21 = _mm_sub_ps(ix2,jx1);
608 dy21 = _mm_sub_ps(iy2,jy1);
609 dz21 = _mm_sub_ps(iz2,jz1);
610 dx22 = _mm_sub_ps(ix2,jx2);
611 dy22 = _mm_sub_ps(iy2,jy2);
612 dz22 = _mm_sub_ps(iz2,jz2);
614 /* Calculate squared distance and things based on it */
615 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
616 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
617 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
618 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
619 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
620 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
621 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
622 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
623 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
625 rinv00 = gmx_mm_invsqrt_ps(rsq00);
626 rinv01 = gmx_mm_invsqrt_ps(rsq01);
627 rinv02 = gmx_mm_invsqrt_ps(rsq02);
628 rinv10 = gmx_mm_invsqrt_ps(rsq10);
629 rinv11 = gmx_mm_invsqrt_ps(rsq11);
630 rinv12 = gmx_mm_invsqrt_ps(rsq12);
631 rinv20 = gmx_mm_invsqrt_ps(rsq20);
632 rinv21 = gmx_mm_invsqrt_ps(rsq21);
633 rinv22 = gmx_mm_invsqrt_ps(rsq22);
635 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
636 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
637 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
638 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
639 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
640 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
641 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
642 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
643 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
645 fjx0 = _mm_setzero_ps();
646 fjy0 = _mm_setzero_ps();
647 fjz0 = _mm_setzero_ps();
648 fjx1 = _mm_setzero_ps();
649 fjy1 = _mm_setzero_ps();
650 fjz1 = _mm_setzero_ps();
651 fjx2 = _mm_setzero_ps();
652 fjy2 = _mm_setzero_ps();
653 fjz2 = _mm_setzero_ps();
655 /**************************
656 * CALCULATE INTERACTIONS *
657 **************************/
659 /* REACTION-FIELD ELECTROSTATICS */
660 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
661 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
663 /* LENNARD-JONES DISPERSION/REPULSION */
665 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
666 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
667 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
668 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
669 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
671 /* Update potential sum for this i atom from the interaction with this j atom. */
672 velec = _mm_andnot_ps(dummy_mask,velec);
673 velecsum = _mm_add_ps(velecsum,velec);
674 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
675 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
677 fscal = _mm_add_ps(felec,fvdw);
679 fscal = _mm_andnot_ps(dummy_mask,fscal);
681 /* Calculate temporary vectorial force */
682 tx = _mm_mul_ps(fscal,dx00);
683 ty = _mm_mul_ps(fscal,dy00);
684 tz = _mm_mul_ps(fscal,dz00);
686 /* Update vectorial force */
687 fix0 = _mm_add_ps(fix0,tx);
688 fiy0 = _mm_add_ps(fiy0,ty);
689 fiz0 = _mm_add_ps(fiz0,tz);
691 fjx0 = _mm_add_ps(fjx0,tx);
692 fjy0 = _mm_add_ps(fjy0,ty);
693 fjz0 = _mm_add_ps(fjz0,tz);
695 /**************************
696 * CALCULATE INTERACTIONS *
697 **************************/
699 /* REACTION-FIELD ELECTROSTATICS */
700 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
701 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
703 /* Update potential sum for this i atom from the interaction with this j atom. */
704 velec = _mm_andnot_ps(dummy_mask,velec);
705 velecsum = _mm_add_ps(velecsum,velec);
709 fscal = _mm_andnot_ps(dummy_mask,fscal);
711 /* Calculate temporary vectorial force */
712 tx = _mm_mul_ps(fscal,dx01);
713 ty = _mm_mul_ps(fscal,dy01);
714 tz = _mm_mul_ps(fscal,dz01);
716 /* Update vectorial force */
717 fix0 = _mm_add_ps(fix0,tx);
718 fiy0 = _mm_add_ps(fiy0,ty);
719 fiz0 = _mm_add_ps(fiz0,tz);
721 fjx1 = _mm_add_ps(fjx1,tx);
722 fjy1 = _mm_add_ps(fjy1,ty);
723 fjz1 = _mm_add_ps(fjz1,tz);
725 /**************************
726 * CALCULATE INTERACTIONS *
727 **************************/
729 /* REACTION-FIELD ELECTROSTATICS */
730 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
731 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
733 /* Update potential sum for this i atom from the interaction with this j atom. */
734 velec = _mm_andnot_ps(dummy_mask,velec);
735 velecsum = _mm_add_ps(velecsum,velec);
739 fscal = _mm_andnot_ps(dummy_mask,fscal);
741 /* Calculate temporary vectorial force */
742 tx = _mm_mul_ps(fscal,dx02);
743 ty = _mm_mul_ps(fscal,dy02);
744 tz = _mm_mul_ps(fscal,dz02);
746 /* Update vectorial force */
747 fix0 = _mm_add_ps(fix0,tx);
748 fiy0 = _mm_add_ps(fiy0,ty);
749 fiz0 = _mm_add_ps(fiz0,tz);
751 fjx2 = _mm_add_ps(fjx2,tx);
752 fjy2 = _mm_add_ps(fjy2,ty);
753 fjz2 = _mm_add_ps(fjz2,tz);
755 /**************************
756 * CALCULATE INTERACTIONS *
757 **************************/
759 /* REACTION-FIELD ELECTROSTATICS */
760 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
761 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
763 /* Update potential sum for this i atom from the interaction with this j atom. */
764 velec = _mm_andnot_ps(dummy_mask,velec);
765 velecsum = _mm_add_ps(velecsum,velec);
769 fscal = _mm_andnot_ps(dummy_mask,fscal);
771 /* Calculate temporary vectorial force */
772 tx = _mm_mul_ps(fscal,dx10);
773 ty = _mm_mul_ps(fscal,dy10);
774 tz = _mm_mul_ps(fscal,dz10);
776 /* Update vectorial force */
777 fix1 = _mm_add_ps(fix1,tx);
778 fiy1 = _mm_add_ps(fiy1,ty);
779 fiz1 = _mm_add_ps(fiz1,tz);
781 fjx0 = _mm_add_ps(fjx0,tx);
782 fjy0 = _mm_add_ps(fjy0,ty);
783 fjz0 = _mm_add_ps(fjz0,tz);
785 /**************************
786 * CALCULATE INTERACTIONS *
787 **************************/
789 /* REACTION-FIELD ELECTROSTATICS */
790 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
791 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
793 /* Update potential sum for this i atom from the interaction with this j atom. */
794 velec = _mm_andnot_ps(dummy_mask,velec);
795 velecsum = _mm_add_ps(velecsum,velec);
799 fscal = _mm_andnot_ps(dummy_mask,fscal);
801 /* Calculate temporary vectorial force */
802 tx = _mm_mul_ps(fscal,dx11);
803 ty = _mm_mul_ps(fscal,dy11);
804 tz = _mm_mul_ps(fscal,dz11);
806 /* Update vectorial force */
807 fix1 = _mm_add_ps(fix1,tx);
808 fiy1 = _mm_add_ps(fiy1,ty);
809 fiz1 = _mm_add_ps(fiz1,tz);
811 fjx1 = _mm_add_ps(fjx1,tx);
812 fjy1 = _mm_add_ps(fjy1,ty);
813 fjz1 = _mm_add_ps(fjz1,tz);
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
819 /* REACTION-FIELD ELECTROSTATICS */
820 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
821 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
823 /* Update potential sum for this i atom from the interaction with this j atom. */
824 velec = _mm_andnot_ps(dummy_mask,velec);
825 velecsum = _mm_add_ps(velecsum,velec);
829 fscal = _mm_andnot_ps(dummy_mask,fscal);
831 /* Calculate temporary vectorial force */
832 tx = _mm_mul_ps(fscal,dx12);
833 ty = _mm_mul_ps(fscal,dy12);
834 tz = _mm_mul_ps(fscal,dz12);
836 /* Update vectorial force */
837 fix1 = _mm_add_ps(fix1,tx);
838 fiy1 = _mm_add_ps(fiy1,ty);
839 fiz1 = _mm_add_ps(fiz1,tz);
841 fjx2 = _mm_add_ps(fjx2,tx);
842 fjy2 = _mm_add_ps(fjy2,ty);
843 fjz2 = _mm_add_ps(fjz2,tz);
845 /**************************
846 * CALCULATE INTERACTIONS *
847 **************************/
849 /* REACTION-FIELD ELECTROSTATICS */
850 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
851 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
853 /* Update potential sum for this i atom from the interaction with this j atom. */
854 velec = _mm_andnot_ps(dummy_mask,velec);
855 velecsum = _mm_add_ps(velecsum,velec);
859 fscal = _mm_andnot_ps(dummy_mask,fscal);
861 /* Calculate temporary vectorial force */
862 tx = _mm_mul_ps(fscal,dx20);
863 ty = _mm_mul_ps(fscal,dy20);
864 tz = _mm_mul_ps(fscal,dz20);
866 /* Update vectorial force */
867 fix2 = _mm_add_ps(fix2,tx);
868 fiy2 = _mm_add_ps(fiy2,ty);
869 fiz2 = _mm_add_ps(fiz2,tz);
871 fjx0 = _mm_add_ps(fjx0,tx);
872 fjy0 = _mm_add_ps(fjy0,ty);
873 fjz0 = _mm_add_ps(fjz0,tz);
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
879 /* REACTION-FIELD ELECTROSTATICS */
880 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
881 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
883 /* Update potential sum for this i atom from the interaction with this j atom. */
884 velec = _mm_andnot_ps(dummy_mask,velec);
885 velecsum = _mm_add_ps(velecsum,velec);
889 fscal = _mm_andnot_ps(dummy_mask,fscal);
891 /* Calculate temporary vectorial force */
892 tx = _mm_mul_ps(fscal,dx21);
893 ty = _mm_mul_ps(fscal,dy21);
894 tz = _mm_mul_ps(fscal,dz21);
896 /* Update vectorial force */
897 fix2 = _mm_add_ps(fix2,tx);
898 fiy2 = _mm_add_ps(fiy2,ty);
899 fiz2 = _mm_add_ps(fiz2,tz);
901 fjx1 = _mm_add_ps(fjx1,tx);
902 fjy1 = _mm_add_ps(fjy1,ty);
903 fjz1 = _mm_add_ps(fjz1,tz);
905 /**************************
906 * CALCULATE INTERACTIONS *
907 **************************/
909 /* REACTION-FIELD ELECTROSTATICS */
910 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
911 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 velec = _mm_andnot_ps(dummy_mask,velec);
915 velecsum = _mm_add_ps(velecsum,velec);
919 fscal = _mm_andnot_ps(dummy_mask,fscal);
921 /* Calculate temporary vectorial force */
922 tx = _mm_mul_ps(fscal,dx22);
923 ty = _mm_mul_ps(fscal,dy22);
924 tz = _mm_mul_ps(fscal,dz22);
926 /* Update vectorial force */
927 fix2 = _mm_add_ps(fix2,tx);
928 fiy2 = _mm_add_ps(fiy2,ty);
929 fiz2 = _mm_add_ps(fiz2,tz);
931 fjx2 = _mm_add_ps(fjx2,tx);
932 fjy2 = _mm_add_ps(fjy2,ty);
933 fjz2 = _mm_add_ps(fjz2,tz);
935 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
936 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
937 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
938 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
940 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
941 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
943 /* Inner loop uses 300 flops */
946 /* End of innermost loop */
948 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
949 f+i_coord_offset,fshift+i_shift_offset);
952 /* Update potential energies */
953 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
954 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
956 /* Increment number of inner iterations */
957 inneriter += j_index_end - j_index_start;
959 /* Outer loop uses 20 flops */
962 /* Increment number of outer iterations */
965 /* Update outer/inner flops */
967 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300);
970 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single
971 * Electrostatics interaction: ReactionField
972 * VdW interaction: LennardJones
973 * Geometry: Water3-Water3
974 * Calculate force/pot: Force
/* Force-only nonbonded kernel for pairs of three-site waters, written with 4-wide single-precision SSE intrinsics. */
/* Electrostatics use a reaction-field force expression for all nine site pairs; Lennard-Jones is evaluated for the 0-0 site pair only. */
/* nlist supplies the i/j index lists and shift indices; fr supplies shift vectors, epsfac, reaction-field constants and the LJ table; */
/* mdatoms supplies per-atom charges and types. xx and ff are presumably the coordinate and force arrays behind x and f - confirm. */
/* kernel_data is marked gmx_unused and is not referenced: no energy-group potentials are accumulated here. */
/* Side effects visible here: j forces and i forces/shift forces are updated through helper calls, and nrnb receives a flop estimate. */
977 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single
978 (t_nblist * gmx_restrict nlist,
979 rvec * gmx_restrict xx,
980 rvec * gmx_restrict ff,
981 t_forcerec * gmx_restrict fr,
982 t_mdatoms * gmx_restrict mdatoms,
983 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
984 t_nrnb * gmx_restrict nrnb)
986 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
987 * just 0 for non-waters.
988 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
989 * jnr indices corresponding to data put in the four positions in the SIMD register.
991 int i_shift_offset,i_coord_offset,outeriter,inneriter;
992 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
993 int jnrA,jnrB,jnrC,jnrD;
994 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
995 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
996 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
998 real *shiftvec,*fshift,*x,*f;
999 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1000 real scratch[4*DIM];
1001 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1003 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1005 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1007 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1008 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1009 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1010 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1011 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1012 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1013 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1014 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1015 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1016 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1017 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1018 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1019 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1020 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1021 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1022 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1023 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1026 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1029 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1030 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1031 __m128 dummy_mask,cutoff_mask;
1032 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1033 __m128 one = _mm_set1_ps(1.0);
1034 __m128 two = _mm_set1_ps(2.0);
1040 jindex = nlist->jindex;
1042 shiftidx = nlist->shift;
1044 shiftvec = fr->shift_vec[0];
1045 fshift = fr->fshift[0];
1046 facel = _mm_set1_ps(fr->epsfac);
1047 charge = mdatoms->chargeA;
1048 krf = _mm_set1_ps(fr->ic->k_rf);
1049 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1050 crf = _mm_set1_ps(fr->ic->c_rf);
1051 nvdwtype = fr->ntype;
1052 vdwparam = fr->nbfp;
1053 vdwtype = mdatoms->typeA;
1055 /* Setup water-specific parameters */
/* Charges and the LJ type are read once, from the first i entry of the list, and are not reloaded inside the loops below. */
/* The same three charges are used for both the i and the j water, so every water in the list is treated as having identical parameters. */
1056 inr = nlist->iinr[0];
/* i-site charges are pre-multiplied by facel (broadcast of fr->epsfac). */
1057 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1058 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1059 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1060 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1062 jq0 = _mm_set1_ps(charge[inr+0]);
1063 jq1 = _mm_set1_ps(charge[inr+1]);
1064 jq2 = _mm_set1_ps(charge[inr+2]);
1065 vdwjidx0A = 2*vdwtype[inr+0];
/* Precompute the nine site-site charge products; c6_00 and c12_00 are two consecutive entries of vdwparam for the 0-0 pair. */
1066 qq00 = _mm_mul_ps(iq0,jq0);
1067 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1068 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1069 qq01 = _mm_mul_ps(iq0,jq1);
1070 qq02 = _mm_mul_ps(iq0,jq2);
1071 qq10 = _mm_mul_ps(iq1,jq0);
1072 qq11 = _mm_mul_ps(iq1,jq1);
1073 qq12 = _mm_mul_ps(iq1,jq2);
1074 qq20 = _mm_mul_ps(iq2,jq0);
1075 qq21 = _mm_mul_ps(iq2,jq1);
1076 qq22 = _mm_mul_ps(iq2,jq2);
1078 /* Give the j indices and coordinate offsets defined values to avoid compiler warnings */
1079 jnrA = jnrB = jnrC = jnrD = 0;
1080 j_coord_offsetA = 0;
1081 j_coord_offsetB = 0;
1082 j_coord_offsetC = 0;
1083 j_coord_offsetD = 0;
/* Zero the scratch buffer; in the epilogue it is the force target for lanes whose j index is negative. */
1088 for(iidx=0;iidx<4*DIM;iidx++)
1090 scratch[iidx] = 0.0;
1093 /* Start outer loop over neighborlists */
1094 for(iidx=0; iidx<nri; iidx++)
1096 /* Load shift vector for this list */
1097 i_shift_offset = DIM*shiftidx[iidx];
1099 /* Load limits for loop over neighbors */
1100 j_index_start = jindex[iidx];
1101 j_index_end = jindex[iidx+1];
1103 /* Get outer coordinate index */
1105 i_coord_offset = DIM*inr;
1107 /* Load i particle coords and add shift vector */
1108 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1109 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Reset the force accumulators of the three i sites for this list entry. */
1111 fix0 = _mm_setzero_ps();
1112 fiy0 = _mm_setzero_ps();
1113 fiz0 = _mm_setzero_ps();
1114 fix1 = _mm_setzero_ps();
1115 fiy1 = _mm_setzero_ps();
1116 fiz1 = _mm_setzero_ps();
1117 fix2 = _mm_setzero_ps();
1118 fiy2 = _mm_setzero_ps();
1119 fiz2 = _mm_setzero_ps();
1121 /* Start inner kernel loop */
/* Handles four j entries per iteration and runs only while the fourth entry of the group is non-negative; */
/* a trailing group that contains negative entries is left to the masked epilogue after this loop. */
1122 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1125 /* Get j neighbor index, and coordinate index */
1127 jnrB = jjnr[jidx+1];
1128 jnrC = jjnr[jidx+2];
1129 jnrD = jjnr[jidx+3];
1130 j_coord_offsetA = DIM*jnrA;
1131 j_coord_offsetB = DIM*jnrB;
1132 j_coord_offsetC = DIM*jnrC;
1133 j_coord_offsetD = DIM*jnrD;
1135 /* load j atom coordinates */
1136 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1137 x+j_coord_offsetC,x+j_coord_offsetD,
1138 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1140 /* Calculate displacement vector */
1141 dx00 = _mm_sub_ps(ix0,jx0);
1142 dy00 = _mm_sub_ps(iy0,jy0);
1143 dz00 = _mm_sub_ps(iz0,jz0);
1144 dx01 = _mm_sub_ps(ix0,jx1);
1145 dy01 = _mm_sub_ps(iy0,jy1);
1146 dz01 = _mm_sub_ps(iz0,jz1);
1147 dx02 = _mm_sub_ps(ix0,jx2);
1148 dy02 = _mm_sub_ps(iy0,jy2);
1149 dz02 = _mm_sub_ps(iz0,jz2);
1150 dx10 = _mm_sub_ps(ix1,jx0);
1151 dy10 = _mm_sub_ps(iy1,jy0);
1152 dz10 = _mm_sub_ps(iz1,jz0);
1153 dx11 = _mm_sub_ps(ix1,jx1);
1154 dy11 = _mm_sub_ps(iy1,jy1);
1155 dz11 = _mm_sub_ps(iz1,jz1);
1156 dx12 = _mm_sub_ps(ix1,jx2);
1157 dy12 = _mm_sub_ps(iy1,jy2);
1158 dz12 = _mm_sub_ps(iz1,jz2);
1159 dx20 = _mm_sub_ps(ix2,jx0);
1160 dy20 = _mm_sub_ps(iy2,jy0);
1161 dz20 = _mm_sub_ps(iz2,jz0);
1162 dx21 = _mm_sub_ps(ix2,jx1);
1163 dy21 = _mm_sub_ps(iy2,jy1);
1164 dz21 = _mm_sub_ps(iz2,jz1);
1165 dx22 = _mm_sub_ps(ix2,jx2);
1166 dy22 = _mm_sub_ps(iy2,jy2);
1167 dz22 = _mm_sub_ps(iz2,jz2);
1169 /* Calculate squared distance and things based on it */
1170 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1171 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1172 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1173 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1174 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1175 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1176 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1177 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1178 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1180 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1181 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1182 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1183 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1184 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1185 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1186 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1187 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1188 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1190 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1191 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1192 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1193 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1194 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1195 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1196 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1197 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1198 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* Clear the j-site force accumulators for this group of four j waters. */
1200 fjx0 = _mm_setzero_ps();
1201 fjy0 = _mm_setzero_ps();
1202 fjz0 = _mm_setzero_ps();
1203 fjx1 = _mm_setzero_ps();
1204 fjy1 = _mm_setzero_ps();
1205 fjz1 = _mm_setzero_ps();
1206 fjx2 = _mm_setzero_ps();
1207 fjy2 = _mm_setzero_ps();
1208 fjz2 = _mm_setzero_ps();
1210 /**************************
1211 * CALCULATE INTERACTIONS *
1212 **************************/
/* Site pair i0-j0: the only pair for which a Lennard-Jones term is evaluated in addition to reaction-field. */
1214 /* REACTION-FIELD ELECTROSTATICS */
/* felec = qq00 * (rinv00*rinvsq00 - krf2), where krf2 is the broadcast of 2*k_rf. */
1215 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1217 /* LENNARD-JONES DISPERSION/REPULSION */
/* rinvsix = rinvsq00^3; fvdw = (c12_00*rinvsix - c6_00) * rinvsix * rinvsq00. */
1219 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1220 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1222 fscal = _mm_add_ps(felec,fvdw);
1224 /* Calculate temporary vectorial force */
1225 tx = _mm_mul_ps(fscal,dx00);
1226 ty = _mm_mul_ps(fscal,dy00);
1227 tz = _mm_mul_ps(fscal,dz00);
1229 /* Update vectorial force */
1230 fix0 = _mm_add_ps(fix0,tx);
1231 fiy0 = _mm_add_ps(fiy0,ty);
1232 fiz0 = _mm_add_ps(fiz0,tz);
/* The same vector is added to the j accumulator; the opposite sign is presumably applied by the decrement helper at the end of the iteration - confirm. */
1234 fjx0 = _mm_add_ps(fjx0,tx);
1235 fjy0 = _mm_add_ps(fjy0,ty);
1236 fjz0 = _mm_add_ps(fjz0,tz);
1238 /**************************
1239 * CALCULATE INTERACTIONS *
1240 **************************/
/* Site pair i0-j1: only a reaction-field term is computed for this pair. */
1242 /* REACTION-FIELD ELECTROSTATICS */
1243 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1247 /* Calculate temporary vectorial force */
1248 tx = _mm_mul_ps(fscal,dx01);
1249 ty = _mm_mul_ps(fscal,dy01);
1250 tz = _mm_mul_ps(fscal,dz01);
1252 /* Update vectorial force */
1253 fix0 = _mm_add_ps(fix0,tx);
1254 fiy0 = _mm_add_ps(fiy0,ty);
1255 fiz0 = _mm_add_ps(fiz0,tz);
1257 fjx1 = _mm_add_ps(fjx1,tx);
1258 fjy1 = _mm_add_ps(fjy1,ty);
1259 fjz1 = _mm_add_ps(fjz1,tz);
1261 /**************************
1262 * CALCULATE INTERACTIONS *
1263 **************************/
/* Site pair i0-j2: only a reaction-field term is computed for this pair. */
1265 /* REACTION-FIELD ELECTROSTATICS */
1266 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1270 /* Calculate temporary vectorial force */
1271 tx = _mm_mul_ps(fscal,dx02);
1272 ty = _mm_mul_ps(fscal,dy02);
1273 tz = _mm_mul_ps(fscal,dz02);
1275 /* Update vectorial force */
1276 fix0 = _mm_add_ps(fix0,tx);
1277 fiy0 = _mm_add_ps(fiy0,ty);
1278 fiz0 = _mm_add_ps(fiz0,tz);
1280 fjx2 = _mm_add_ps(fjx2,tx);
1281 fjy2 = _mm_add_ps(fjy2,ty);
1282 fjz2 = _mm_add_ps(fjz2,tz);
1284 /**************************
1285 * CALCULATE INTERACTIONS *
1286 **************************/
/* Site pair i1-j0: only a reaction-field term is computed for this pair. */
1288 /* REACTION-FIELD ELECTROSTATICS */
1289 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1293 /* Calculate temporary vectorial force */
1294 tx = _mm_mul_ps(fscal,dx10);
1295 ty = _mm_mul_ps(fscal,dy10);
1296 tz = _mm_mul_ps(fscal,dz10);
1298 /* Update vectorial force */
1299 fix1 = _mm_add_ps(fix1,tx);
1300 fiy1 = _mm_add_ps(fiy1,ty);
1301 fiz1 = _mm_add_ps(fiz1,tz);
1303 fjx0 = _mm_add_ps(fjx0,tx);
1304 fjy0 = _mm_add_ps(fjy0,ty);
1305 fjz0 = _mm_add_ps(fjz0,tz);
1307 /**************************
1308 * CALCULATE INTERACTIONS *
1309 **************************/
/* Site pair i1-j1: only a reaction-field term is computed for this pair. */
1311 /* REACTION-FIELD ELECTROSTATICS */
1312 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1316 /* Calculate temporary vectorial force */
1317 tx = _mm_mul_ps(fscal,dx11);
1318 ty = _mm_mul_ps(fscal,dy11);
1319 tz = _mm_mul_ps(fscal,dz11);
1321 /* Update vectorial force */
1322 fix1 = _mm_add_ps(fix1,tx);
1323 fiy1 = _mm_add_ps(fiy1,ty);
1324 fiz1 = _mm_add_ps(fiz1,tz);
1326 fjx1 = _mm_add_ps(fjx1,tx);
1327 fjy1 = _mm_add_ps(fjy1,ty);
1328 fjz1 = _mm_add_ps(fjz1,tz);
1330 /**************************
1331 * CALCULATE INTERACTIONS *
1332 **************************/
/* Site pair i1-j2: only a reaction-field term is computed for this pair. */
1334 /* REACTION-FIELD ELECTROSTATICS */
1335 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1339 /* Calculate temporary vectorial force */
1340 tx = _mm_mul_ps(fscal,dx12);
1341 ty = _mm_mul_ps(fscal,dy12);
1342 tz = _mm_mul_ps(fscal,dz12);
1344 /* Update vectorial force */
1345 fix1 = _mm_add_ps(fix1,tx);
1346 fiy1 = _mm_add_ps(fiy1,ty);
1347 fiz1 = _mm_add_ps(fiz1,tz);
1349 fjx2 = _mm_add_ps(fjx2,tx);
1350 fjy2 = _mm_add_ps(fjy2,ty);
1351 fjz2 = _mm_add_ps(fjz2,tz);
1353 /**************************
1354 * CALCULATE INTERACTIONS *
1355 **************************/
/* Site pair i2-j0: only a reaction-field term is computed for this pair. */
1357 /* REACTION-FIELD ELECTROSTATICS */
1358 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1362 /* Calculate temporary vectorial force */
1363 tx = _mm_mul_ps(fscal,dx20);
1364 ty = _mm_mul_ps(fscal,dy20);
1365 tz = _mm_mul_ps(fscal,dz20);
1367 /* Update vectorial force */
1368 fix2 = _mm_add_ps(fix2,tx);
1369 fiy2 = _mm_add_ps(fiy2,ty);
1370 fiz2 = _mm_add_ps(fiz2,tz);
1372 fjx0 = _mm_add_ps(fjx0,tx);
1373 fjy0 = _mm_add_ps(fjy0,ty);
1374 fjz0 = _mm_add_ps(fjz0,tz);
1376 /**************************
1377 * CALCULATE INTERACTIONS *
1378 **************************/
/* Site pair i2-j1: only a reaction-field term is computed for this pair. */
1380 /* REACTION-FIELD ELECTROSTATICS */
1381 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1385 /* Calculate temporary vectorial force */
1386 tx = _mm_mul_ps(fscal,dx21);
1387 ty = _mm_mul_ps(fscal,dy21);
1388 tz = _mm_mul_ps(fscal,dz21);
1390 /* Update vectorial force */
1391 fix2 = _mm_add_ps(fix2,tx);
1392 fiy2 = _mm_add_ps(fiy2,ty);
1393 fiz2 = _mm_add_ps(fiz2,tz);
1395 fjx1 = _mm_add_ps(fjx1,tx);
1396 fjy1 = _mm_add_ps(fjy1,ty);
1397 fjz1 = _mm_add_ps(fjz1,tz);
1399 /**************************
1400 * CALCULATE INTERACTIONS *
1401 **************************/
/* Site pair i2-j2: only a reaction-field term is computed for this pair. */
1403 /* REACTION-FIELD ELECTROSTATICS */
1404 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1408 /* Calculate temporary vectorial force */
1409 tx = _mm_mul_ps(fscal,dx22);
1410 ty = _mm_mul_ps(fscal,dy22);
1411 tz = _mm_mul_ps(fscal,dz22);
1413 /* Update vectorial force */
1414 fix2 = _mm_add_ps(fix2,tx);
1415 fiy2 = _mm_add_ps(fiy2,ty);
1416 fiz2 = _mm_add_ps(fiz2,tz);
1418 fjx2 = _mm_add_ps(fjx2,tx);
1419 fjy2 = _mm_add_ps(fjy2,ty);
1420 fjz2 = _mm_add_ps(fjz2,tz);
/* All four lanes are real entries in this loop, so every j force pointer targets the force array directly. */
1422 fjptrA = f+j_coord_offsetA;
1423 fjptrB = f+j_coord_offsetB;
1424 fjptrC = f+j_coord_offsetC;
1425 fjptrD = f+j_coord_offsetD;
/* Hand the accumulated j-site vectors to the helper; judging by its name it subtracts them from the four targets - confirm. */
1427 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1428 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1430 /* Inner loop uses 250 flops */
/* Epilogue: entered when the unrolled loop stopped before j_index_end, i.e. the remaining group has a negative index in its fourth slot. */
/* NOTE(review): jjnr[jidx..jidx+3] are all read here, so the list is assumed to be padded to a multiple of four entries - confirm. */
1433 if(jidx<j_index_end)
1436 /* Get j neighbor index, and coordinate index */
1437 jnrlistA = jjnr[jidx];
1438 jnrlistB = jjnr[jidx+1];
1439 jnrlistC = jjnr[jidx+2];
1440 jnrlistD = jjnr[jidx+3];
1441 /* Sign of each element will be negative for non-real atoms.
1442 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1443 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1445 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1446 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1447 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1448 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1449 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1450 j_coord_offsetA = DIM*jnrA;
1451 j_coord_offsetB = DIM*jnrB;
1452 j_coord_offsetC = DIM*jnrC;
1453 j_coord_offsetD = DIM*jnrD;
1455 /* load j atom coordinates */
1456 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1457 x+j_coord_offsetC,x+j_coord_offsetD,
1458 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1460 /* Calculate displacement vector */
1461 dx00 = _mm_sub_ps(ix0,jx0);
1462 dy00 = _mm_sub_ps(iy0,jy0);
1463 dz00 = _mm_sub_ps(iz0,jz0);
1464 dx01 = _mm_sub_ps(ix0,jx1);
1465 dy01 = _mm_sub_ps(iy0,jy1);
1466 dz01 = _mm_sub_ps(iz0,jz1);
1467 dx02 = _mm_sub_ps(ix0,jx2);
1468 dy02 = _mm_sub_ps(iy0,jy2);
1469 dz02 = _mm_sub_ps(iz0,jz2);
1470 dx10 = _mm_sub_ps(ix1,jx0);
1471 dy10 = _mm_sub_ps(iy1,jy0);
1472 dz10 = _mm_sub_ps(iz1,jz0);
1473 dx11 = _mm_sub_ps(ix1,jx1);
1474 dy11 = _mm_sub_ps(iy1,jy1);
1475 dz11 = _mm_sub_ps(iz1,jz1);
1476 dx12 = _mm_sub_ps(ix1,jx2);
1477 dy12 = _mm_sub_ps(iy1,jy2);
1478 dz12 = _mm_sub_ps(iz1,jz2);
1479 dx20 = _mm_sub_ps(ix2,jx0);
1480 dy20 = _mm_sub_ps(iy2,jy0);
1481 dz20 = _mm_sub_ps(iz2,jz0);
1482 dx21 = _mm_sub_ps(ix2,jx1);
1483 dy21 = _mm_sub_ps(iy2,jy1);
1484 dz21 = _mm_sub_ps(iz2,jz1);
1485 dx22 = _mm_sub_ps(ix2,jx2);
1486 dy22 = _mm_sub_ps(iy2,jy2);
1487 dz22 = _mm_sub_ps(iz2,jz2);
1489 /* Calculate squared distance and things based on it */
1490 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1491 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1492 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1493 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1494 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1495 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1496 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1497 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1498 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1500 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1501 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1502 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1503 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1504 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1505 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1506 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1507 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1508 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1510 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1511 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1512 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1513 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1514 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1515 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1516 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1517 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1518 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* Clear the j-site force accumulators for the final group. */
1520 fjx0 = _mm_setzero_ps();
1521 fjy0 = _mm_setzero_ps();
1522 fjz0 = _mm_setzero_ps();
1523 fjx1 = _mm_setzero_ps();
1524 fjy1 = _mm_setzero_ps();
1525 fjz1 = _mm_setzero_ps();
1526 fjx2 = _mm_setzero_ps();
1527 fjy2 = _mm_setzero_ps();
1528 fjz2 = _mm_setzero_ps();
1530 /**************************
1531 * CALCULATE INTERACTIONS *
1532 **************************/
/* Site pair i0-j0: reaction-field plus Lennard-Jones, same expressions as in the unrolled loop. */
1534 /* REACTION-FIELD ELECTROSTATICS */
1535 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1537 /* LENNARD-JONES DISPERSION/REPULSION */
1539 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1540 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1542 fscal = _mm_add_ps(felec,fvdw);
/* Zero the scalar force in lanes flagged by dummy_mask so that negative-index entries contribute nothing. */
1544 fscal = _mm_andnot_ps(dummy_mask,fscal);
1546 /* Calculate temporary vectorial force */
1547 tx = _mm_mul_ps(fscal,dx00);
1548 ty = _mm_mul_ps(fscal,dy00);
1549 tz = _mm_mul_ps(fscal,dz00);
1551 /* Update vectorial force */
1552 fix0 = _mm_add_ps(fix0,tx);
1553 fiy0 = _mm_add_ps(fiy0,ty);
1554 fiz0 = _mm_add_ps(fiz0,tz);
1556 fjx0 = _mm_add_ps(fjx0,tx);
1557 fjy0 = _mm_add_ps(fjy0,ty);
1558 fjz0 = _mm_add_ps(fjz0,tz);
1560 /**************************
1561 * CALCULATE INTERACTIONS *
1562 **************************/
/* Site pair i0-j1: only a reaction-field term is computed; dummy lanes are masked as above. */
1564 /* REACTION-FIELD ELECTROSTATICS */
1565 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1569 fscal = _mm_andnot_ps(dummy_mask,fscal);
1571 /* Calculate temporary vectorial force */
1572 tx = _mm_mul_ps(fscal,dx01);
1573 ty = _mm_mul_ps(fscal,dy01);
1574 tz = _mm_mul_ps(fscal,dz01);
1576 /* Update vectorial force */
1577 fix0 = _mm_add_ps(fix0,tx);
1578 fiy0 = _mm_add_ps(fiy0,ty);
1579 fiz0 = _mm_add_ps(fiz0,tz);
1581 fjx1 = _mm_add_ps(fjx1,tx);
1582 fjy1 = _mm_add_ps(fjy1,ty);
1583 fjz1 = _mm_add_ps(fjz1,tz);
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
/* Site pair i0-j2: only a reaction-field term is computed; dummy lanes are masked. */
1589 /* REACTION-FIELD ELECTROSTATICS */
1590 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1594 fscal = _mm_andnot_ps(dummy_mask,fscal);
1596 /* Calculate temporary vectorial force */
1597 tx = _mm_mul_ps(fscal,dx02);
1598 ty = _mm_mul_ps(fscal,dy02);
1599 tz = _mm_mul_ps(fscal,dz02);
1601 /* Update vectorial force */
1602 fix0 = _mm_add_ps(fix0,tx);
1603 fiy0 = _mm_add_ps(fiy0,ty);
1604 fiz0 = _mm_add_ps(fiz0,tz);
1606 fjx2 = _mm_add_ps(fjx2,tx);
1607 fjy2 = _mm_add_ps(fjy2,ty);
1608 fjz2 = _mm_add_ps(fjz2,tz);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
/* Site pair i1-j0: only a reaction-field term is computed; dummy lanes are masked. */
1614 /* REACTION-FIELD ELECTROSTATICS */
1615 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1619 fscal = _mm_andnot_ps(dummy_mask,fscal);
1621 /* Calculate temporary vectorial force */
1622 tx = _mm_mul_ps(fscal,dx10);
1623 ty = _mm_mul_ps(fscal,dy10);
1624 tz = _mm_mul_ps(fscal,dz10);
1626 /* Update vectorial force */
1627 fix1 = _mm_add_ps(fix1,tx);
1628 fiy1 = _mm_add_ps(fiy1,ty);
1629 fiz1 = _mm_add_ps(fiz1,tz);
1631 fjx0 = _mm_add_ps(fjx0,tx);
1632 fjy0 = _mm_add_ps(fjy0,ty);
1633 fjz0 = _mm_add_ps(fjz0,tz);
1635 /**************************
1636 * CALCULATE INTERACTIONS *
1637 **************************/
/* Site pair i1-j1: only a reaction-field term is computed; dummy lanes are masked. */
1639 /* REACTION-FIELD ELECTROSTATICS */
1640 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1644 fscal = _mm_andnot_ps(dummy_mask,fscal);
1646 /* Calculate temporary vectorial force */
1647 tx = _mm_mul_ps(fscal,dx11);
1648 ty = _mm_mul_ps(fscal,dy11);
1649 tz = _mm_mul_ps(fscal,dz11);
1651 /* Update vectorial force */
1652 fix1 = _mm_add_ps(fix1,tx);
1653 fiy1 = _mm_add_ps(fiy1,ty);
1654 fiz1 = _mm_add_ps(fiz1,tz);
1656 fjx1 = _mm_add_ps(fjx1,tx);
1657 fjy1 = _mm_add_ps(fjy1,ty);
1658 fjz1 = _mm_add_ps(fjz1,tz);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
/* Site pair i1-j2: only a reaction-field term is computed; dummy lanes are masked. */
1664 /* REACTION-FIELD ELECTROSTATICS */
1665 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1669 fscal = _mm_andnot_ps(dummy_mask,fscal);
1671 /* Calculate temporary vectorial force */
1672 tx = _mm_mul_ps(fscal,dx12);
1673 ty = _mm_mul_ps(fscal,dy12);
1674 tz = _mm_mul_ps(fscal,dz12);
1676 /* Update vectorial force */
1677 fix1 = _mm_add_ps(fix1,tx);
1678 fiy1 = _mm_add_ps(fiy1,ty);
1679 fiz1 = _mm_add_ps(fiz1,tz);
1681 fjx2 = _mm_add_ps(fjx2,tx);
1682 fjy2 = _mm_add_ps(fjy2,ty);
1683 fjz2 = _mm_add_ps(fjz2,tz);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
/* Site pair i2-j0: only a reaction-field term is computed; dummy lanes are masked. */
1689 /* REACTION-FIELD ELECTROSTATICS */
1690 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1694 fscal = _mm_andnot_ps(dummy_mask,fscal);
1696 /* Calculate temporary vectorial force */
1697 tx = _mm_mul_ps(fscal,dx20);
1698 ty = _mm_mul_ps(fscal,dy20);
1699 tz = _mm_mul_ps(fscal,dz20);
1701 /* Update vectorial force */
1702 fix2 = _mm_add_ps(fix2,tx);
1703 fiy2 = _mm_add_ps(fiy2,ty);
1704 fiz2 = _mm_add_ps(fiz2,tz);
1706 fjx0 = _mm_add_ps(fjx0,tx);
1707 fjy0 = _mm_add_ps(fjy0,ty);
1708 fjz0 = _mm_add_ps(fjz0,tz);
1710 /**************************
1711 * CALCULATE INTERACTIONS *
1712 **************************/
/* Site pair i2-j1: only a reaction-field term is computed; dummy lanes are masked. */
1714 /* REACTION-FIELD ELECTROSTATICS */
1715 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1719 fscal = _mm_andnot_ps(dummy_mask,fscal);
1721 /* Calculate temporary vectorial force */
1722 tx = _mm_mul_ps(fscal,dx21);
1723 ty = _mm_mul_ps(fscal,dy21);
1724 tz = _mm_mul_ps(fscal,dz21);
1726 /* Update vectorial force */
1727 fix2 = _mm_add_ps(fix2,tx);
1728 fiy2 = _mm_add_ps(fiy2,ty);
1729 fiz2 = _mm_add_ps(fiz2,tz);
1731 fjx1 = _mm_add_ps(fjx1,tx);
1732 fjy1 = _mm_add_ps(fjy1,ty);
1733 fjz1 = _mm_add_ps(fjz1,tz);
1735 /**************************
1736 * CALCULATE INTERACTIONS *
1737 **************************/
/* Site pair i2-j2: only a reaction-field term is computed; dummy lanes are masked. */
1739 /* REACTION-FIELD ELECTROSTATICS */
1740 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1744 fscal = _mm_andnot_ps(dummy_mask,fscal);
1746 /* Calculate temporary vectorial force */
1747 tx = _mm_mul_ps(fscal,dx22);
1748 ty = _mm_mul_ps(fscal,dy22);
1749 tz = _mm_mul_ps(fscal,dz22);
1751 /* Update vectorial force */
1752 fix2 = _mm_add_ps(fix2,tx);
1753 fiy2 = _mm_add_ps(fiy2,ty);
1754 fiz2 = _mm_add_ps(fiz2,tz);
1756 fjx2 = _mm_add_ps(fjx2,tx);
1757 fjy2 = _mm_add_ps(fjy2,ty);
1758 fjz2 = _mm_add_ps(fjz2,tz);
/* Lanes with a negative list index are redirected to scratch so the helper never touches the force array for them. */
1760 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1761 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1762 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1763 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1765 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1766 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1768 /* Inner loop uses 250 flops */
1771 /* End of innermost loop */
/* Pass the nine i-site accumulators to the helper together with the i force and shift-force addresses; */
/* presumably it reduces the four lanes and adds the result to both targets - confirm against the helper. */
1773 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1774 f+i_coord_offset,fshift+i_shift_offset);
1776 /* Increment number of inner iterations */
/* Counts list entries in the range jindex[iidx]..jindex[iidx+1], not SIMD iterations. */
1777 inneriter += j_index_end - j_index_start;
1779 /* Outer loop uses 18 flops */
1782 /* Increment number of outer iterations */
1785 /* Update outer/inner flops */
/* Flop estimate for the force-only kernel: 18 per outer iteration plus 250 per inner-list entry. */
1787 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);