Bug Summary

File: gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sse4_1_single.c
Location: line 120, column 22
Description: Value stored to 'signbit' during its initialization is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_single
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwioffset3;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 real *charge;
113 int nvdwtype;
114 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 int *vdwtype;
116 real *vdwparam;
117 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
118 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
119 __m128 dummy_mask,cutoff_mask;
120 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
[Analyzer warning] Value stored to 'signbit' during its initialization is never read
121 __m128 one = _mm_set1_ps(1.0);
122 __m128 two = _mm_set1_ps(2.0);
123 x = xx[0];
124 f = ff[0];
125
126 nri = nlist->nri;
127 iinr = nlist->iinr;
128 jindex = nlist->jindex;
129 jjnr = nlist->jjnr;
130 shiftidx = nlist->shift;
131 gid = nlist->gid;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = _mm_set1_ps(fr->epsfac);
135 charge = mdatoms->chargeA;
136 krf = _mm_set1_ps(fr->ic->k_rf);
137 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
138 crf = _mm_set1_ps(fr->ic->c_rf);
139 nvdwtype = fr->ntype;
140 vdwparam = fr->nbfp;
141 vdwtype = mdatoms->typeA;
142
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149
150 jq1 = _mm_set1_ps(charge[inr+1]);
151 jq2 = _mm_set1_ps(charge[inr+2]);
152 jq3 = _mm_set1_ps(charge[inr+3]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156 qq11 = _mm_mul_ps(iq1,jq1);
157 qq12 = _mm_mul_ps(iq1,jq2);
158 qq13 = _mm_mul_ps(iq1,jq3);
159 qq21 = _mm_mul_ps(iq2,jq1);
160 qq22 = _mm_mul_ps(iq2,jq2);
161 qq23 = _mm_mul_ps(iq2,jq3);
162 qq31 = _mm_mul_ps(iq3,jq1);
163 qq32 = _mm_mul_ps(iq3,jq2);
164 qq33 = _mm_mul_ps(iq3,jq3);
165
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar = fr->rcoulomb;
168 rcutoff = _mm_set1_ps(rcutoff_scalar);
169 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
170
171 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
172 rvdw = _mm_set1_ps(fr->rvdw);
173
174 /* Avoid stupid compiler warnings */
175 jnrA = jnrB = jnrC = jnrD = 0;
176 j_coord_offsetA = 0;
177 j_coord_offsetB = 0;
178 j_coord_offsetC = 0;
179 j_coord_offsetD = 0;
180
181 outeriter = 0;
182 inneriter = 0;
183
184 for(iidx=0;iidx<4*DIM3;iidx++)
185 {
186 scratch[iidx] = 0.0;
187 }
188
189 /* Start outer loop over neighborlists */
190 for(iidx=0; iidx<nri; iidx++)
191 {
192 /* Load shift vector for this list */
193 i_shift_offset = DIM3*shiftidx[iidx];
194
195 /* Load limits for loop over neighbors */
196 j_index_start = jindex[iidx];
197 j_index_end = jindex[iidx+1];
198
199 /* Get outer coordinate index */
200 inr = iinr[iidx];
201 i_coord_offset = DIM3*inr;
202
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
205 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
206
207 fix0 = _mm_setzero_ps();
208 fiy0 = _mm_setzero_ps();
209 fiz0 = _mm_setzero_ps();
210 fix1 = _mm_setzero_ps();
211 fiy1 = _mm_setzero_ps();
212 fiz1 = _mm_setzero_ps();
213 fix2 = _mm_setzero_ps();
214 fiy2 = _mm_setzero_ps();
215 fiz2 = _mm_setzero_ps();
216 fix3 = _mm_setzero_ps();
217 fiy3 = _mm_setzero_ps();
218 fiz3 = _mm_setzero_ps();
219
220 /* Reset potential sums */
221 velecsum = _mm_setzero_ps();
222 vvdwsum = _mm_setzero_ps();
223
224 /* Start inner kernel loop */
225 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
226 {
227
228 /* Get j neighbor index, and coordinate index */
229 jnrA = jjnr[jidx];
230 jnrB = jjnr[jidx+1];
231 jnrC = jjnr[jidx+2];
232 jnrD = jjnr[jidx+3];
233 j_coord_offsetA = DIM3*jnrA;
234 j_coord_offsetB = DIM3*jnrB;
235 j_coord_offsetC = DIM3*jnrC;
236 j_coord_offsetD = DIM3*jnrD;
237
238 /* load j atom coordinates */
239 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
240 x+j_coord_offsetC,x+j_coord_offsetD,
241 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
242 &jy2,&jz2,&jx3,&jy3,&jz3);
243
244 /* Calculate displacement vector */
245 dx00 = _mm_sub_ps(ix0,jx0);
246 dy00 = _mm_sub_ps(iy0,jy0);
247 dz00 = _mm_sub_ps(iz0,jz0);
248 dx11 = _mm_sub_ps(ix1,jx1);
249 dy11 = _mm_sub_ps(iy1,jy1);
250 dz11 = _mm_sub_ps(iz1,jz1);
251 dx12 = _mm_sub_ps(ix1,jx2);
252 dy12 = _mm_sub_ps(iy1,jy2);
253 dz12 = _mm_sub_ps(iz1,jz2);
254 dx13 = _mm_sub_ps(ix1,jx3);
255 dy13 = _mm_sub_ps(iy1,jy3);
256 dz13 = _mm_sub_ps(iz1,jz3);
257 dx21 = _mm_sub_ps(ix2,jx1);
258 dy21 = _mm_sub_ps(iy2,jy1);
259 dz21 = _mm_sub_ps(iz2,jz1);
260 dx22 = _mm_sub_ps(ix2,jx2);
261 dy22 = _mm_sub_ps(iy2,jy2);
262 dz22 = _mm_sub_ps(iz2,jz2);
263 dx23 = _mm_sub_ps(ix2,jx3);
264 dy23 = _mm_sub_ps(iy2,jy3);
265 dz23 = _mm_sub_ps(iz2,jz3);
266 dx31 = _mm_sub_ps(ix3,jx1);
267 dy31 = _mm_sub_ps(iy3,jy1);
268 dz31 = _mm_sub_ps(iz3,jz1);
269 dx32 = _mm_sub_ps(ix3,jx2);
270 dy32 = _mm_sub_ps(iy3,jy2);
271 dz32 = _mm_sub_ps(iz3,jz2);
272 dx33 = _mm_sub_ps(ix3,jx3);
273 dy33 = _mm_sub_ps(iy3,jy3);
274 dz33 = _mm_sub_ps(iz3,jz3);
275
276 /* Calculate squared distance and things based on it */
277 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
278 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
279 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
280 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
281 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
282 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
283 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
284 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
285 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
286 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
287
288 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
289 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
290 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
291 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
292 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
293 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
294 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
295 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
296 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
297
298 rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
299 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
300 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
301 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
302 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
303 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
304 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
305 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
306 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
307 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
308
309 fjx0 = _mm_setzero_ps();
310 fjy0 = _mm_setzero_ps();
311 fjz0 = _mm_setzero_ps();
312 fjx1 = _mm_setzero_ps();
313 fjy1 = _mm_setzero_ps();
314 fjz1 = _mm_setzero_ps();
315 fjx2 = _mm_setzero_ps();
316 fjy2 = _mm_setzero_ps();
317 fjz2 = _mm_setzero_ps();
318 fjx3 = _mm_setzero_ps();
319 fjy3 = _mm_setzero_ps();
320 fjz3 = _mm_setzero_ps();
321
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
325
326 if (gmx_mm_any_lt(rsq00,rcutoff2))
327 {
328
329 /* LENNARD-JONES DISPERSION/REPULSION */
330
331 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
332 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
333 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
334 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
335 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
336 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
337
338 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
339
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 vvdw = _mm_and_ps(vvdw,cutoff_mask);
342 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
343
344 fscal = fvdw;
345
346 fscal = _mm_and_ps(fscal,cutoff_mask);
347
348 /* Calculate temporary vectorial force */
349 tx = _mm_mul_ps(fscal,dx00);
350 ty = _mm_mul_ps(fscal,dy00);
351 tz = _mm_mul_ps(fscal,dz00);
352
353 /* Update vectorial force */
354 fix0 = _mm_add_ps(fix0,tx);
355 fiy0 = _mm_add_ps(fiy0,ty);
356 fiz0 = _mm_add_ps(fiz0,tz);
357
358 fjx0 = _mm_add_ps(fjx0,tx);
359 fjy0 = _mm_add_ps(fjy0,ty);
360 fjz0 = _mm_add_ps(fjz0,tz);
361
362 }
363
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
367
368 if (gmx_mm_any_lt(rsq11,rcutoff2))
369 {
370
371 /* REACTION-FIELD ELECTROSTATICS */
372 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
373 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
374
375 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
376
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velec = _mm_and_ps(velec,cutoff_mask);
379 velecsum = _mm_add_ps(velecsum,velec);
380
381 fscal = felec;
382
383 fscal = _mm_and_ps(fscal,cutoff_mask);
384
385 /* Calculate temporary vectorial force */
386 tx = _mm_mul_ps(fscal,dx11);
387 ty = _mm_mul_ps(fscal,dy11);
388 tz = _mm_mul_ps(fscal,dz11);
389
390 /* Update vectorial force */
391 fix1 = _mm_add_ps(fix1,tx);
392 fiy1 = _mm_add_ps(fiy1,ty);
393 fiz1 = _mm_add_ps(fiz1,tz);
394
395 fjx1 = _mm_add_ps(fjx1,tx);
396 fjy1 = _mm_add_ps(fjy1,ty);
397 fjz1 = _mm_add_ps(fjz1,tz);
398
399 }
400
401 /**************************
402 * CALCULATE INTERACTIONS *
403 **************************/
404
405 if (gmx_mm_any_lt(rsq12,rcutoff2))
406 {
407
408 /* REACTION-FIELD ELECTROSTATICS */
409 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
410 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
411
412 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
413
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velec = _mm_and_ps(velec,cutoff_mask);
416 velecsum = _mm_add_ps(velecsum,velec);
417
418 fscal = felec;
419
420 fscal = _mm_and_ps(fscal,cutoff_mask);
421
422 /* Calculate temporary vectorial force */
423 tx = _mm_mul_ps(fscal,dx12);
424 ty = _mm_mul_ps(fscal,dy12);
425 tz = _mm_mul_ps(fscal,dz12);
426
427 /* Update vectorial force */
428 fix1 = _mm_add_ps(fix1,tx);
429 fiy1 = _mm_add_ps(fiy1,ty);
430 fiz1 = _mm_add_ps(fiz1,tz);
431
432 fjx2 = _mm_add_ps(fjx2,tx);
433 fjy2 = _mm_add_ps(fjy2,ty);
434 fjz2 = _mm_add_ps(fjz2,tz);
435
436 }
437
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
441
442 if (gmx_mm_any_lt(rsq13,rcutoff2))
443 {
444
445 /* REACTION-FIELD ELECTROSTATICS */
446 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
447 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
448
449 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
450
451 /* Update potential sum for this i atom from the interaction with this j atom. */
452 velec = _mm_and_ps(velec,cutoff_mask);
453 velecsum = _mm_add_ps(velecsum,velec);
454
455 fscal = felec;
456
457 fscal = _mm_and_ps(fscal,cutoff_mask);
458
459 /* Calculate temporary vectorial force */
460 tx = _mm_mul_ps(fscal,dx13);
461 ty = _mm_mul_ps(fscal,dy13);
462 tz = _mm_mul_ps(fscal,dz13);
463
464 /* Update vectorial force */
465 fix1 = _mm_add_ps(fix1,tx);
466 fiy1 = _mm_add_ps(fiy1,ty);
467 fiz1 = _mm_add_ps(fiz1,tz);
468
469 fjx3 = _mm_add_ps(fjx3,tx);
470 fjy3 = _mm_add_ps(fjy3,ty);
471 fjz3 = _mm_add_ps(fjz3,tz);
472
473 }
474
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
478
479 if (gmx_mm_any_lt(rsq21,rcutoff2))
480 {
481
482 /* REACTION-FIELD ELECTROSTATICS */
483 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
484 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
485
486 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
487
488 /* Update potential sum for this i atom from the interaction with this j atom. */
489 velec = _mm_and_ps(velec,cutoff_mask);
490 velecsum = _mm_add_ps(velecsum,velec);
491
492 fscal = felec;
493
494 fscal = _mm_and_ps(fscal,cutoff_mask);
495
496 /* Calculate temporary vectorial force */
497 tx = _mm_mul_ps(fscal,dx21);
498 ty = _mm_mul_ps(fscal,dy21);
499 tz = _mm_mul_ps(fscal,dz21);
500
501 /* Update vectorial force */
502 fix2 = _mm_add_ps(fix2,tx);
503 fiy2 = _mm_add_ps(fiy2,ty);
504 fiz2 = _mm_add_ps(fiz2,tz);
505
506 fjx1 = _mm_add_ps(fjx1,tx);
507 fjy1 = _mm_add_ps(fjy1,ty);
508 fjz1 = _mm_add_ps(fjz1,tz);
509
510 }
511
512 /**************************
513 * CALCULATE INTERACTIONS *
514 **************************/
515
516 if (gmx_mm_any_lt(rsq22,rcutoff2))
517 {
518
519 /* REACTION-FIELD ELECTROSTATICS */
520 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
521 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
522
523 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
524
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velec = _mm_and_ps(velec,cutoff_mask);
527 velecsum = _mm_add_ps(velecsum,velec);
528
529 fscal = felec;
530
531 fscal = _mm_and_ps(fscal,cutoff_mask);
532
533 /* Calculate temporary vectorial force */
534 tx = _mm_mul_ps(fscal,dx22);
535 ty = _mm_mul_ps(fscal,dy22);
536 tz = _mm_mul_ps(fscal,dz22);
537
538 /* Update vectorial force */
539 fix2 = _mm_add_ps(fix2,tx);
540 fiy2 = _mm_add_ps(fiy2,ty);
541 fiz2 = _mm_add_ps(fiz2,tz);
542
543 fjx2 = _mm_add_ps(fjx2,tx);
544 fjy2 = _mm_add_ps(fjy2,ty);
545 fjz2 = _mm_add_ps(fjz2,tz);
546
547 }
548
549 /**************************
550 * CALCULATE INTERACTIONS *
551 **************************/
552
553 if (gmx_mm_any_lt(rsq23,rcutoff2))
554 {
555
556 /* REACTION-FIELD ELECTROSTATICS */
557 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
558 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
559
560 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
561
562 /* Update potential sum for this i atom from the interaction with this j atom. */
563 velec = _mm_and_ps(velec,cutoff_mask);
564 velecsum = _mm_add_ps(velecsum,velec);
565
566 fscal = felec;
567
568 fscal = _mm_and_ps(fscal,cutoff_mask);
569
570 /* Calculate temporary vectorial force */
571 tx = _mm_mul_ps(fscal,dx23);
572 ty = _mm_mul_ps(fscal,dy23);
573 tz = _mm_mul_ps(fscal,dz23);
574
575 /* Update vectorial force */
576 fix2 = _mm_add_ps(fix2,tx);
577 fiy2 = _mm_add_ps(fiy2,ty);
578 fiz2 = _mm_add_ps(fiz2,tz);
579
580 fjx3 = _mm_add_ps(fjx3,tx);
581 fjy3 = _mm_add_ps(fjy3,ty);
582 fjz3 = _mm_add_ps(fjz3,tz);
583
584 }
585
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
589
590 if (gmx_mm_any_lt(rsq31,rcutoff2))
591 {
592
593 /* REACTION-FIELD ELECTROSTATICS */
594 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
595 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
596
597 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
598
599 /* Update potential sum for this i atom from the interaction with this j atom. */
600 velec = _mm_and_ps(velec,cutoff_mask);
601 velecsum = _mm_add_ps(velecsum,velec);
602
603 fscal = felec;
604
605 fscal = _mm_and_ps(fscal,cutoff_mask);
606
607 /* Calculate temporary vectorial force */
608 tx = _mm_mul_ps(fscal,dx31);
609 ty = _mm_mul_ps(fscal,dy31);
610 tz = _mm_mul_ps(fscal,dz31);
611
612 /* Update vectorial force */
613 fix3 = _mm_add_ps(fix3,tx);
614 fiy3 = _mm_add_ps(fiy3,ty);
615 fiz3 = _mm_add_ps(fiz3,tz);
616
617 fjx1 = _mm_add_ps(fjx1,tx);
618 fjy1 = _mm_add_ps(fjy1,ty);
619 fjz1 = _mm_add_ps(fjz1,tz);
620
621 }
622
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
626
627 if (gmx_mm_any_lt(rsq32,rcutoff2))
628 {
629
630 /* REACTION-FIELD ELECTROSTATICS */
631 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
632 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
633
634 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
635
636 /* Update potential sum for this i atom from the interaction with this j atom. */
637 velec = _mm_and_ps(velec,cutoff_mask);
638 velecsum = _mm_add_ps(velecsum,velec);
639
640 fscal = felec;
641
642 fscal = _mm_and_ps(fscal,cutoff_mask);
643
644 /* Calculate temporary vectorial force */
645 tx = _mm_mul_ps(fscal,dx32);
646 ty = _mm_mul_ps(fscal,dy32);
647 tz = _mm_mul_ps(fscal,dz32);
648
649 /* Update vectorial force */
650 fix3 = _mm_add_ps(fix3,tx);
651 fiy3 = _mm_add_ps(fiy3,ty);
652 fiz3 = _mm_add_ps(fiz3,tz);
653
654 fjx2 = _mm_add_ps(fjx2,tx);
655 fjy2 = _mm_add_ps(fjy2,ty);
656 fjz2 = _mm_add_ps(fjz2,tz);
657
658 }
659
660 /**************************
661 * CALCULATE INTERACTIONS *
662 **************************/
663
664 if (gmx_mm_any_lt(rsq33,rcutoff2))
665 {
666
667 /* REACTION-FIELD ELECTROSTATICS */
668 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
669 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
670
671 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
672
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velec = _mm_and_ps(velec,cutoff_mask);
675 velecsum = _mm_add_ps(velecsum,velec);
676
677 fscal = felec;
678
679 fscal = _mm_and_ps(fscal,cutoff_mask);
680
681 /* Calculate temporary vectorial force */
682 tx = _mm_mul_ps(fscal,dx33);
683 ty = _mm_mul_ps(fscal,dy33);
684 tz = _mm_mul_ps(fscal,dz33);
685
686 /* Update vectorial force */
687 fix3 = _mm_add_ps(fix3,tx);
688 fiy3 = _mm_add_ps(fiy3,ty);
689 fiz3 = _mm_add_ps(fiz3,tz);
690
691 fjx3 = _mm_add_ps(fjx3,tx);
692 fjy3 = _mm_add_ps(fjy3,ty);
693 fjz3 = _mm_add_ps(fjz3,tz);
694
695 }
696
697 fjptrA = f+j_coord_offsetA;
698 fjptrB = f+j_coord_offsetB;
699 fjptrC = f+j_coord_offsetC;
700 fjptrD = f+j_coord_offsetD;
701
702 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
703 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
704 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
705
706 /* Inner loop uses 368 flops */
707 }
708
709 if(jidx<j_index_end)
710 {
711
712 /* Get j neighbor index, and coordinate index */
713 jnrlistA = jjnr[jidx];
714 jnrlistB = jjnr[jidx+1];
715 jnrlistC = jjnr[jidx+2];
716 jnrlistD = jjnr[jidx+3];
717 /* Sign of each element will be negative for non-real atoms.
718 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
719 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
720 */
721 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
722 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
723 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
724 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
725 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
726 j_coord_offsetA = DIM3*jnrA;
727 j_coord_offsetB = DIM3*jnrB;
728 j_coord_offsetC = DIM3*jnrC;
729 j_coord_offsetD = DIM3*jnrD;
730
731 /* load j atom coordinates */
732 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
733 x+j_coord_offsetC,x+j_coord_offsetD,
734 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
735 &jy2,&jz2,&jx3,&jy3,&jz3);
736
737 /* Calculate displacement vector */
738 dx00 = _mm_sub_ps(ix0,jx0);
739 dy00 = _mm_sub_ps(iy0,jy0);
740 dz00 = _mm_sub_ps(iz0,jz0);
741 dx11 = _mm_sub_ps(ix1,jx1);
742 dy11 = _mm_sub_ps(iy1,jy1);
743 dz11 = _mm_sub_ps(iz1,jz1);
744 dx12 = _mm_sub_ps(ix1,jx2);
745 dy12 = _mm_sub_ps(iy1,jy2);
746 dz12 = _mm_sub_ps(iz1,jz2);
747 dx13 = _mm_sub_ps(ix1,jx3);
748 dy13 = _mm_sub_ps(iy1,jy3);
749 dz13 = _mm_sub_ps(iz1,jz3);
750 dx21 = _mm_sub_ps(ix2,jx1);
751 dy21 = _mm_sub_ps(iy2,jy1);
752 dz21 = _mm_sub_ps(iz2,jz1);
753 dx22 = _mm_sub_ps(ix2,jx2);
754 dy22 = _mm_sub_ps(iy2,jy2);
755 dz22 = _mm_sub_ps(iz2,jz2);
756 dx23 = _mm_sub_ps(ix2,jx3);
757 dy23 = _mm_sub_ps(iy2,jy3);
758 dz23 = _mm_sub_ps(iz2,jz3);
759 dx31 = _mm_sub_ps(ix3,jx1);
760 dy31 = _mm_sub_ps(iy3,jy1);
761 dz31 = _mm_sub_ps(iz3,jz1);
762 dx32 = _mm_sub_ps(ix3,jx2);
763 dy32 = _mm_sub_ps(iy3,jy2);
764 dz32 = _mm_sub_ps(iz3,jz2);
765 dx33 = _mm_sub_ps(ix3,jx3);
766 dy33 = _mm_sub_ps(iy3,jy3);
767 dz33 = _mm_sub_ps(iz3,jz3);
768
769 /* Calculate squared distance and things based on it */
770 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
771 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
772 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
773 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
774 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
775 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
776 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
777 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
778 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
779 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
780
781 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
782 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
783 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
784 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
785 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
786 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
787 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
788 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
789 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
790
791 rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
792 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
793 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
794 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
795 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
796 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
797 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
798 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
799 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
800 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
801
802 fjx0 = _mm_setzero_ps();
803 fjy0 = _mm_setzero_ps();
804 fjz0 = _mm_setzero_ps();
805 fjx1 = _mm_setzero_ps();
806 fjy1 = _mm_setzero_ps();
807 fjz1 = _mm_setzero_ps();
808 fjx2 = _mm_setzero_ps();
809 fjy2 = _mm_setzero_ps();
810 fjz2 = _mm_setzero_ps();
811 fjx3 = _mm_setzero_ps();
812 fjy3 = _mm_setzero_ps();
813 fjz3 = _mm_setzero_ps();
814
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
818
819 if (gmx_mm_any_lt(rsq00,rcutoff2))
820 {
821
822 /* LENNARD-JONES DISPERSION/REPULSION */
823
824 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
825 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
826 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
827 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
828 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
829 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
830
831 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
832
833 /* Update potential sum for this i atom from the interaction with this j atom. */
834 vvdw = _mm_and_ps(vvdw,cutoff_mask);
835 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
836 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
837
838 fscal = fvdw;
839
840 fscal = _mm_and_ps(fscal,cutoff_mask);
841
842 fscal = _mm_andnot_ps(dummy_mask,fscal);
843
844 /* Calculate temporary vectorial force */
845 tx = _mm_mul_ps(fscal,dx00);
846 ty = _mm_mul_ps(fscal,dy00);
847 tz = _mm_mul_ps(fscal,dz00);
848
849 /* Update vectorial force */
850 fix0 = _mm_add_ps(fix0,tx);
851 fiy0 = _mm_add_ps(fiy0,ty);
852 fiz0 = _mm_add_ps(fiz0,tz);
853
854 fjx0 = _mm_add_ps(fjx0,tx);
855 fjy0 = _mm_add_ps(fjy0,ty);
856 fjz0 = _mm_add_ps(fjz0,tz);
857
858 }
859
860 /**************************
861 * CALCULATE INTERACTIONS *
862 **************************/
863
864 if (gmx_mm_any_lt(rsq11,rcutoff2))
865 {
866
867 /* REACTION-FIELD ELECTROSTATICS */
868 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
869 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
870
871 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
872
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_and_ps(velec,cutoff_mask);
875 velec = _mm_andnot_ps(dummy_mask,velec);
876 velecsum = _mm_add_ps(velecsum,velec);
877
878 fscal = felec;
879
880 fscal = _mm_and_ps(fscal,cutoff_mask);
881
882 fscal = _mm_andnot_ps(dummy_mask,fscal);
883
884 /* Calculate temporary vectorial force */
885 tx = _mm_mul_ps(fscal,dx11);
886 ty = _mm_mul_ps(fscal,dy11);
887 tz = _mm_mul_ps(fscal,dz11);
888
889 /* Update vectorial force */
890 fix1 = _mm_add_ps(fix1,tx);
891 fiy1 = _mm_add_ps(fiy1,ty);
892 fiz1 = _mm_add_ps(fiz1,tz);
893
894 fjx1 = _mm_add_ps(fjx1,tx);
895 fjy1 = _mm_add_ps(fjy1,ty);
896 fjz1 = _mm_add_ps(fjz1,tz);
897
898 }
899
900 /**************************
901 * CALCULATE INTERACTIONS *
902 **************************/
903
904 if (gmx_mm_any_lt(rsq12,rcutoff2))
905 {
906
907 /* REACTION-FIELD ELECTROSTATICS */
908 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
909 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
910
911 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
912
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 velec = _mm_and_ps(velec,cutoff_mask);
915 velec = _mm_andnot_ps(dummy_mask,velec);
916 velecsum = _mm_add_ps(velecsum,velec);
917
918 fscal = felec;
919
920 fscal = _mm_and_ps(fscal,cutoff_mask);
921
922 fscal = _mm_andnot_ps(dummy_mask,fscal);
923
924 /* Calculate temporary vectorial force */
925 tx = _mm_mul_ps(fscal,dx12);
926 ty = _mm_mul_ps(fscal,dy12);
927 tz = _mm_mul_ps(fscal,dz12);
928
929 /* Update vectorial force */
930 fix1 = _mm_add_ps(fix1,tx);
931 fiy1 = _mm_add_ps(fiy1,ty);
932 fiz1 = _mm_add_ps(fiz1,tz);
933
934 fjx2 = _mm_add_ps(fjx2,tx);
935 fjy2 = _mm_add_ps(fjy2,ty);
936 fjz2 = _mm_add_ps(fjz2,tz);
937
938 }
939
940 /**************************
941 * CALCULATE INTERACTIONS *
942 **************************/
943
944 if (gmx_mm_any_lt(rsq13,rcutoff2))
945 {
946
947 /* REACTION-FIELD ELECTROSTATICS */
948 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
949 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
950
951 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
952
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_and_ps(velec,cutoff_mask);
955 velec = _mm_andnot_ps(dummy_mask,velec);
956 velecsum = _mm_add_ps(velecsum,velec);
957
958 fscal = felec;
959
960 fscal = _mm_and_ps(fscal,cutoff_mask);
961
962 fscal = _mm_andnot_ps(dummy_mask,fscal);
963
964 /* Calculate temporary vectorial force */
965 tx = _mm_mul_ps(fscal,dx13);
966 ty = _mm_mul_ps(fscal,dy13);
967 tz = _mm_mul_ps(fscal,dz13);
968
969 /* Update vectorial force */
970 fix1 = _mm_add_ps(fix1,tx);
971 fiy1 = _mm_add_ps(fiy1,ty);
972 fiz1 = _mm_add_ps(fiz1,tz);
973
974 fjx3 = _mm_add_ps(fjx3,tx);
975 fjy3 = _mm_add_ps(fjy3,ty);
976 fjz3 = _mm_add_ps(fjz3,tz);
977
978 }
979
980 /**************************
981 * CALCULATE INTERACTIONS *
982 **************************/
983
984 if (gmx_mm_any_lt(rsq21,rcutoff2))
985 {
986
987 /* REACTION-FIELD ELECTROSTATICS */
988 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
989 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
990
991 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
992
993 /* Update potential sum for this i atom from the interaction with this j atom. */
994 velec = _mm_and_ps(velec,cutoff_mask);
995 velec = _mm_andnot_ps(dummy_mask,velec);
996 velecsum = _mm_add_ps(velecsum,velec);
997
998 fscal = felec;
999
1000 fscal = _mm_and_ps(fscal,cutoff_mask);
1001
1002 fscal = _mm_andnot_ps(dummy_mask,fscal);
1003
1004 /* Calculate temporary vectorial force */
1005 tx = _mm_mul_ps(fscal,dx21);
1006 ty = _mm_mul_ps(fscal,dy21);
1007 tz = _mm_mul_ps(fscal,dz21);
1008
1009 /* Update vectorial force */
1010 fix2 = _mm_add_ps(fix2,tx);
1011 fiy2 = _mm_add_ps(fiy2,ty);
1012 fiz2 = _mm_add_ps(fiz2,tz);
1013
1014 fjx1 = _mm_add_ps(fjx1,tx);
1015 fjy1 = _mm_add_ps(fjy1,ty);
1016 fjz1 = _mm_add_ps(fjz1,tz);
1017
1018 }
1019
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1023
1024 if (gmx_mm_any_lt(rsq22,rcutoff2))
1025 {
1026
1027 /* REACTION-FIELD ELECTROSTATICS */
1028 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1029 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1030
1031 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1032
1033 /* Update potential sum for this i atom from the interaction with this j atom. */
1034 velec = _mm_and_ps(velec,cutoff_mask);
1035 velec = _mm_andnot_ps(dummy_mask,velec);
1036 velecsum = _mm_add_ps(velecsum,velec);
1037
1038 fscal = felec;
1039
1040 fscal = _mm_and_ps(fscal,cutoff_mask);
1041
1042 fscal = _mm_andnot_ps(dummy_mask,fscal);
1043
1044 /* Calculate temporary vectorial force */
1045 tx = _mm_mul_ps(fscal,dx22);
1046 ty = _mm_mul_ps(fscal,dy22);
1047 tz = _mm_mul_ps(fscal,dz22);
1048
1049 /* Update vectorial force */
1050 fix2 = _mm_add_ps(fix2,tx);
1051 fiy2 = _mm_add_ps(fiy2,ty);
1052 fiz2 = _mm_add_ps(fiz2,tz);
1053
1054 fjx2 = _mm_add_ps(fjx2,tx);
1055 fjy2 = _mm_add_ps(fjy2,ty);
1056 fjz2 = _mm_add_ps(fjz2,tz);
1057
1058 }
1059
1060 /**************************
1061 * CALCULATE INTERACTIONS *
1062 **************************/
1063
1064 if (gmx_mm_any_lt(rsq23,rcutoff2))
1065 {
1066
1067 /* REACTION-FIELD ELECTROSTATICS */
1068 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1069 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1070
1071 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1072
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec = _mm_and_ps(velec,cutoff_mask);
1075 velec = _mm_andnot_ps(dummy_mask,velec);
1076 velecsum = _mm_add_ps(velecsum,velec);
1077
1078 fscal = felec;
1079
1080 fscal = _mm_and_ps(fscal,cutoff_mask);
1081
1082 fscal = _mm_andnot_ps(dummy_mask,fscal);
1083
1084 /* Calculate temporary vectorial force */
1085 tx = _mm_mul_ps(fscal,dx23);
1086 ty = _mm_mul_ps(fscal,dy23);
1087 tz = _mm_mul_ps(fscal,dz23);
1088
1089 /* Update vectorial force */
1090 fix2 = _mm_add_ps(fix2,tx);
1091 fiy2 = _mm_add_ps(fiy2,ty);
1092 fiz2 = _mm_add_ps(fiz2,tz);
1093
1094 fjx3 = _mm_add_ps(fjx3,tx);
1095 fjy3 = _mm_add_ps(fjy3,ty);
1096 fjz3 = _mm_add_ps(fjz3,tz);
1097
1098 }
1099
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1103
1104 if (gmx_mm_any_lt(rsq31,rcutoff2))
1105 {
1106
1107 /* REACTION-FIELD ELECTROSTATICS */
1108 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1109 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1110
1111 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1112
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec = _mm_and_ps(velec,cutoff_mask);
1115 velec = _mm_andnot_ps(dummy_mask,velec);
1116 velecsum = _mm_add_ps(velecsum,velec);
1117
1118 fscal = felec;
1119
1120 fscal = _mm_and_ps(fscal,cutoff_mask);
1121
1122 fscal = _mm_andnot_ps(dummy_mask,fscal);
1123
1124 /* Calculate temporary vectorial force */
1125 tx = _mm_mul_ps(fscal,dx31);
1126 ty = _mm_mul_ps(fscal,dy31);
1127 tz = _mm_mul_ps(fscal,dz31);
1128
1129 /* Update vectorial force */
1130 fix3 = _mm_add_ps(fix3,tx);
1131 fiy3 = _mm_add_ps(fiy3,ty);
1132 fiz3 = _mm_add_ps(fiz3,tz);
1133
1134 fjx1 = _mm_add_ps(fjx1,tx);
1135 fjy1 = _mm_add_ps(fjy1,ty);
1136 fjz1 = _mm_add_ps(fjz1,tz);
1137
1138 }
1139
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1143
1144 if (gmx_mm_any_lt(rsq32,rcutoff2))
1145 {
1146
1147 /* REACTION-FIELD ELECTROSTATICS */
1148 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1149 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1150
1151 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1152
1153 /* Update potential sum for this i atom from the interaction with this j atom. */
1154 velec = _mm_and_ps(velec,cutoff_mask);
1155 velec = _mm_andnot_ps(dummy_mask,velec);
1156 velecsum = _mm_add_ps(velecsum,velec);
1157
1158 fscal = felec;
1159
1160 fscal = _mm_and_ps(fscal,cutoff_mask);
1161
1162 fscal = _mm_andnot_ps(dummy_mask,fscal);
1163
1164 /* Calculate temporary vectorial force */
1165 tx = _mm_mul_ps(fscal,dx32);
1166 ty = _mm_mul_ps(fscal,dy32);
1167 tz = _mm_mul_ps(fscal,dz32);
1168
1169 /* Update vectorial force */
1170 fix3 = _mm_add_ps(fix3,tx);
1171 fiy3 = _mm_add_ps(fiy3,ty);
1172 fiz3 = _mm_add_ps(fiz3,tz);
1173
1174 fjx2 = _mm_add_ps(fjx2,tx);
1175 fjy2 = _mm_add_ps(fjy2,ty);
1176 fjz2 = _mm_add_ps(fjz2,tz);
1177
1178 }
1179
1180 /**************************
1181 * CALCULATE INTERACTIONS *
1182 **************************/
1183
1184 if (gmx_mm_any_lt(rsq33,rcutoff2))
1185 {
1186
1187 /* REACTION-FIELD ELECTROSTATICS */
1188 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1189 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1190
1191 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1192
1193 /* Update potential sum for this i atom from the interaction with this j atom. */
1194 velec = _mm_and_ps(velec,cutoff_mask);
1195 velec = _mm_andnot_ps(dummy_mask,velec);
1196 velecsum = _mm_add_ps(velecsum,velec);
1197
1198 fscal = felec;
1199
1200 fscal = _mm_and_ps(fscal,cutoff_mask);
1201
1202 fscal = _mm_andnot_ps(dummy_mask,fscal);
1203
1204 /* Calculate temporary vectorial force */
1205 tx = _mm_mul_ps(fscal,dx33);
1206 ty = _mm_mul_ps(fscal,dy33);
1207 tz = _mm_mul_ps(fscal,dz33);
1208
1209 /* Update vectorial force */
1210 fix3 = _mm_add_ps(fix3,tx);
1211 fiy3 = _mm_add_ps(fiy3,ty);
1212 fiz3 = _mm_add_ps(fiz3,tz);
1213
1214 fjx3 = _mm_add_ps(fjx3,tx);
1215 fjy3 = _mm_add_ps(fjy3,ty);
1216 fjz3 = _mm_add_ps(fjz3,tz);
1217
1218 }
1219
1220 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1221 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1222 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1223 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1224
1225 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1226 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1227 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1228
1229 /* Inner loop uses 368 flops */
1230 }
1231
1232 /* End of innermost loop */
1233
1234 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1235 f+i_coord_offset,fshift+i_shift_offset);
1236
1237 ggid = gid[iidx];
1238 /* Update potential energies */
1239 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1240 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1241
1242 /* Increment number of inner iterations */
1243 inneriter += j_index_end - j_index_start;
1244
1245 /* Outer loop uses 26 flops */
1246 }
1247
1248 /* Increment number of outer iterations */
1249 outeriter += nri;
1250
1251 /* Update outer/inner flops */
1252
1253 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368); /* macro expands to: (nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4W4_VF] += outeriter*26 + inneriter*368 */
1254}
1255/*
1256 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_single
1257 * Electrostatics interaction: ReactionField
1258 * VdW interaction: LennardJones
1259 * Geometry: Water4-Water4
1260 * Calculate force/pot: Force
1261 */
1262void
1263nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_single
1264 (t_nblist * gmx_restrict nlist,
1265 rvec * gmx_restrict xx,
1266 rvec * gmx_restrict ff,
1267 t_forcerec * gmx_restrict fr,
1268 t_mdatoms * gmx_restrict mdatoms,
1269 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data, /* gmx_unused expands to __attribute__ ((unused)) */
1270 t_nrnb * gmx_restrict nrnb)
1271{
1272 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1273 * just 0 for non-waters.
1274 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1275 * jnr indices corresponding to data put in the four positions in the SIMD register.
1276 */
1277 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1278 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1279 int jnrA,jnrB,jnrC,jnrD;
1280 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1281 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1282 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1283 real rcutoff_scalar;
1284 real *shiftvec,*fshift,*x,*f;
1285 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1286 real scratch[4*DIM]; /* DIM expands to 3 */
1287 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1288 int vdwioffset0;
1289 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1290 int vdwioffset1;
1291 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1292 int vdwioffset2;
1293 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1294 int vdwioffset3;
1295 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1296 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1297 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1298 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1299 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1300 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1301 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1302 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1303 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1304 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1305 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1306 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1307 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1308 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1309 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1310 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1311 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1312 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1313 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1314 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1315 real *charge;
1316 int nvdwtype;
1317 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1318 int *vdwtype;
1319 real *vdwparam;
1320 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1321 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1322 __m128 dummy_mask,cutoff_mask;
1323 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1324 __m128 one = _mm_set1_ps(1.0);
1325 __m128 two = _mm_set1_ps(2.0);
1326 x = xx[0];
1327 f = ff[0];
1328
1329 nri = nlist->nri;
1330 iinr = nlist->iinr;
1331 jindex = nlist->jindex;
1332 jjnr = nlist->jjnr;
1333 shiftidx = nlist->shift;
1334 gid = nlist->gid;
1335 shiftvec = fr->shift_vec[0];
1336 fshift = fr->fshift[0];
1337 facel = _mm_set1_ps(fr->epsfac);
1338 charge = mdatoms->chargeA;
1339 krf = _mm_set1_ps(fr->ic->k_rf);
1340 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1341 crf = _mm_set1_ps(fr->ic->c_rf);
1342 nvdwtype = fr->ntype;
1343 vdwparam = fr->nbfp;
1344 vdwtype = mdatoms->typeA;
1345
1346 /* Setup water-specific parameters */
1347 inr = nlist->iinr[0];
1348 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1349 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1350 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1351 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1352
1353 jq1 = _mm_set1_ps(charge[inr+1]);
1354 jq2 = _mm_set1_ps(charge[inr+2]);
1355 jq3 = _mm_set1_ps(charge[inr+3]);
1356 vdwjidx0A = 2*vdwtype[inr+0];
1357 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1358 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1359 qq11 = _mm_mul_ps(iq1,jq1);
1360 qq12 = _mm_mul_ps(iq1,jq2);
1361 qq13 = _mm_mul_ps(iq1,jq3);
1362 qq21 = _mm_mul_ps(iq2,jq1);
1363 qq22 = _mm_mul_ps(iq2,jq2);
1364 qq23 = _mm_mul_ps(iq2,jq3);
1365 qq31 = _mm_mul_ps(iq3,jq1);
1366 qq32 = _mm_mul_ps(iq3,jq2);
1367 qq33 = _mm_mul_ps(iq3,jq3);
1368
1369 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1370 rcutoff_scalar = fr->rcoulomb;
1371 rcutoff = _mm_set1_ps(rcutoff_scalar);
1372 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1373
1374 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1375 rvdw = _mm_set1_ps(fr->rvdw);
1376
1377 /* Avoid stupid compiler warnings */
1378 jnrA = jnrB = jnrC = jnrD = 0;
1379 j_coord_offsetA = 0;
1380 j_coord_offsetB = 0;
1381 j_coord_offsetC = 0;
1382 j_coord_offsetD = 0;
1383
1384 outeriter = 0;
1385 inneriter = 0;
1386
1387 for(iidx=0;iidx<4*DIM;iidx++) /* DIM expands to 3 */
1388 {
1389 scratch[iidx] = 0.0;
1390 }
1391
1392 /* Start outer loop over neighborlists */
1393 for(iidx=0; iidx<nri; iidx++)
1394 {
1395 /* Load shift vector for this list */
1396 i_shift_offset = DIM*shiftidx[iidx]; /* DIM expands to 3 */
1397
1398 /* Load limits for loop over neighbors */
1399 j_index_start = jindex[iidx];
1400 j_index_end = jindex[iidx+1];
1401
1402 /* Get outer coordinate index */
1403 inr = iinr[iidx];
1404 i_coord_offset = DIM*inr; /* DIM expands to 3 */
1405
1406 /* Load i particle coords and add shift vector */
1407 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1408 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1409
1410 fix0 = _mm_setzero_ps();
1411 fiy0 = _mm_setzero_ps();
1412 fiz0 = _mm_setzero_ps();
1413 fix1 = _mm_setzero_ps();
1414 fiy1 = _mm_setzero_ps();
1415 fiz1 = _mm_setzero_ps();
1416 fix2 = _mm_setzero_ps();
1417 fiy2 = _mm_setzero_ps();
1418 fiz2 = _mm_setzero_ps();
1419 fix3 = _mm_setzero_ps();
1420 fiy3 = _mm_setzero_ps();
1421 fiz3 = _mm_setzero_ps();
1422
1423 /* Start inner kernel loop */
1424 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1425 {
1426
1427 /* Get j neighbor index, and coordinate index */
1428 jnrA = jjnr[jidx];
1429 jnrB = jjnr[jidx+1];
1430 jnrC = jjnr[jidx+2];
1431 jnrD = jjnr[jidx+3];
1432 j_coord_offsetA = DIM*jnrA; /* DIM expands to 3 */
1433 j_coord_offsetB = DIM*jnrB;
1434 j_coord_offsetC = DIM*jnrC;
1435 j_coord_offsetD = DIM*jnrD;
1436
1437 /* load j atom coordinates */
1438 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1439 x+j_coord_offsetC,x+j_coord_offsetD,
1440 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1441 &jy2,&jz2,&jx3,&jy3,&jz3);
1442
1443 /* Calculate displacement vector */
1444 dx00 = _mm_sub_ps(ix0,jx0);
1445 dy00 = _mm_sub_ps(iy0,jy0);
1446 dz00 = _mm_sub_ps(iz0,jz0);
1447 dx11 = _mm_sub_ps(ix1,jx1);
1448 dy11 = _mm_sub_ps(iy1,jy1);
1449 dz11 = _mm_sub_ps(iz1,jz1);
1450 dx12 = _mm_sub_ps(ix1,jx2);
1451 dy12 = _mm_sub_ps(iy1,jy2);
1452 dz12 = _mm_sub_ps(iz1,jz2);
1453 dx13 = _mm_sub_ps(ix1,jx3);
1454 dy13 = _mm_sub_ps(iy1,jy3);
1455 dz13 = _mm_sub_ps(iz1,jz3);
1456 dx21 = _mm_sub_ps(ix2,jx1);
1457 dy21 = _mm_sub_ps(iy2,jy1);
1458 dz21 = _mm_sub_ps(iz2,jz1);
1459 dx22 = _mm_sub_ps(ix2,jx2);
1460 dy22 = _mm_sub_ps(iy2,jy2);
1461 dz22 = _mm_sub_ps(iz2,jz2);
1462 dx23 = _mm_sub_ps(ix2,jx3);
1463 dy23 = _mm_sub_ps(iy2,jy3);
1464 dz23 = _mm_sub_ps(iz2,jz3);
1465 dx31 = _mm_sub_ps(ix3,jx1);
1466 dy31 = _mm_sub_ps(iy3,jy1);
1467 dz31 = _mm_sub_ps(iz3,jz1);
1468 dx32 = _mm_sub_ps(ix3,jx2);
1469 dy32 = _mm_sub_ps(iy3,jy2);
1470 dz32 = _mm_sub_ps(iz3,jz2);
1471 dx33 = _mm_sub_ps(ix3,jx3);
1472 dy33 = _mm_sub_ps(iy3,jy3);
1473 dz33 = _mm_sub_ps(iz3,jz3);
1474
1475 /* Calculate squared distance and things based on it */
1476 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1477 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1478 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1479 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1480 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1481 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1482 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1483 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1484 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1485 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1486
1487 rinv11 = gmx_mm_invsqrt_ps(rsq11); /* gmx_mm_invsqrt_ps expands to gmx_simd_invsqrt_f */
1488 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1489 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1490 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1491 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1492 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1493 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1494 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1495 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1496
1497 rinvsq00 = gmx_mm_inv_ps(rsq00); /* gmx_mm_inv_ps expands to gmx_simd_inv_f */
1498 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1499 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1500 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1501 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1502 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1503 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1504 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1505 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1506 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1507
1508 fjx0 = _mm_setzero_ps();
1509 fjy0 = _mm_setzero_ps();
1510 fjz0 = _mm_setzero_ps();
1511 fjx1 = _mm_setzero_ps();
1512 fjy1 = _mm_setzero_ps();
1513 fjz1 = _mm_setzero_ps();
1514 fjx2 = _mm_setzero_ps();
1515 fjy2 = _mm_setzero_ps();
1516 fjz2 = _mm_setzero_ps();
1517 fjx3 = _mm_setzero_ps();
1518 fjy3 = _mm_setzero_ps();
1519 fjz3 = _mm_setzero_ps();
1520
1521 /**************************
1522 * CALCULATE INTERACTIONS *
1523 **************************/
1524
1525 if (gmx_mm_any_lt(rsq00,rcutoff2))
1526 {
1527
1528 /* LENNARD-JONES DISPERSION/REPULSION */
1529
1530 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1531 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1532
1533 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1534
1535 fscal = fvdw;
1536
1537 fscal = _mm_and_ps(fscal,cutoff_mask);
1538
1539 /* Calculate temporary vectorial force */
1540 tx = _mm_mul_ps(fscal,dx00);
1541 ty = _mm_mul_ps(fscal,dy00);
1542 tz = _mm_mul_ps(fscal,dz00);
1543
1544 /* Update vectorial force */
1545 fix0 = _mm_add_ps(fix0,tx);
1546 fiy0 = _mm_add_ps(fiy0,ty);
1547 fiz0 = _mm_add_ps(fiz0,tz);
1548
1549 fjx0 = _mm_add_ps(fjx0,tx);
1550 fjy0 = _mm_add_ps(fjy0,ty);
1551 fjz0 = _mm_add_ps(fjz0,tz);
1552
1553 }
1554
1555 /**************************
1556 * CALCULATE INTERACTIONS *
1557 **************************/
1558
1559 if (gmx_mm_any_lt(rsq11,rcutoff2))
1560 {
1561
1562 /* REACTION-FIELD ELECTROSTATICS */
1563 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1564
1565 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1566
1567 fscal = felec;
1568
1569 fscal = _mm_and_ps(fscal,cutoff_mask);
1570
1571 /* Calculate temporary vectorial force */
1572 tx = _mm_mul_ps(fscal,dx11);
1573 ty = _mm_mul_ps(fscal,dy11);
1574 tz = _mm_mul_ps(fscal,dz11);
1575
1576 /* Update vectorial force */
1577 fix1 = _mm_add_ps(fix1,tx);
1578 fiy1 = _mm_add_ps(fiy1,ty);
1579 fiz1 = _mm_add_ps(fiz1,tz);
1580
1581 fjx1 = _mm_add_ps(fjx1,tx);
1582 fjy1 = _mm_add_ps(fjy1,ty);
1583 fjz1 = _mm_add_ps(fjz1,tz);
1584
1585 }
1586
1587 /**************************
1588 * CALCULATE INTERACTIONS *
1589 **************************/
1590
1591 if (gmx_mm_any_lt(rsq12,rcutoff2))
1592 {
1593
1594 /* REACTION-FIELD ELECTROSTATICS */
1595 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1596
1597 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1598
1599 fscal = felec;
1600
1601 fscal = _mm_and_ps(fscal,cutoff_mask);
1602
1603 /* Calculate temporary vectorial force */
1604 tx = _mm_mul_ps(fscal,dx12);
1605 ty = _mm_mul_ps(fscal,dy12);
1606 tz = _mm_mul_ps(fscal,dz12);
1607
1608 /* Update vectorial force */
1609 fix1 = _mm_add_ps(fix1,tx);
1610 fiy1 = _mm_add_ps(fiy1,ty);
1611 fiz1 = _mm_add_ps(fiz1,tz);
1612
1613 fjx2 = _mm_add_ps(fjx2,tx);
1614 fjy2 = _mm_add_ps(fjy2,ty);
1615 fjz2 = _mm_add_ps(fjz2,tz);
1616
1617 }
1618
1619 /**************************
1620 * CALCULATE INTERACTIONS *
1621 **************************/
1622
1623 if (gmx_mm_any_lt(rsq13,rcutoff2))
1624 {
1625
1626 /* REACTION-FIELD ELECTROSTATICS */
1627 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1628
1629 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1630
1631 fscal = felec;
1632
1633 fscal = _mm_and_ps(fscal,cutoff_mask);
1634
1635 /* Calculate temporary vectorial force */
1636 tx = _mm_mul_ps(fscal,dx13);
1637 ty = _mm_mul_ps(fscal,dy13);
1638 tz = _mm_mul_ps(fscal,dz13);
1639
1640 /* Update vectorial force */
1641 fix1 = _mm_add_ps(fix1,tx);
1642 fiy1 = _mm_add_ps(fiy1,ty);
1643 fiz1 = _mm_add_ps(fiz1,tz);
1644
1645 fjx3 = _mm_add_ps(fjx3,tx);
1646 fjy3 = _mm_add_ps(fjy3,ty);
1647 fjz3 = _mm_add_ps(fjz3,tz);
1648
1649 }
1650
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1654
1655 if (gmx_mm_any_lt(rsq21,rcutoff2))
1656 {
1657
1658 /* REACTION-FIELD ELECTROSTATICS */
1659 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1660
1661 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1662
1663 fscal = felec;
1664
1665 fscal = _mm_and_ps(fscal,cutoff_mask);
1666
1667 /* Calculate temporary vectorial force */
1668 tx = _mm_mul_ps(fscal,dx21);
1669 ty = _mm_mul_ps(fscal,dy21);
1670 tz = _mm_mul_ps(fscal,dz21);
1671
1672 /* Update vectorial force */
1673 fix2 = _mm_add_ps(fix2,tx);
1674 fiy2 = _mm_add_ps(fiy2,ty);
1675 fiz2 = _mm_add_ps(fiz2,tz);
1676
1677 fjx1 = _mm_add_ps(fjx1,tx);
1678 fjy1 = _mm_add_ps(fjy1,ty);
1679 fjz1 = _mm_add_ps(fjz1,tz);
1680
1681 }
1682
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1686
1687 if (gmx_mm_any_lt(rsq22,rcutoff2))
1688 {
1689
1690 /* REACTION-FIELD ELECTROSTATICS */
1691 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1692
1693 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1694
1695 fscal = felec;
1696
1697 fscal = _mm_and_ps(fscal,cutoff_mask);
1698
1699 /* Calculate temporary vectorial force */
1700 tx = _mm_mul_ps(fscal,dx22);
1701 ty = _mm_mul_ps(fscal,dy22);
1702 tz = _mm_mul_ps(fscal,dz22);
1703
1704 /* Update vectorial force */
1705 fix2 = _mm_add_ps(fix2,tx);
1706 fiy2 = _mm_add_ps(fiy2,ty);
1707 fiz2 = _mm_add_ps(fiz2,tz);
1708
1709 fjx2 = _mm_add_ps(fjx2,tx);
1710 fjy2 = _mm_add_ps(fjy2,ty);
1711 fjz2 = _mm_add_ps(fjz2,tz);
1712
1713 }
1714
1715 /**************************
1716 * CALCULATE INTERACTIONS *
1717 **************************/
1718
1719 if (gmx_mm_any_lt(rsq23,rcutoff2))
1720 {
1721
1722 /* REACTION-FIELD ELECTROSTATICS */
1723 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1724
1725 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1726
1727 fscal = felec;
1728
1729 fscal = _mm_and_ps(fscal,cutoff_mask);
1730
1731 /* Calculate temporary vectorial force */
1732 tx = _mm_mul_ps(fscal,dx23);
1733 ty = _mm_mul_ps(fscal,dy23);
1734 tz = _mm_mul_ps(fscal,dz23);
1735
1736 /* Update vectorial force */
1737 fix2 = _mm_add_ps(fix2,tx);
1738 fiy2 = _mm_add_ps(fiy2,ty);
1739 fiz2 = _mm_add_ps(fiz2,tz);
1740
1741 fjx3 = _mm_add_ps(fjx3,tx);
1742 fjy3 = _mm_add_ps(fjy3,ty);
1743 fjz3 = _mm_add_ps(fjz3,tz);
1744
1745 }
1746
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1750
1751 if (gmx_mm_any_lt(rsq31,rcutoff2))
1752 {
1753
1754 /* REACTION-FIELD ELECTROSTATICS */
1755 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1756
1757 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1758
1759 fscal = felec;
1760
1761 fscal = _mm_and_ps(fscal,cutoff_mask);
1762
1763 /* Calculate temporary vectorial force */
1764 tx = _mm_mul_ps(fscal,dx31);
1765 ty = _mm_mul_ps(fscal,dy31);
1766 tz = _mm_mul_ps(fscal,dz31);
1767
1768 /* Update vectorial force */
1769 fix3 = _mm_add_ps(fix3,tx);
1770 fiy3 = _mm_add_ps(fiy3,ty);
1771 fiz3 = _mm_add_ps(fiz3,tz);
1772
1773 fjx1 = _mm_add_ps(fjx1,tx);
1774 fjy1 = _mm_add_ps(fjy1,ty);
1775 fjz1 = _mm_add_ps(fjz1,tz);
1776
1777 }
1778
1779 /**************************
1780 * CALCULATE INTERACTIONS *
1781 **************************/
1782
1783 if (gmx_mm_any_lt(rsq32,rcutoff2))
1784 {
1785
1786 /* REACTION-FIELD ELECTROSTATICS */
1787 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1788
1789 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1790
1791 fscal = felec;
1792
1793 fscal = _mm_and_ps(fscal,cutoff_mask);
1794
1795 /* Calculate temporary vectorial force */
1796 tx = _mm_mul_ps(fscal,dx32);
1797 ty = _mm_mul_ps(fscal,dy32);
1798 tz = _mm_mul_ps(fscal,dz32);
1799
1800 /* Update vectorial force */
1801 fix3 = _mm_add_ps(fix3,tx);
1802 fiy3 = _mm_add_ps(fiy3,ty);
1803 fiz3 = _mm_add_ps(fiz3,tz);
1804
1805 fjx2 = _mm_add_ps(fjx2,tx);
1806 fjy2 = _mm_add_ps(fjy2,ty);
1807 fjz2 = _mm_add_ps(fjz2,tz);
1808
1809 }
1810
1811 /**************************
1812 * CALCULATE INTERACTIONS *
1813 **************************/
1814
1815 if (gmx_mm_any_lt(rsq33,rcutoff2))
1816 {
1817
1818 /* REACTION-FIELD ELECTROSTATICS */
1819 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1820
1821 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1822
1823 fscal = felec;
1824
1825 fscal = _mm_and_ps(fscal,cutoff_mask);
1826
1827 /* Calculate temporary vectorial force */
1828 tx = _mm_mul_ps(fscal,dx33);
1829 ty = _mm_mul_ps(fscal,dy33);
1830 tz = _mm_mul_ps(fscal,dz33);
1831
1832 /* Update vectorial force */
1833 fix3 = _mm_add_ps(fix3,tx);
1834 fiy3 = _mm_add_ps(fiy3,ty);
1835 fiz3 = _mm_add_ps(fiz3,tz);
1836
1837 fjx3 = _mm_add_ps(fjx3,tx);
1838 fjy3 = _mm_add_ps(fjy3,ty);
1839 fjz3 = _mm_add_ps(fjz3,tz);
1840
1841 }
1842
1843 fjptrA = f+j_coord_offsetA;
1844 fjptrB = f+j_coord_offsetB;
1845 fjptrC = f+j_coord_offsetC;
1846 fjptrD = f+j_coord_offsetD;
1847
1848 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1849 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1850 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1851
1852 /* Inner loop uses 303 flops */
1853 }
1854
1855 if(jidx<j_index_end)
1856 {
1857
1858 /* Get j neighbor index, and coordinate index */
1859 jnrlistA = jjnr[jidx];
1860 jnrlistB = jjnr[jidx+1];
1861 jnrlistC = jjnr[jidx+2];
1862 jnrlistD = jjnr[jidx+3];
1863 /* Sign of each element will be negative for non-real atoms.
1864 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1865 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1866 */
1867 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())); /* gmx_mm_castsi128_ps expands to _mm_castsi128_ps */
1868 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1869 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1870 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1871 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1872 j_coord_offsetA = DIM*jnrA; /* DIM expands to 3 */
1873 j_coord_offsetB = DIM*jnrB;
1874 j_coord_offsetC = DIM*jnrC;
1875 j_coord_offsetD = DIM*jnrD;
1876
1877 /* load j atom coordinates */
1878 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1879 x+j_coord_offsetC,x+j_coord_offsetD,
1880 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1881 &jy2,&jz2,&jx3,&jy3,&jz3);
1882
1883 /* Calculate displacement vector */
1884 dx00 = _mm_sub_ps(ix0,jx0);
1885 dy00 = _mm_sub_ps(iy0,jy0);
1886 dz00 = _mm_sub_ps(iz0,jz0);
1887 dx11 = _mm_sub_ps(ix1,jx1);
1888 dy11 = _mm_sub_ps(iy1,jy1);
1889 dz11 = _mm_sub_ps(iz1,jz1);
1890 dx12 = _mm_sub_ps(ix1,jx2);
1891 dy12 = _mm_sub_ps(iy1,jy2);
1892 dz12 = _mm_sub_ps(iz1,jz2);
1893 dx13 = _mm_sub_ps(ix1,jx3);
1894 dy13 = _mm_sub_ps(iy1,jy3);
1895 dz13 = _mm_sub_ps(iz1,jz3);
1896 dx21 = _mm_sub_ps(ix2,jx1);
1897 dy21 = _mm_sub_ps(iy2,jy1);
1898 dz21 = _mm_sub_ps(iz2,jz1);
1899 dx22 = _mm_sub_ps(ix2,jx2);
1900 dy22 = _mm_sub_ps(iy2,jy2);
1901 dz22 = _mm_sub_ps(iz2,jz2);
1902 dx23 = _mm_sub_ps(ix2,jx3);
1903 dy23 = _mm_sub_ps(iy2,jy3);
1904 dz23 = _mm_sub_ps(iz2,jz3);
1905 dx31 = _mm_sub_ps(ix3,jx1);
1906 dy31 = _mm_sub_ps(iy3,jy1);
1907 dz31 = _mm_sub_ps(iz3,jz1);
1908 dx32 = _mm_sub_ps(ix3,jx2);
1909 dy32 = _mm_sub_ps(iy3,jy2);
1910 dz32 = _mm_sub_ps(iz3,jz2);
1911 dx33 = _mm_sub_ps(ix3,jx3);
1912 dy33 = _mm_sub_ps(iy3,jy3);
1913 dz33 = _mm_sub_ps(iz3,jz3);
1914
1915 /* Calculate squared distance and things based on it */
1916 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1917 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1918 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1919 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1920 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1921 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1922 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1923 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1924 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1925 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1926
1927 rinv11 = gmx_mm_invsqrt_ps(rsq11); /* gmx_mm_invsqrt_ps expands to gmx_simd_invsqrt_f */
1928 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1929 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1930 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1931 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1932 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1933 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1934 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1935 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1936
1937 rinvsq00 = gmx_mm_inv_ps(rsq00); /* gmx_mm_inv_ps expands to gmx_simd_inv_f */
1938 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1939 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1940 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1941 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1942 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1943 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1944 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1945 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1946 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1947
1948 fjx0 = _mm_setzero_ps();
1949 fjy0 = _mm_setzero_ps();
1950 fjz0 = _mm_setzero_ps();
1951 fjx1 = _mm_setzero_ps();
1952 fjy1 = _mm_setzero_ps();
1953 fjz1 = _mm_setzero_ps();
1954 fjx2 = _mm_setzero_ps();
1955 fjy2 = _mm_setzero_ps();
1956 fjz2 = _mm_setzero_ps();
1957 fjx3 = _mm_setzero_ps();
1958 fjy3 = _mm_setzero_ps();
1959 fjz3 = _mm_setzero_ps();
1960
1961 /**************************
1962 * CALCULATE INTERACTIONS *
1963 **************************/
1964
1965 if (gmx_mm_any_lt(rsq00,rcutoff2))
1966 {
1967
1968 /* LENNARD-JONES DISPERSION/REPULSION */
1969
1970 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1971 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1972
1973 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1974
1975 fscal = fvdw;
1976
1977 fscal = _mm_and_ps(fscal,cutoff_mask);
1978
1979 fscal = _mm_andnot_ps(dummy_mask,fscal);
1980
1981 /* Calculate temporary vectorial force */
1982 tx = _mm_mul_ps(fscal,dx00);
1983 ty = _mm_mul_ps(fscal,dy00);
1984 tz = _mm_mul_ps(fscal,dz00);
1985
1986 /* Update vectorial force */
1987 fix0 = _mm_add_ps(fix0,tx);
1988 fiy0 = _mm_add_ps(fiy0,ty);
1989 fiz0 = _mm_add_ps(fiz0,tz);
1990
1991 fjx0 = _mm_add_ps(fjx0,tx);
1992 fjy0 = _mm_add_ps(fjy0,ty);
1993 fjz0 = _mm_add_ps(fjz0,tz);
1994
1995 }
1996
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2000
2001 if (gmx_mm_any_lt(rsq11,rcutoff2))
2002 {
2003
2004 /* REACTION-FIELD ELECTROSTATICS */
2005 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2006
2007 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2008
2009 fscal = felec;
2010
2011 fscal = _mm_and_ps(fscal,cutoff_mask);
2012
2013 fscal = _mm_andnot_ps(dummy_mask,fscal);
2014
2015 /* Calculate temporary vectorial force */
2016 tx = _mm_mul_ps(fscal,dx11);
2017 ty = _mm_mul_ps(fscal,dy11);
2018 tz = _mm_mul_ps(fscal,dz11);
2019
2020 /* Update vectorial force */
2021 fix1 = _mm_add_ps(fix1,tx);
2022 fiy1 = _mm_add_ps(fiy1,ty);
2023 fiz1 = _mm_add_ps(fiz1,tz);
2024
2025 fjx1 = _mm_add_ps(fjx1,tx);
2026 fjy1 = _mm_add_ps(fjy1,ty);
2027 fjz1 = _mm_add_ps(fjz1,tz);
2028
2029 }
2030
2031 /**************************
2032 * CALCULATE INTERACTIONS *
2033 **************************/
2034
2035 if (gmx_mm_any_lt(rsq12,rcutoff2))
2036 {
2037
2038 /* REACTION-FIELD ELECTROSTATICS */
2039 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2040
2041 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2042
2043 fscal = felec;
2044
2045 fscal = _mm_and_ps(fscal,cutoff_mask);
2046
2047 fscal = _mm_andnot_ps(dummy_mask,fscal);
2048
2049 /* Calculate temporary vectorial force */
2050 tx = _mm_mul_ps(fscal,dx12);
2051 ty = _mm_mul_ps(fscal,dy12);
2052 tz = _mm_mul_ps(fscal,dz12);
2053
2054 /* Update vectorial force */
2055 fix1 = _mm_add_ps(fix1,tx);
2056 fiy1 = _mm_add_ps(fiy1,ty);
2057 fiz1 = _mm_add_ps(fiz1,tz);
2058
2059 fjx2 = _mm_add_ps(fjx2,tx);
2060 fjy2 = _mm_add_ps(fjy2,ty);
2061 fjz2 = _mm_add_ps(fjz2,tz);
2062
2063 }
2064
2065 /**************************
2066 * CALCULATE INTERACTIONS *
2067 **************************/
2068
2069 if (gmx_mm_any_lt(rsq13,rcutoff2))
2070 {
2071
2072 /* REACTION-FIELD ELECTROSTATICS */
2073 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2074
2075 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2076
2077 fscal = felec;
2078
2079 fscal = _mm_and_ps(fscal,cutoff_mask);
2080
2081 fscal = _mm_andnot_ps(dummy_mask,fscal);
2082
2083 /* Calculate temporary vectorial force */
2084 tx = _mm_mul_ps(fscal,dx13);
2085 ty = _mm_mul_ps(fscal,dy13);
2086 tz = _mm_mul_ps(fscal,dz13);
2087
2088 /* Update vectorial force */
2089 fix1 = _mm_add_ps(fix1,tx);
2090 fiy1 = _mm_add_ps(fiy1,ty);
2091 fiz1 = _mm_add_ps(fiz1,tz);
2092
2093 fjx3 = _mm_add_ps(fjx3,tx);
2094 fjy3 = _mm_add_ps(fjy3,ty);
2095 fjz3 = _mm_add_ps(fjz3,tz);
2096
2097 }
2098
2099 /**************************
2100 * CALCULATE INTERACTIONS *
2101 **************************/
2102
2103 if (gmx_mm_any_lt(rsq21,rcutoff2))
2104 {
2105
2106 /* REACTION-FIELD ELECTROSTATICS */
2107 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2108
2109 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2110
2111 fscal = felec;
2112
2113 fscal = _mm_and_ps(fscal,cutoff_mask);
2114
2115 fscal = _mm_andnot_ps(dummy_mask,fscal);
2116
2117 /* Calculate temporary vectorial force */
2118 tx = _mm_mul_ps(fscal,dx21);
2119 ty = _mm_mul_ps(fscal,dy21);
2120 tz = _mm_mul_ps(fscal,dz21);
2121
2122 /* Update vectorial force */
2123 fix2 = _mm_add_ps(fix2,tx);
2124 fiy2 = _mm_add_ps(fiy2,ty);
2125 fiz2 = _mm_add_ps(fiz2,tz);
2126
2127 fjx1 = _mm_add_ps(fjx1,tx);
2128 fjy1 = _mm_add_ps(fjy1,ty);
2129 fjz1 = _mm_add_ps(fjz1,tz);
2130
2131 }
2132
2133 /**************************
2134 * CALCULATE INTERACTIONS *
2135 **************************/
2136
2137 if (gmx_mm_any_lt(rsq22,rcutoff2))
2138 {
2139
2140 /* REACTION-FIELD ELECTROSTATICS */
2141 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2142
2143 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2144
2145 fscal = felec;
2146
2147 fscal = _mm_and_ps(fscal,cutoff_mask);
2148
2149 fscal = _mm_andnot_ps(dummy_mask,fscal);
2150
2151 /* Calculate temporary vectorial force */
2152 tx = _mm_mul_ps(fscal,dx22);
2153 ty = _mm_mul_ps(fscal,dy22);
2154 tz = _mm_mul_ps(fscal,dz22);
2155
2156 /* Update vectorial force */
2157 fix2 = _mm_add_ps(fix2,tx);
2158 fiy2 = _mm_add_ps(fiy2,ty);
2159 fiz2 = _mm_add_ps(fiz2,tz);
2160
2161 fjx2 = _mm_add_ps(fjx2,tx);
2162 fjy2 = _mm_add_ps(fjy2,ty);
2163 fjz2 = _mm_add_ps(fjz2,tz);
2164
2165 }
2166
2167 /**************************
2168 * CALCULATE INTERACTIONS *
2169 **************************/
2170
2171 if (gmx_mm_any_lt(rsq23,rcutoff2))
2172 {
2173
2174 /* REACTION-FIELD ELECTROSTATICS */
2175 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2176
2177 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2178
2179 fscal = felec;
2180
2181 fscal = _mm_and_ps(fscal,cutoff_mask);
2182
2183 fscal = _mm_andnot_ps(dummy_mask,fscal);
2184
2185 /* Calculate temporary vectorial force */
2186 tx = _mm_mul_ps(fscal,dx23);
2187 ty = _mm_mul_ps(fscal,dy23);
2188 tz = _mm_mul_ps(fscal,dz23);
2189
2190 /* Update vectorial force */
2191 fix2 = _mm_add_ps(fix2,tx);
2192 fiy2 = _mm_add_ps(fiy2,ty);
2193 fiz2 = _mm_add_ps(fiz2,tz);
2194
2195 fjx3 = _mm_add_ps(fjx3,tx);
2196 fjy3 = _mm_add_ps(fjy3,ty);
2197 fjz3 = _mm_add_ps(fjz3,tz);
2198
2199 }
2200
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2204
2205 if (gmx_mm_any_lt(rsq31,rcutoff2))
2206 {
2207
2208 /* REACTION-FIELD ELECTROSTATICS */
2209 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2210
2211 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2212
2213 fscal = felec;
2214
2215 fscal = _mm_and_ps(fscal,cutoff_mask);
2216
2217 fscal = _mm_andnot_ps(dummy_mask,fscal);
2218
2219 /* Calculate temporary vectorial force */
2220 tx = _mm_mul_ps(fscal,dx31);
2221 ty = _mm_mul_ps(fscal,dy31);
2222 tz = _mm_mul_ps(fscal,dz31);
2223
2224 /* Update vectorial force */
2225 fix3 = _mm_add_ps(fix3,tx);
2226 fiy3 = _mm_add_ps(fiy3,ty);
2227 fiz3 = _mm_add_ps(fiz3,tz);
2228
2229 fjx1 = _mm_add_ps(fjx1,tx);
2230 fjy1 = _mm_add_ps(fjy1,ty);
2231 fjz1 = _mm_add_ps(fjz1,tz);
2232
2233 }
2234
2235 /**************************
2236 * CALCULATE INTERACTIONS *
2237 **************************/
2238
2239 if (gmx_mm_any_lt(rsq32,rcutoff2))
2240 {
2241
2242 /* REACTION-FIELD ELECTROSTATICS */
2243 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2244
2245 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2246
2247 fscal = felec;
2248
2249 fscal = _mm_and_ps(fscal,cutoff_mask);
2250
2251 fscal = _mm_andnot_ps(dummy_mask,fscal);
2252
2253 /* Calculate temporary vectorial force */
2254 tx = _mm_mul_ps(fscal,dx32);
2255 ty = _mm_mul_ps(fscal,dy32);
2256 tz = _mm_mul_ps(fscal,dz32);
2257
2258 /* Update vectorial force */
2259 fix3 = _mm_add_ps(fix3,tx);
2260 fiy3 = _mm_add_ps(fiy3,ty);
2261 fiz3 = _mm_add_ps(fiz3,tz);
2262
2263 fjx2 = _mm_add_ps(fjx2,tx);
2264 fjy2 = _mm_add_ps(fjy2,ty);
2265 fjz2 = _mm_add_ps(fjz2,tz);
2266
2267 }
2268
2269 /**************************
2270 * CALCULATE INTERACTIONS *
2271 **************************/
2272
2273 if (gmx_mm_any_lt(rsq33,rcutoff2))
2274 {
2275
2276 /* REACTION-FIELD ELECTROSTATICS */
2277 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2278
2279 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2280
2281 fscal = felec;
2282
2283 fscal = _mm_and_ps(fscal,cutoff_mask);
2284
2285 fscal = _mm_andnot_ps(dummy_mask,fscal);
2286
2287 /* Calculate temporary vectorial force */
2288 tx = _mm_mul_ps(fscal,dx33);
2289 ty = _mm_mul_ps(fscal,dy33);
2290 tz = _mm_mul_ps(fscal,dz33);
2291
2292 /* Update vectorial force */
2293 fix3 = _mm_add_ps(fix3,tx);
2294 fiy3 = _mm_add_ps(fiy3,ty);
2295 fiz3 = _mm_add_ps(fiz3,tz);
2296
2297 fjx3 = _mm_add_ps(fjx3,tx);
2298 fjy3 = _mm_add_ps(fjy3,ty);
2299 fjz3 = _mm_add_ps(fjz3,tz);
2300
2301 }
2302
2303 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2304 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2305 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2306 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2307
2308 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2309 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2310 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2311
2312 /* Inner loop uses 303 flops */
2313 }
2314
2315 /* End of innermost loop */
2316
2317 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2318 f+i_coord_offset,fshift+i_shift_offset);
2319
2320 /* Increment number of inner iterations */
2321 inneriter += j_index_end - j_index_start;
2322
2323 /* Outer loop uses 24 flops */
2324 }
2325
2326 /* Increment number of outer iterations */
2327 outeriter += nri;
2328
2329 /* Update outer/inner flops */
2330
2331 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
2332}