Bug Summary

File: gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse4_1_single.c
Location: line 1328, column 5
Description: Value stored to 'jnrA' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_single
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 real *charge;
108 int nvdwtype;
109 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 int *vdwtype;
111 real *vdwparam;
112 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
113 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
114 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
115 real rswitch_scalar,d_scalar;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
120 x = xx[0];
121 f = ff[0];
122
123 nri = nlist->nri;
124 iinr = nlist->iinr;
125 jindex = nlist->jindex;
126 jjnr = nlist->jjnr;
127 shiftidx = nlist->shift;
128 gid = nlist->gid;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
133 krf = _mm_set1_ps(fr->ic->k_rf);
134 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
135 crf = _mm_set1_ps(fr->ic->c_rf);
136 nvdwtype = fr->ntype;
137 vdwparam = fr->nbfp;
138 vdwtype = mdatoms->typeA;
139
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146
147 jq0 = _mm_set1_ps(charge[inr+0]);
148 jq1 = _mm_set1_ps(charge[inr+1]);
149 jq2 = _mm_set1_ps(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_ps(iq0,jq0);
152 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_ps(iq0,jq1);
155 qq02 = _mm_mul_ps(iq0,jq2);
156 qq10 = _mm_mul_ps(iq1,jq0);
157 qq11 = _mm_mul_ps(iq1,jq1);
158 qq12 = _mm_mul_ps(iq1,jq2);
159 qq20 = _mm_mul_ps(iq2,jq0);
160 qq21 = _mm_mul_ps(iq2,jq1);
161 qq22 = _mm_mul_ps(iq2,jq2);
162
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar = fr->rcoulomb;
165 rcutoff = _mm_set1_ps(rcutoff_scalar);
166 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
167
168 rswitch_scalar = fr->rvdw_switch;
169 rswitch = _mm_set1_ps(rswitch_scalar);
170 /* Setup switch parameters */
171 d_scalar = rcutoff_scalar-rswitch_scalar;
172 d = _mm_set1_ps(d_scalar);
173 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
174 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
177 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179
180 /* Avoid stupid compiler warnings */
181 jnrA = jnrB = jnrC = jnrD = 0;
182 j_coord_offsetA = 0;
183 j_coord_offsetB = 0;
184 j_coord_offsetC = 0;
185 j_coord_offsetD = 0;
186
187 outeriter = 0;
188 inneriter = 0;
189
190 for(iidx=0;iidx<4*DIM3;iidx++)
191 {
192 scratch[iidx] = 0.0;
193 }
194
195 /* Start outer loop over neighborlists */
196 for(iidx=0; iidx<nri; iidx++)
197 {
198 /* Load shift vector for this list */
199 i_shift_offset = DIM3*shiftidx[iidx];
200
201 /* Load limits for loop over neighbors */
202 j_index_start = jindex[iidx];
203 j_index_end = jindex[iidx+1];
204
205 /* Get outer coordinate index */
206 inr = iinr[iidx];
207 i_coord_offset = DIM3*inr;
208
209 /* Load i particle coords and add shift vector */
210 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
211 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
212
213 fix0 = _mm_setzero_ps();
214 fiy0 = _mm_setzero_ps();
215 fiz0 = _mm_setzero_ps();
216 fix1 = _mm_setzero_ps();
217 fiy1 = _mm_setzero_ps();
218 fiz1 = _mm_setzero_ps();
219 fix2 = _mm_setzero_ps();
220 fiy2 = _mm_setzero_ps();
221 fiz2 = _mm_setzero_ps();
222
223 /* Reset potential sums */
224 velecsum = _mm_setzero_ps();
225 vvdwsum = _mm_setzero_ps();
226
227 /* Start inner kernel loop */
228 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
229 {
230
231 /* Get j neighbor index, and coordinate index */
232 jnrA = jjnr[jidx];
233 jnrB = jjnr[jidx+1];
234 jnrC = jjnr[jidx+2];
235 jnrD = jjnr[jidx+3];
236 j_coord_offsetA = DIM3*jnrA;
237 j_coord_offsetB = DIM3*jnrB;
238 j_coord_offsetC = DIM3*jnrC;
239 j_coord_offsetD = DIM3*jnrD;
240
241 /* load j atom coordinates */
242 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
243 x+j_coord_offsetC,x+j_coord_offsetD,
244 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
245
246 /* Calculate displacement vector */
247 dx00 = _mm_sub_ps(ix0,jx0);
248 dy00 = _mm_sub_ps(iy0,jy0);
249 dz00 = _mm_sub_ps(iz0,jz0);
250 dx01 = _mm_sub_ps(ix0,jx1);
251 dy01 = _mm_sub_ps(iy0,jy1);
252 dz01 = _mm_sub_ps(iz0,jz1);
253 dx02 = _mm_sub_ps(ix0,jx2);
254 dy02 = _mm_sub_ps(iy0,jy2);
255 dz02 = _mm_sub_ps(iz0,jz2);
256 dx10 = _mm_sub_ps(ix1,jx0);
257 dy10 = _mm_sub_ps(iy1,jy0);
258 dz10 = _mm_sub_ps(iz1,jz0);
259 dx11 = _mm_sub_ps(ix1,jx1);
260 dy11 = _mm_sub_ps(iy1,jy1);
261 dz11 = _mm_sub_ps(iz1,jz1);
262 dx12 = _mm_sub_ps(ix1,jx2);
263 dy12 = _mm_sub_ps(iy1,jy2);
264 dz12 = _mm_sub_ps(iz1,jz2);
265 dx20 = _mm_sub_ps(ix2,jx0);
266 dy20 = _mm_sub_ps(iy2,jy0);
267 dz20 = _mm_sub_ps(iz2,jz0);
268 dx21 = _mm_sub_ps(ix2,jx1);
269 dy21 = _mm_sub_ps(iy2,jy1);
270 dz21 = _mm_sub_ps(iz2,jz1);
271 dx22 = _mm_sub_ps(ix2,jx2);
272 dy22 = _mm_sub_ps(iy2,jy2);
273 dz22 = _mm_sub_ps(iz2,jz2);
274
275 /* Calculate squared distance and things based on it */
276 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
277 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
278 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
279 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
280 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
281 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
282 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
283 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
284 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
285
286 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
287 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
288 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
289 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
290 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
291 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
292 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
293 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
294 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
295
296 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
297 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
298 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
299 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
300 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
301 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
302 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
303 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
304 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
305
306 fjx0 = _mm_setzero_ps();
307 fjy0 = _mm_setzero_ps();
308 fjz0 = _mm_setzero_ps();
309 fjx1 = _mm_setzero_ps();
310 fjy1 = _mm_setzero_ps();
311 fjz1 = _mm_setzero_ps();
312 fjx2 = _mm_setzero_ps();
313 fjy2 = _mm_setzero_ps();
314 fjz2 = _mm_setzero_ps();
315
316 /**************************
317 * CALCULATE INTERACTIONS *
318 **************************/
319
320 if (gmx_mm_any_lt(rsq00,rcutoff2))
321 {
322
323 r00 = _mm_mul_ps(rsq00,rinv00);
324
325 /* REACTION-FIELD ELECTROSTATICS */
326 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
327 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
328
329 /* LENNARD-JONES DISPERSION/REPULSION */
330
331 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
332 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
333 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
334 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
335 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
336
337 d = _mm_sub_ps(r00,rswitch);
338 d = _mm_max_ps(d,_mm_setzero_ps());
339 d2 = _mm_mul_ps(d,d);
340 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
341
342 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
343
344 /* Evaluate switch function */
345 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
346 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
347 vvdw = _mm_mul_ps(vvdw,sw);
348 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
349
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 velec = _mm_and_ps(velec,cutoff_mask);
352 velecsum = _mm_add_ps(velecsum,velec);
353 vvdw = _mm_and_ps(vvdw,cutoff_mask);
354 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
355
356 fscal = _mm_add_ps(felec,fvdw);
357
358 fscal = _mm_and_ps(fscal,cutoff_mask);
359
360 /* Calculate temporary vectorial force */
361 tx = _mm_mul_ps(fscal,dx00);
362 ty = _mm_mul_ps(fscal,dy00);
363 tz = _mm_mul_ps(fscal,dz00);
364
365 /* Update vectorial force */
366 fix0 = _mm_add_ps(fix0,tx);
367 fiy0 = _mm_add_ps(fiy0,ty);
368 fiz0 = _mm_add_ps(fiz0,tz);
369
370 fjx0 = _mm_add_ps(fjx0,tx);
371 fjy0 = _mm_add_ps(fjy0,ty);
372 fjz0 = _mm_add_ps(fjz0,tz);
373
374 }
375
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
379
380 if (gmx_mm_any_lt(rsq01,rcutoff2))
381 {
382
383 /* REACTION-FIELD ELECTROSTATICS */
384 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
385 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
386
387 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
388
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velec = _mm_and_ps(velec,cutoff_mask);
391 velecsum = _mm_add_ps(velecsum,velec);
392
393 fscal = felec;
394
395 fscal = _mm_and_ps(fscal,cutoff_mask);
396
397 /* Calculate temporary vectorial force */
398 tx = _mm_mul_ps(fscal,dx01);
399 ty = _mm_mul_ps(fscal,dy01);
400 tz = _mm_mul_ps(fscal,dz01);
401
402 /* Update vectorial force */
403 fix0 = _mm_add_ps(fix0,tx);
404 fiy0 = _mm_add_ps(fiy0,ty);
405 fiz0 = _mm_add_ps(fiz0,tz);
406
407 fjx1 = _mm_add_ps(fjx1,tx);
408 fjy1 = _mm_add_ps(fjy1,ty);
409 fjz1 = _mm_add_ps(fjz1,tz);
410
411 }
412
413 /**************************
414 * CALCULATE INTERACTIONS *
415 **************************/
416
417 if (gmx_mm_any_lt(rsq02,rcutoff2))
418 {
419
420 /* REACTION-FIELD ELECTROSTATICS */
421 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
422 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
423
424 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
425
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velec = _mm_and_ps(velec,cutoff_mask);
428 velecsum = _mm_add_ps(velecsum,velec);
429
430 fscal = felec;
431
432 fscal = _mm_and_ps(fscal,cutoff_mask);
433
434 /* Calculate temporary vectorial force */
435 tx = _mm_mul_ps(fscal,dx02);
436 ty = _mm_mul_ps(fscal,dy02);
437 tz = _mm_mul_ps(fscal,dz02);
438
439 /* Update vectorial force */
440 fix0 = _mm_add_ps(fix0,tx);
441 fiy0 = _mm_add_ps(fiy0,ty);
442 fiz0 = _mm_add_ps(fiz0,tz);
443
444 fjx2 = _mm_add_ps(fjx2,tx);
445 fjy2 = _mm_add_ps(fjy2,ty);
446 fjz2 = _mm_add_ps(fjz2,tz);
447
448 }
449
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
453
454 if (gmx_mm_any_lt(rsq10,rcutoff2))
455 {
456
457 /* REACTION-FIELD ELECTROSTATICS */
458 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
459 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
460
461 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
462
463 /* Update potential sum for this i atom from the interaction with this j atom. */
464 velec = _mm_and_ps(velec,cutoff_mask);
465 velecsum = _mm_add_ps(velecsum,velec);
466
467 fscal = felec;
468
469 fscal = _mm_and_ps(fscal,cutoff_mask);
470
471 /* Calculate temporary vectorial force */
472 tx = _mm_mul_ps(fscal,dx10);
473 ty = _mm_mul_ps(fscal,dy10);
474 tz = _mm_mul_ps(fscal,dz10);
475
476 /* Update vectorial force */
477 fix1 = _mm_add_ps(fix1,tx);
478 fiy1 = _mm_add_ps(fiy1,ty);
479 fiz1 = _mm_add_ps(fiz1,tz);
480
481 fjx0 = _mm_add_ps(fjx0,tx);
482 fjy0 = _mm_add_ps(fjy0,ty);
483 fjz0 = _mm_add_ps(fjz0,tz);
484
485 }
486
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
490
491 if (gmx_mm_any_lt(rsq11,rcutoff2))
492 {
493
494 /* REACTION-FIELD ELECTROSTATICS */
495 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
496 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
497
498 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
499
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velec = _mm_and_ps(velec,cutoff_mask);
502 velecsum = _mm_add_ps(velecsum,velec);
503
504 fscal = felec;
505
506 fscal = _mm_and_ps(fscal,cutoff_mask);
507
508 /* Calculate temporary vectorial force */
509 tx = _mm_mul_ps(fscal,dx11);
510 ty = _mm_mul_ps(fscal,dy11);
511 tz = _mm_mul_ps(fscal,dz11);
512
513 /* Update vectorial force */
514 fix1 = _mm_add_ps(fix1,tx);
515 fiy1 = _mm_add_ps(fiy1,ty);
516 fiz1 = _mm_add_ps(fiz1,tz);
517
518 fjx1 = _mm_add_ps(fjx1,tx);
519 fjy1 = _mm_add_ps(fjy1,ty);
520 fjz1 = _mm_add_ps(fjz1,tz);
521
522 }
523
524 /**************************
525 * CALCULATE INTERACTIONS *
526 **************************/
527
528 if (gmx_mm_any_lt(rsq12,rcutoff2))
529 {
530
531 /* REACTION-FIELD ELECTROSTATICS */
532 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
533 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
534
535 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
536
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velec = _mm_and_ps(velec,cutoff_mask);
539 velecsum = _mm_add_ps(velecsum,velec);
540
541 fscal = felec;
542
543 fscal = _mm_and_ps(fscal,cutoff_mask);
544
545 /* Calculate temporary vectorial force */
546 tx = _mm_mul_ps(fscal,dx12);
547 ty = _mm_mul_ps(fscal,dy12);
548 tz = _mm_mul_ps(fscal,dz12);
549
550 /* Update vectorial force */
551 fix1 = _mm_add_ps(fix1,tx);
552 fiy1 = _mm_add_ps(fiy1,ty);
553 fiz1 = _mm_add_ps(fiz1,tz);
554
555 fjx2 = _mm_add_ps(fjx2,tx);
556 fjy2 = _mm_add_ps(fjy2,ty);
557 fjz2 = _mm_add_ps(fjz2,tz);
558
559 }
560
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
564
565 if (gmx_mm_any_lt(rsq20,rcutoff2))
566 {
567
568 /* REACTION-FIELD ELECTROSTATICS */
569 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
570 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
571
572 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
573
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 velec = _mm_and_ps(velec,cutoff_mask);
576 velecsum = _mm_add_ps(velecsum,velec);
577
578 fscal = felec;
579
580 fscal = _mm_and_ps(fscal,cutoff_mask);
581
582 /* Calculate temporary vectorial force */
583 tx = _mm_mul_ps(fscal,dx20);
584 ty = _mm_mul_ps(fscal,dy20);
585 tz = _mm_mul_ps(fscal,dz20);
586
587 /* Update vectorial force */
588 fix2 = _mm_add_ps(fix2,tx);
589 fiy2 = _mm_add_ps(fiy2,ty);
590 fiz2 = _mm_add_ps(fiz2,tz);
591
592 fjx0 = _mm_add_ps(fjx0,tx);
593 fjy0 = _mm_add_ps(fjy0,ty);
594 fjz0 = _mm_add_ps(fjz0,tz);
595
596 }
597
598 /**************************
599 * CALCULATE INTERACTIONS *
600 **************************/
601
602 if (gmx_mm_any_lt(rsq21,rcutoff2))
603 {
604
605 /* REACTION-FIELD ELECTROSTATICS */
606 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
607 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
608
609 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
610
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec = _mm_and_ps(velec,cutoff_mask);
613 velecsum = _mm_add_ps(velecsum,velec);
614
615 fscal = felec;
616
617 fscal = _mm_and_ps(fscal,cutoff_mask);
618
619 /* Calculate temporary vectorial force */
620 tx = _mm_mul_ps(fscal,dx21);
621 ty = _mm_mul_ps(fscal,dy21);
622 tz = _mm_mul_ps(fscal,dz21);
623
624 /* Update vectorial force */
625 fix2 = _mm_add_ps(fix2,tx);
626 fiy2 = _mm_add_ps(fiy2,ty);
627 fiz2 = _mm_add_ps(fiz2,tz);
628
629 fjx1 = _mm_add_ps(fjx1,tx);
630 fjy1 = _mm_add_ps(fjy1,ty);
631 fjz1 = _mm_add_ps(fjz1,tz);
632
633 }
634
635 /**************************
636 * CALCULATE INTERACTIONS *
637 **************************/
638
639 if (gmx_mm_any_lt(rsq22,rcutoff2))
640 {
641
642 /* REACTION-FIELD ELECTROSTATICS */
643 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
644 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
645
646 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
647
648 /* Update potential sum for this i atom from the interaction with this j atom. */
649 velec = _mm_and_ps(velec,cutoff_mask);
650 velecsum = _mm_add_ps(velecsum,velec);
651
652 fscal = felec;
653
654 fscal = _mm_and_ps(fscal,cutoff_mask);
655
656 /* Calculate temporary vectorial force */
657 tx = _mm_mul_ps(fscal,dx22);
658 ty = _mm_mul_ps(fscal,dy22);
659 tz = _mm_mul_ps(fscal,dz22);
660
661 /* Update vectorial force */
662 fix2 = _mm_add_ps(fix2,tx);
663 fiy2 = _mm_add_ps(fiy2,ty);
664 fiz2 = _mm_add_ps(fiz2,tz);
665
666 fjx2 = _mm_add_ps(fjx2,tx);
667 fjy2 = _mm_add_ps(fjy2,ty);
668 fjz2 = _mm_add_ps(fjz2,tz);
669
670 }
671
672 fjptrA = f+j_coord_offsetA;
673 fjptrB = f+j_coord_offsetB;
674 fjptrC = f+j_coord_offsetC;
675 fjptrD = f+j_coord_offsetD;
676
677 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
678 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
679
680 /* Inner loop uses 358 flops */
681 }
682
683 if(jidx<j_index_end)
684 {
685
686 /* Get j neighbor index, and coordinate index */
687 jnrlistA = jjnr[jidx];
688 jnrlistB = jjnr[jidx+1];
689 jnrlistC = jjnr[jidx+2];
690 jnrlistD = jjnr[jidx+3];
691 /* Sign of each element will be negative for non-real atoms.
692 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
693 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
694 */
695 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
696 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
697 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
698 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
699 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
700 j_coord_offsetA = DIM3*jnrA;
701 j_coord_offsetB = DIM3*jnrB;
702 j_coord_offsetC = DIM3*jnrC;
703 j_coord_offsetD = DIM3*jnrD;
704
705 /* load j atom coordinates */
706 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
707 x+j_coord_offsetC,x+j_coord_offsetD,
708 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
709
710 /* Calculate displacement vector */
711 dx00 = _mm_sub_ps(ix0,jx0);
712 dy00 = _mm_sub_ps(iy0,jy0);
713 dz00 = _mm_sub_ps(iz0,jz0);
714 dx01 = _mm_sub_ps(ix0,jx1);
715 dy01 = _mm_sub_ps(iy0,jy1);
716 dz01 = _mm_sub_ps(iz0,jz1);
717 dx02 = _mm_sub_ps(ix0,jx2);
718 dy02 = _mm_sub_ps(iy0,jy2);
719 dz02 = _mm_sub_ps(iz0,jz2);
720 dx10 = _mm_sub_ps(ix1,jx0);
721 dy10 = _mm_sub_ps(iy1,jy0);
722 dz10 = _mm_sub_ps(iz1,jz0);
723 dx11 = _mm_sub_ps(ix1,jx1);
724 dy11 = _mm_sub_ps(iy1,jy1);
725 dz11 = _mm_sub_ps(iz1,jz1);
726 dx12 = _mm_sub_ps(ix1,jx2);
727 dy12 = _mm_sub_ps(iy1,jy2);
728 dz12 = _mm_sub_ps(iz1,jz2);
729 dx20 = _mm_sub_ps(ix2,jx0);
730 dy20 = _mm_sub_ps(iy2,jy0);
731 dz20 = _mm_sub_ps(iz2,jz0);
732 dx21 = _mm_sub_ps(ix2,jx1);
733 dy21 = _mm_sub_ps(iy2,jy1);
734 dz21 = _mm_sub_ps(iz2,jz1);
735 dx22 = _mm_sub_ps(ix2,jx2);
736 dy22 = _mm_sub_ps(iy2,jy2);
737 dz22 = _mm_sub_ps(iz2,jz2);
738
739 /* Calculate squared distance and things based on it */
740 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
741 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
742 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
743 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
744 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
745 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
746 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
747 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
748 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
749
750 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
751 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
752 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
753 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
754 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
755 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
756 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
757 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
758 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
759
760 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
761 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
762 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
763 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
764 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
765 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
766 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
767 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
768 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
769
770 fjx0 = _mm_setzero_ps();
771 fjy0 = _mm_setzero_ps();
772 fjz0 = _mm_setzero_ps();
773 fjx1 = _mm_setzero_ps();
774 fjy1 = _mm_setzero_ps();
775 fjz1 = _mm_setzero_ps();
776 fjx2 = _mm_setzero_ps();
777 fjy2 = _mm_setzero_ps();
778 fjz2 = _mm_setzero_ps();
779
780 /**************************
781 * CALCULATE INTERACTIONS *
782 **************************/
783
784 if (gmx_mm_any_lt(rsq00,rcutoff2))
785 {
786
787 r00 = _mm_mul_ps(rsq00,rinv00);
788 r00 = _mm_andnot_ps(dummy_mask,r00);
789
790 /* REACTION-FIELD ELECTROSTATICS */
791 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
792 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
793
794 /* LENNARD-JONES DISPERSION/REPULSION */
795
796 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
797 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
798 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
799 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
800 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
801
802 d = _mm_sub_ps(r00,rswitch);
803 d = _mm_max_ps(d,_mm_setzero_ps());
804 d2 = _mm_mul_ps(d,d);
805 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
806
807 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
808
809 /* Evaluate switch function */
810 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
811 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
812 vvdw = _mm_mul_ps(vvdw,sw);
813 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
814
815 /* Update potential sum for this i atom from the interaction with this j atom. */
816 velec = _mm_and_ps(velec,cutoff_mask);
817 velec = _mm_andnot_ps(dummy_mask,velec);
818 velecsum = _mm_add_ps(velecsum,velec);
819 vvdw = _mm_and_ps(vvdw,cutoff_mask);
820 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
821 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
822
823 fscal = _mm_add_ps(felec,fvdw);
824
825 fscal = _mm_and_ps(fscal,cutoff_mask);
826
827 fscal = _mm_andnot_ps(dummy_mask,fscal);
828
829 /* Calculate temporary vectorial force */
830 tx = _mm_mul_ps(fscal,dx00);
831 ty = _mm_mul_ps(fscal,dy00);
832 tz = _mm_mul_ps(fscal,dz00);
833
834 /* Update vectorial force */
835 fix0 = _mm_add_ps(fix0,tx);
836 fiy0 = _mm_add_ps(fiy0,ty);
837 fiz0 = _mm_add_ps(fiz0,tz);
838
839 fjx0 = _mm_add_ps(fjx0,tx);
840 fjy0 = _mm_add_ps(fjy0,ty);
841 fjz0 = _mm_add_ps(fjz0,tz);
842
843 }
844
845 /**************************
846 * CALCULATE INTERACTIONS *
847 **************************/
848
849 if (gmx_mm_any_lt(rsq01,rcutoff2))
850 {
851
852 /* REACTION-FIELD ELECTROSTATICS */
853 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
854 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
855
856 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
857
858 /* Update potential sum for this i atom from the interaction with this j atom. */
859 velec = _mm_and_ps(velec,cutoff_mask);
860 velec = _mm_andnot_ps(dummy_mask,velec);
861 velecsum = _mm_add_ps(velecsum,velec);
862
863 fscal = felec;
864
865 fscal = _mm_and_ps(fscal,cutoff_mask);
866
867 fscal = _mm_andnot_ps(dummy_mask,fscal);
868
869 /* Calculate temporary vectorial force */
870 tx = _mm_mul_ps(fscal,dx01);
871 ty = _mm_mul_ps(fscal,dy01);
872 tz = _mm_mul_ps(fscal,dz01);
873
874 /* Update vectorial force */
875 fix0 = _mm_add_ps(fix0,tx);
876 fiy0 = _mm_add_ps(fiy0,ty);
877 fiz0 = _mm_add_ps(fiz0,tz);
878
879 fjx1 = _mm_add_ps(fjx1,tx);
880 fjy1 = _mm_add_ps(fjy1,ty);
881 fjz1 = _mm_add_ps(fjz1,tz);
882
883 }
884
885 /**************************
886 * CALCULATE INTERACTIONS *
887 **************************/
888
889 if (gmx_mm_any_lt(rsq02,rcutoff2))
890 {
891
892 /* REACTION-FIELD ELECTROSTATICS */
893 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
894 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
895
896 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
897
898 /* Update potential sum for this i atom from the interaction with this j atom. */
899 velec = _mm_and_ps(velec,cutoff_mask);
900 velec = _mm_andnot_ps(dummy_mask,velec);
901 velecsum = _mm_add_ps(velecsum,velec);
902
903 fscal = felec;
904
905 fscal = _mm_and_ps(fscal,cutoff_mask);
906
907 fscal = _mm_andnot_ps(dummy_mask,fscal);
908
909 /* Calculate temporary vectorial force */
910 tx = _mm_mul_ps(fscal,dx02);
911 ty = _mm_mul_ps(fscal,dy02);
912 tz = _mm_mul_ps(fscal,dz02);
913
914 /* Update vectorial force */
915 fix0 = _mm_add_ps(fix0,tx);
916 fiy0 = _mm_add_ps(fiy0,ty);
917 fiz0 = _mm_add_ps(fiz0,tz);
918
919 fjx2 = _mm_add_ps(fjx2,tx);
920 fjy2 = _mm_add_ps(fjy2,ty);
921 fjz2 = _mm_add_ps(fjz2,tz);
922
923 }
924
925 /**************************
926 * CALCULATE INTERACTIONS *
927 **************************/
928
929 if (gmx_mm_any_lt(rsq10,rcutoff2))
930 {
931
932 /* REACTION-FIELD ELECTROSTATICS */
933 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
934 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
935
936 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
937
938 /* Update potential sum for this i atom from the interaction with this j atom. */
939 velec = _mm_and_ps(velec,cutoff_mask);
940 velec = _mm_andnot_ps(dummy_mask,velec);
941 velecsum = _mm_add_ps(velecsum,velec);
942
943 fscal = felec;
944
945 fscal = _mm_and_ps(fscal,cutoff_mask);
946
947 fscal = _mm_andnot_ps(dummy_mask,fscal);
948
949 /* Calculate temporary vectorial force */
950 tx = _mm_mul_ps(fscal,dx10);
951 ty = _mm_mul_ps(fscal,dy10);
952 tz = _mm_mul_ps(fscal,dz10);
953
954 /* Update vectorial force */
955 fix1 = _mm_add_ps(fix1,tx);
956 fiy1 = _mm_add_ps(fiy1,ty);
957 fiz1 = _mm_add_ps(fiz1,tz);
958
959 fjx0 = _mm_add_ps(fjx0,tx);
960 fjy0 = _mm_add_ps(fjy0,ty);
961 fjz0 = _mm_add_ps(fjz0,tz);
962
963 }
964
965 /**************************
966 * CALCULATE INTERACTIONS *
967 **************************/
968
969 if (gmx_mm_any_lt(rsq11,rcutoff2))
970 {
971
972 /* REACTION-FIELD ELECTROSTATICS */
973 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
974 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
975
976 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
977
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm_and_ps(velec,cutoff_mask);
980 velec = _mm_andnot_ps(dummy_mask,velec);
981 velecsum = _mm_add_ps(velecsum,velec);
982
983 fscal = felec;
984
985 fscal = _mm_and_ps(fscal,cutoff_mask);
986
987 fscal = _mm_andnot_ps(dummy_mask,fscal);
988
989 /* Calculate temporary vectorial force */
990 tx = _mm_mul_ps(fscal,dx11);
991 ty = _mm_mul_ps(fscal,dy11);
992 tz = _mm_mul_ps(fscal,dz11);
993
994 /* Update vectorial force */
995 fix1 = _mm_add_ps(fix1,tx);
996 fiy1 = _mm_add_ps(fiy1,ty);
997 fiz1 = _mm_add_ps(fiz1,tz);
998
999 fjx1 = _mm_add_ps(fjx1,tx);
1000 fjy1 = _mm_add_ps(fjy1,ty);
1001 fjz1 = _mm_add_ps(fjz1,tz);
1002
1003 }
1004
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1008
1009 if (gmx_mm_any_lt(rsq12,rcutoff2))
1010 {
1011
1012 /* REACTION-FIELD ELECTROSTATICS */
1013 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
1014 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1015
1016 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1017
1018 /* Update potential sum for this i atom from the interaction with this j atom. */
1019 velec = _mm_and_ps(velec,cutoff_mask);
1020 velec = _mm_andnot_ps(dummy_mask,velec);
1021 velecsum = _mm_add_ps(velecsum,velec);
1022
1023 fscal = felec;
1024
1025 fscal = _mm_and_ps(fscal,cutoff_mask);
1026
1027 fscal = _mm_andnot_ps(dummy_mask,fscal);
1028
1029 /* Calculate temporary vectorial force */
1030 tx = _mm_mul_ps(fscal,dx12);
1031 ty = _mm_mul_ps(fscal,dy12);
1032 tz = _mm_mul_ps(fscal,dz12);
1033
1034 /* Update vectorial force */
1035 fix1 = _mm_add_ps(fix1,tx);
1036 fiy1 = _mm_add_ps(fiy1,ty);
1037 fiz1 = _mm_add_ps(fiz1,tz);
1038
1039 fjx2 = _mm_add_ps(fjx2,tx);
1040 fjy2 = _mm_add_ps(fjy2,ty);
1041 fjz2 = _mm_add_ps(fjz2,tz);
1042
1043 }
1044
1045 /**************************
1046 * CALCULATE INTERACTIONS *
1047 **************************/
1048
1049 if (gmx_mm_any_lt(rsq20,rcutoff2))
1050 {
1051
1052 /* REACTION-FIELD ELECTROSTATICS */
1053 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1054 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1055
1056 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1057
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm_and_ps(velec,cutoff_mask);
1060 velec = _mm_andnot_ps(dummy_mask,velec);
1061 velecsum = _mm_add_ps(velecsum,velec);
1062
1063 fscal = felec;
1064
1065 fscal = _mm_and_ps(fscal,cutoff_mask);
1066
1067 fscal = _mm_andnot_ps(dummy_mask,fscal);
1068
1069 /* Calculate temporary vectorial force */
1070 tx = _mm_mul_ps(fscal,dx20);
1071 ty = _mm_mul_ps(fscal,dy20);
1072 tz = _mm_mul_ps(fscal,dz20);
1073
1074 /* Update vectorial force */
1075 fix2 = _mm_add_ps(fix2,tx);
1076 fiy2 = _mm_add_ps(fiy2,ty);
1077 fiz2 = _mm_add_ps(fiz2,tz);
1078
1079 fjx0 = _mm_add_ps(fjx0,tx);
1080 fjy0 = _mm_add_ps(fjy0,ty);
1081 fjz0 = _mm_add_ps(fjz0,tz);
1082
1083 }
1084
1085 /**************************
1086 * CALCULATE INTERACTIONS *
1087 **************************/
1088
1089 if (gmx_mm_any_lt(rsq21,rcutoff2))
1090 {
1091
1092 /* REACTION-FIELD ELECTROSTATICS */
1093 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1094 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1095
1096 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1097
1098 /* Update potential sum for this i atom from the interaction with this j atom. */
1099 velec = _mm_and_ps(velec,cutoff_mask);
1100 velec = _mm_andnot_ps(dummy_mask,velec);
1101 velecsum = _mm_add_ps(velecsum,velec);
1102
1103 fscal = felec;
1104
1105 fscal = _mm_and_ps(fscal,cutoff_mask);
1106
1107 fscal = _mm_andnot_ps(dummy_mask,fscal);
1108
1109 /* Calculate temporary vectorial force */
1110 tx = _mm_mul_ps(fscal,dx21);
1111 ty = _mm_mul_ps(fscal,dy21);
1112 tz = _mm_mul_ps(fscal,dz21);
1113
1114 /* Update vectorial force */
1115 fix2 = _mm_add_ps(fix2,tx);
1116 fiy2 = _mm_add_ps(fiy2,ty);
1117 fiz2 = _mm_add_ps(fiz2,tz);
1118
1119 fjx1 = _mm_add_ps(fjx1,tx);
1120 fjy1 = _mm_add_ps(fjy1,ty);
1121 fjz1 = _mm_add_ps(fjz1,tz);
1122
1123 }
1124
1125 /**************************
1126 * CALCULATE INTERACTIONS *
1127 **************************/
1128
1129 if (gmx_mm_any_lt(rsq22,rcutoff2))
1130 {
1131
1132 /* REACTION-FIELD ELECTROSTATICS */
1133 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1134 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1135
1136 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1137
1138 /* Update potential sum for this i atom from the interaction with this j atom. */
1139 velec = _mm_and_ps(velec,cutoff_mask);
1140 velec = _mm_andnot_ps(dummy_mask,velec);
1141 velecsum = _mm_add_ps(velecsum,velec);
1142
1143 fscal = felec;
1144
1145 fscal = _mm_and_ps(fscal,cutoff_mask);
1146
1147 fscal = _mm_andnot_ps(dummy_mask,fscal);
1148
1149 /* Calculate temporary vectorial force */
1150 tx = _mm_mul_ps(fscal,dx22);
1151 ty = _mm_mul_ps(fscal,dy22);
1152 tz = _mm_mul_ps(fscal,dz22);
1153
1154 /* Update vectorial force */
1155 fix2 = _mm_add_ps(fix2,tx);
1156 fiy2 = _mm_add_ps(fiy2,ty);
1157 fiz2 = _mm_add_ps(fiz2,tz);
1158
1159 fjx2 = _mm_add_ps(fjx2,tx);
1160 fjy2 = _mm_add_ps(fjy2,ty);
1161 fjz2 = _mm_add_ps(fjz2,tz);
1162
1163 }
1164
1165 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1166 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1167 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1168 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1169
1170 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1171 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1172
1173 /* Inner loop uses 359 flops */
1174 }
1175
1176 /* End of innermost loop */
1177
1178 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1179 f+i_coord_offset,fshift+i_shift_offset);
1180
1181 ggid = gid[iidx];
1182 /* Update potential energies */
1183 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1184 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1185
1186 /* Increment number of inner iterations */
1187 inneriter += j_index_end - j_index_start;
1188
1189 /* Outer loop uses 20 flops */
1190 }
1191
1192 /* Increment number of outer iterations */
1193 outeriter += nri;
1194
1195 /* Update outer/inner flops */
1196
1197 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_VF] += outeriter*20 +
inneriter*359
;
1198}
1199/*
1200 * GROMACS nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single
1201 * Electrostatics interaction: ReactionField (with cutoff)
1202 * VdW interaction: LennardJones (with switch function)
1203 * Geometry: Water3-Water3
1204 * Calculate force/pot: Force only
1205 */
1206void
1207nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single
1208 (t_nblist * gmx_restrict nlist,
1209 rvec * gmx_restrict xx,
1210 rvec * gmx_restrict ff,
1211 t_forcerec * gmx_restrict fr,
1212 t_mdatoms * gmx_restrict mdatoms,
1213 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1214 t_nrnb * gmx_restrict nrnb)
1215{
1216 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1217 * just 0 for non-waters.
1218 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1219 * jnr indices corresponding to data put in the four positions in the SIMD register.
1220 */
1221 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1222 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1223 int jnrA,jnrB,jnrC,jnrD;
1224 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1225 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1226 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1227 real rcutoff_scalar;
1228 real *shiftvec,*fshift,*x,*f;
1229 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1230 real scratch[4*DIM3];
1231 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1232 int vdwioffset0;
1233 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1234 int vdwioffset1;
1235 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1236 int vdwioffset2;
1237 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1238 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1239 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1240 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1241 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1242 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1243 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1244 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1245 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1246 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1247 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1248 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1249 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1250 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1251 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1252 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1253 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1254 real *charge;
1255 int nvdwtype;
1256 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1257 int *vdwtype;
1258 real *vdwparam;
1259 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1260 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1261 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1262 real rswitch_scalar,d_scalar;
1263 __m128 dummy_mask,cutoff_mask;
1264 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1265 __m128 one = _mm_set1_ps(1.0);
1266 __m128 two = _mm_set1_ps(2.0);
1267 x = xx[0];
1268 f = ff[0];
1269
1270 nri = nlist->nri;
1271 iinr = nlist->iinr;
1272 jindex = nlist->jindex;
1273 jjnr = nlist->jjnr;
1274 shiftidx = nlist->shift;
1275 gid = nlist->gid;
1276 shiftvec = fr->shift_vec[0];
1277 fshift = fr->fshift[0];
1278 facel = _mm_set1_ps(fr->epsfac);
1279 charge = mdatoms->chargeA;
1280 krf = _mm_set1_ps(fr->ic->k_rf);
1281 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1282 crf = _mm_set1_ps(fr->ic->c_rf);
1283 nvdwtype = fr->ntype;
1284 vdwparam = fr->nbfp;
1285 vdwtype = mdatoms->typeA;
1286
1287 /* Setup water-specific parameters */
1288 inr = nlist->iinr[0];
1289 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1290 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1291 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1292 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1293
1294 jq0 = _mm_set1_ps(charge[inr+0]);
1295 jq1 = _mm_set1_ps(charge[inr+1]);
1296 jq2 = _mm_set1_ps(charge[inr+2]);
1297 vdwjidx0A = 2*vdwtype[inr+0];
1298 qq00 = _mm_mul_ps(iq0,jq0);
1299 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1300 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1301 qq01 = _mm_mul_ps(iq0,jq1);
1302 qq02 = _mm_mul_ps(iq0,jq2);
1303 qq10 = _mm_mul_ps(iq1,jq0);
1304 qq11 = _mm_mul_ps(iq1,jq1);
1305 qq12 = _mm_mul_ps(iq1,jq2);
1306 qq20 = _mm_mul_ps(iq2,jq0);
1307 qq21 = _mm_mul_ps(iq2,jq1);
1308 qq22 = _mm_mul_ps(iq2,jq2);
1309
1310 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1311 rcutoff_scalar = fr->rcoulomb;
1312 rcutoff = _mm_set1_ps(rcutoff_scalar);
1313 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1314
1315 rswitch_scalar = fr->rvdw_switch;
1316 rswitch = _mm_set1_ps(rswitch_scalar);
1317 /* Setup switch parameters */
1318 d_scalar = rcutoff_scalar-rswitch_scalar;
1319 d = _mm_set1_ps(d_scalar);
1320 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1321 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1322 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1323 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1324 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1325 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1326
1327 /* Avoid stupid compiler warnings */
1328 jnrA = jnrB = jnrC = jnrD = 0;
Value stored to 'jnrA' is never read
1329 j_coord_offsetA = 0;
1330 j_coord_offsetB = 0;
1331 j_coord_offsetC = 0;
1332 j_coord_offsetD = 0;
1333
1334 outeriter = 0;
1335 inneriter = 0;
1336
1337 for(iidx=0;iidx<4*DIM3;iidx++)
1338 {
1339 scratch[iidx] = 0.0;
1340 }
1341
1342 /* Start outer loop over neighborlists */
1343 for(iidx=0; iidx<nri; iidx++)
1344 {
1345 /* Load shift vector for this list */
1346 i_shift_offset = DIM3*shiftidx[iidx];
1347
1348 /* Load limits for loop over neighbors */
1349 j_index_start = jindex[iidx];
1350 j_index_end = jindex[iidx+1];
1351
1352 /* Get outer coordinate index */
1353 inr = iinr[iidx];
1354 i_coord_offset = DIM3*inr;
1355
1356 /* Load i particle coords and add shift vector */
1357 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1358 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1359
1360 fix0 = _mm_setzero_ps();
1361 fiy0 = _mm_setzero_ps();
1362 fiz0 = _mm_setzero_ps();
1363 fix1 = _mm_setzero_ps();
1364 fiy1 = _mm_setzero_ps();
1365 fiz1 = _mm_setzero_ps();
1366 fix2 = _mm_setzero_ps();
1367 fiy2 = _mm_setzero_ps();
1368 fiz2 = _mm_setzero_ps();
1369
1370 /* Start inner kernel loop */
1371 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1372 {
1373
1374 /* Get j neighbor index, and coordinate index */
1375 jnrA = jjnr[jidx];
1376 jnrB = jjnr[jidx+1];
1377 jnrC = jjnr[jidx+2];
1378 jnrD = jjnr[jidx+3];
1379 j_coord_offsetA = DIM3*jnrA;
1380 j_coord_offsetB = DIM3*jnrB;
1381 j_coord_offsetC = DIM3*jnrC;
1382 j_coord_offsetD = DIM3*jnrD;
1383
1384 /* load j atom coordinates */
1385 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1386 x+j_coord_offsetC,x+j_coord_offsetD,
1387 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1388
1389 /* Calculate displacement vector */
1390 dx00 = _mm_sub_ps(ix0,jx0);
1391 dy00 = _mm_sub_ps(iy0,jy0);
1392 dz00 = _mm_sub_ps(iz0,jz0);
1393 dx01 = _mm_sub_ps(ix0,jx1);
1394 dy01 = _mm_sub_ps(iy0,jy1);
1395 dz01 = _mm_sub_ps(iz0,jz1);
1396 dx02 = _mm_sub_ps(ix0,jx2);
1397 dy02 = _mm_sub_ps(iy0,jy2);
1398 dz02 = _mm_sub_ps(iz0,jz2);
1399 dx10 = _mm_sub_ps(ix1,jx0);
1400 dy10 = _mm_sub_ps(iy1,jy0);
1401 dz10 = _mm_sub_ps(iz1,jz0);
1402 dx11 = _mm_sub_ps(ix1,jx1);
1403 dy11 = _mm_sub_ps(iy1,jy1);
1404 dz11 = _mm_sub_ps(iz1,jz1);
1405 dx12 = _mm_sub_ps(ix1,jx2);
1406 dy12 = _mm_sub_ps(iy1,jy2);
1407 dz12 = _mm_sub_ps(iz1,jz2);
1408 dx20 = _mm_sub_ps(ix2,jx0);
1409 dy20 = _mm_sub_ps(iy2,jy0);
1410 dz20 = _mm_sub_ps(iz2,jz0);
1411 dx21 = _mm_sub_ps(ix2,jx1);
1412 dy21 = _mm_sub_ps(iy2,jy1);
1413 dz21 = _mm_sub_ps(iz2,jz1);
1414 dx22 = _mm_sub_ps(ix2,jx2);
1415 dy22 = _mm_sub_ps(iy2,jy2);
1416 dz22 = _mm_sub_ps(iz2,jz2);
1417
1418 /* Calculate squared distance and things based on it */
1419 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1420 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1421 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1422 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1423 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1424 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1425 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1426 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1427 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1428
1429 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1430 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1431 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1432 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1433 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1434 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1435 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1436 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1437 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1438
1439 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1440 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1441 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1442 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1443 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1444 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1445 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1446 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1447 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1448
1449 fjx0 = _mm_setzero_ps();
1450 fjy0 = _mm_setzero_ps();
1451 fjz0 = _mm_setzero_ps();
1452 fjx1 = _mm_setzero_ps();
1453 fjy1 = _mm_setzero_ps();
1454 fjz1 = _mm_setzero_ps();
1455 fjx2 = _mm_setzero_ps();
1456 fjy2 = _mm_setzero_ps();
1457 fjz2 = _mm_setzero_ps();
1458
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1462
1463 if (gmx_mm_any_lt(rsq00,rcutoff2))
1464 {
1465
1466 r00 = _mm_mul_ps(rsq00,rinv00);
1467
1468 /* REACTION-FIELD ELECTROSTATICS */
1469 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1470
1471 /* LENNARD-JONES DISPERSION/REPULSION */
1472
1473 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1474 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1475 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1476 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1477 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1478
1479 d = _mm_sub_ps(r00,rswitch);
1480 d = _mm_max_ps(d,_mm_setzero_ps());
1481 d2 = _mm_mul_ps(d,d);
1482 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1483
1484 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1485
1486 /* Evaluate switch function */
1487 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1488 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1489 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1490
1491 fscal = _mm_add_ps(felec,fvdw);
1492
1493 fscal = _mm_and_ps(fscal,cutoff_mask);
1494
1495 /* Calculate temporary vectorial force */
1496 tx = _mm_mul_ps(fscal,dx00);
1497 ty = _mm_mul_ps(fscal,dy00);
1498 tz = _mm_mul_ps(fscal,dz00);
1499
1500 /* Update vectorial force */
1501 fix0 = _mm_add_ps(fix0,tx);
1502 fiy0 = _mm_add_ps(fiy0,ty);
1503 fiz0 = _mm_add_ps(fiz0,tz);
1504
1505 fjx0 = _mm_add_ps(fjx0,tx);
1506 fjy0 = _mm_add_ps(fjy0,ty);
1507 fjz0 = _mm_add_ps(fjz0,tz);
1508
1509 }
1510
1511 /**************************
1512 * CALCULATE INTERACTIONS *
1513 **************************/
1514
1515 if (gmx_mm_any_lt(rsq01,rcutoff2))
1516 {
1517
1518 /* REACTION-FIELD ELECTROSTATICS */
1519 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1520
1521 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1522
1523 fscal = felec;
1524
1525 fscal = _mm_and_ps(fscal,cutoff_mask);
1526
1527 /* Calculate temporary vectorial force */
1528 tx = _mm_mul_ps(fscal,dx01);
1529 ty = _mm_mul_ps(fscal,dy01);
1530 tz = _mm_mul_ps(fscal,dz01);
1531
1532 /* Update vectorial force */
1533 fix0 = _mm_add_ps(fix0,tx);
1534 fiy0 = _mm_add_ps(fiy0,ty);
1535 fiz0 = _mm_add_ps(fiz0,tz);
1536
1537 fjx1 = _mm_add_ps(fjx1,tx);
1538 fjy1 = _mm_add_ps(fjy1,ty);
1539 fjz1 = _mm_add_ps(fjz1,tz);
1540
1541 }
1542
1543 /**************************
1544 * CALCULATE INTERACTIONS *
1545 **************************/
1546
1547 if (gmx_mm_any_lt(rsq02,rcutoff2))
1548 {
1549
1550 /* REACTION-FIELD ELECTROSTATICS */
1551 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1552
1553 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1554
1555 fscal = felec;
1556
1557 fscal = _mm_and_ps(fscal,cutoff_mask);
1558
1559 /* Calculate temporary vectorial force */
1560 tx = _mm_mul_ps(fscal,dx02);
1561 ty = _mm_mul_ps(fscal,dy02);
1562 tz = _mm_mul_ps(fscal,dz02);
1563
1564 /* Update vectorial force */
1565 fix0 = _mm_add_ps(fix0,tx);
1566 fiy0 = _mm_add_ps(fiy0,ty);
1567 fiz0 = _mm_add_ps(fiz0,tz);
1568
1569 fjx2 = _mm_add_ps(fjx2,tx);
1570 fjy2 = _mm_add_ps(fjy2,ty);
1571 fjz2 = _mm_add_ps(fjz2,tz);
1572
1573 }
1574
1575 /**************************
1576 * CALCULATE INTERACTIONS *
1577 **************************/
1578
1579 if (gmx_mm_any_lt(rsq10,rcutoff2))
1580 {
1581
1582 /* REACTION-FIELD ELECTROSTATICS */
1583 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1584
1585 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1586
1587 fscal = felec;
1588
1589 fscal = _mm_and_ps(fscal,cutoff_mask);
1590
1591 /* Calculate temporary vectorial force */
1592 tx = _mm_mul_ps(fscal,dx10);
1593 ty = _mm_mul_ps(fscal,dy10);
1594 tz = _mm_mul_ps(fscal,dz10);
1595
1596 /* Update vectorial force */
1597 fix1 = _mm_add_ps(fix1,tx);
1598 fiy1 = _mm_add_ps(fiy1,ty);
1599 fiz1 = _mm_add_ps(fiz1,tz);
1600
1601 fjx0 = _mm_add_ps(fjx0,tx);
1602 fjy0 = _mm_add_ps(fjy0,ty);
1603 fjz0 = _mm_add_ps(fjz0,tz);
1604
1605 }
1606
1607 /**************************
1608 * CALCULATE INTERACTIONS *
1609 **************************/
1610
1611 if (gmx_mm_any_lt(rsq11,rcutoff2))
1612 {
1613
1614 /* REACTION-FIELD ELECTROSTATICS */
1615 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1616
1617 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1618
1619 fscal = felec;
1620
1621 fscal = _mm_and_ps(fscal,cutoff_mask);
1622
1623 /* Calculate temporary vectorial force */
1624 tx = _mm_mul_ps(fscal,dx11);
1625 ty = _mm_mul_ps(fscal,dy11);
1626 tz = _mm_mul_ps(fscal,dz11);
1627
1628 /* Update vectorial force */
1629 fix1 = _mm_add_ps(fix1,tx);
1630 fiy1 = _mm_add_ps(fiy1,ty);
1631 fiz1 = _mm_add_ps(fiz1,tz);
1632
1633 fjx1 = _mm_add_ps(fjx1,tx);
1634 fjy1 = _mm_add_ps(fjy1,ty);
1635 fjz1 = _mm_add_ps(fjz1,tz);
1636
1637 }
1638
1639 /**************************
1640 * CALCULATE INTERACTIONS *
1641 **************************/
1642
1643 if (gmx_mm_any_lt(rsq12,rcutoff2))
1644 {
1645
1646 /* REACTION-FIELD ELECTROSTATICS */
1647 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1648
1649 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1650
1651 fscal = felec;
1652
1653 fscal = _mm_and_ps(fscal,cutoff_mask);
1654
1655 /* Calculate temporary vectorial force */
1656 tx = _mm_mul_ps(fscal,dx12);
1657 ty = _mm_mul_ps(fscal,dy12);
1658 tz = _mm_mul_ps(fscal,dz12);
1659
1660 /* Update vectorial force */
1661 fix1 = _mm_add_ps(fix1,tx);
1662 fiy1 = _mm_add_ps(fiy1,ty);
1663 fiz1 = _mm_add_ps(fiz1,tz);
1664
1665 fjx2 = _mm_add_ps(fjx2,tx);
1666 fjy2 = _mm_add_ps(fjy2,ty);
1667 fjz2 = _mm_add_ps(fjz2,tz);
1668
1669 }
1670
1671 /**************************
1672 * CALCULATE INTERACTIONS *
1673 **************************/
1674
1675 if (gmx_mm_any_lt(rsq20,rcutoff2))
1676 {
1677
1678 /* REACTION-FIELD ELECTROSTATICS */
1679 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1680
1681 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1682
1683 fscal = felec;
1684
1685 fscal = _mm_and_ps(fscal,cutoff_mask);
1686
1687 /* Calculate temporary vectorial force */
1688 tx = _mm_mul_ps(fscal,dx20);
1689 ty = _mm_mul_ps(fscal,dy20);
1690 tz = _mm_mul_ps(fscal,dz20);
1691
1692 /* Update vectorial force */
1693 fix2 = _mm_add_ps(fix2,tx);
1694 fiy2 = _mm_add_ps(fiy2,ty);
1695 fiz2 = _mm_add_ps(fiz2,tz);
1696
1697 fjx0 = _mm_add_ps(fjx0,tx);
1698 fjy0 = _mm_add_ps(fjy0,ty);
1699 fjz0 = _mm_add_ps(fjz0,tz);
1700
1701 }
1702
1703 /**************************
1704 * CALCULATE INTERACTIONS *
1705 **************************/
1706
1707 if (gmx_mm_any_lt(rsq21,rcutoff2))
1708 {
1709
1710 /* REACTION-FIELD ELECTROSTATICS */
1711 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1712
1713 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1714
1715 fscal = felec;
1716
1717 fscal = _mm_and_ps(fscal,cutoff_mask);
1718
1719 /* Calculate temporary vectorial force */
1720 tx = _mm_mul_ps(fscal,dx21);
1721 ty = _mm_mul_ps(fscal,dy21);
1722 tz = _mm_mul_ps(fscal,dz21);
1723
1724 /* Update vectorial force */
1725 fix2 = _mm_add_ps(fix2,tx);
1726 fiy2 = _mm_add_ps(fiy2,ty);
1727 fiz2 = _mm_add_ps(fiz2,tz);
1728
1729 fjx1 = _mm_add_ps(fjx1,tx);
1730 fjy1 = _mm_add_ps(fjy1,ty);
1731 fjz1 = _mm_add_ps(fjz1,tz);
1732
1733 }
1734
1735 /**************************
1736 * CALCULATE INTERACTIONS *
1737 **************************/
1738
1739 if (gmx_mm_any_lt(rsq22,rcutoff2))
1740 {
1741
1742 /* REACTION-FIELD ELECTROSTATICS */
1743 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1744
1745 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1746
1747 fscal = felec;
1748
1749 fscal = _mm_and_ps(fscal,cutoff_mask);
1750
1751 /* Calculate temporary vectorial force */
1752 tx = _mm_mul_ps(fscal,dx22);
1753 ty = _mm_mul_ps(fscal,dy22);
1754 tz = _mm_mul_ps(fscal,dz22);
1755
1756 /* Update vectorial force */
1757 fix2 = _mm_add_ps(fix2,tx);
1758 fiy2 = _mm_add_ps(fiy2,ty);
1759 fiz2 = _mm_add_ps(fiz2,tz);
1760
1761 fjx2 = _mm_add_ps(fjx2,tx);
1762 fjy2 = _mm_add_ps(fjy2,ty);
1763 fjz2 = _mm_add_ps(fjz2,tz);
1764
1765 }
1766
1767 fjptrA = f+j_coord_offsetA;
1768 fjptrB = f+j_coord_offsetB;
1769 fjptrC = f+j_coord_offsetC;
1770 fjptrD = f+j_coord_offsetD;
1771
1772 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1773 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1774
1775 /* Inner loop uses 301 flops */
1776 }
1777
1778 if(jidx<j_index_end)
1779 {
1780
1781 /* Get j neighbor index, and coordinate index */
1782 jnrlistA = jjnr[jidx];
1783 jnrlistB = jjnr[jidx+1];
1784 jnrlistC = jjnr[jidx+2];
1785 jnrlistD = jjnr[jidx+3];
1786 /* Sign of each element will be negative for non-real atoms.
1787 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1788 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1789 */
1790 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1791 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1792 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1793 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1794 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1795 j_coord_offsetA = DIM3*jnrA;
1796 j_coord_offsetB = DIM3*jnrB;
1797 j_coord_offsetC = DIM3*jnrC;
1798 j_coord_offsetD = DIM3*jnrD;
1799
1800 /* load j atom coordinates */
1801 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1802 x+j_coord_offsetC,x+j_coord_offsetD,
1803 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1804
1805 /* Calculate displacement vector */
1806 dx00 = _mm_sub_ps(ix0,jx0);
1807 dy00 = _mm_sub_ps(iy0,jy0);
1808 dz00 = _mm_sub_ps(iz0,jz0);
1809 dx01 = _mm_sub_ps(ix0,jx1);
1810 dy01 = _mm_sub_ps(iy0,jy1);
1811 dz01 = _mm_sub_ps(iz0,jz1);
1812 dx02 = _mm_sub_ps(ix0,jx2);
1813 dy02 = _mm_sub_ps(iy0,jy2);
1814 dz02 = _mm_sub_ps(iz0,jz2);
1815 dx10 = _mm_sub_ps(ix1,jx0);
1816 dy10 = _mm_sub_ps(iy1,jy0);
1817 dz10 = _mm_sub_ps(iz1,jz0);
1818 dx11 = _mm_sub_ps(ix1,jx1);
1819 dy11 = _mm_sub_ps(iy1,jy1);
1820 dz11 = _mm_sub_ps(iz1,jz1);
1821 dx12 = _mm_sub_ps(ix1,jx2);
1822 dy12 = _mm_sub_ps(iy1,jy2);
1823 dz12 = _mm_sub_ps(iz1,jz2);
1824 dx20 = _mm_sub_ps(ix2,jx0);
1825 dy20 = _mm_sub_ps(iy2,jy0);
1826 dz20 = _mm_sub_ps(iz2,jz0);
1827 dx21 = _mm_sub_ps(ix2,jx1);
1828 dy21 = _mm_sub_ps(iy2,jy1);
1829 dz21 = _mm_sub_ps(iz2,jz1);
1830 dx22 = _mm_sub_ps(ix2,jx2);
1831 dy22 = _mm_sub_ps(iy2,jy2);
1832 dz22 = _mm_sub_ps(iz2,jz2);
1833
1834 /* Calculate squared distance and things based on it */
1835 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1836 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1837 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1838 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1839 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1840 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1841 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1842 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1843 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1844
1845 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1846 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1847 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1848 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1849 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1850 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1851 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1852 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1853 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1854
1855 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1856 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1857 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1858 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1859 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1860 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1861 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1862 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1863 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1864
1865 fjx0 = _mm_setzero_ps();
1866 fjy0 = _mm_setzero_ps();
1867 fjz0 = _mm_setzero_ps();
1868 fjx1 = _mm_setzero_ps();
1869 fjy1 = _mm_setzero_ps();
1870 fjz1 = _mm_setzero_ps();
1871 fjx2 = _mm_setzero_ps();
1872 fjy2 = _mm_setzero_ps();
1873 fjz2 = _mm_setzero_ps();
1874
1875 /**************************
1876 * CALCULATE INTERACTIONS *
1877 **************************/
1878
1879 if (gmx_mm_any_lt(rsq00,rcutoff2))
1880 {
1881
1882 r00 = _mm_mul_ps(rsq00,rinv00);
1883 r00 = _mm_andnot_ps(dummy_mask,r00);
1884
1885 /* REACTION-FIELD ELECTROSTATICS */
1886 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1887
1888 /* LENNARD-JONES DISPERSION/REPULSION */
1889
1890 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1891 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1892 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1893 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1894 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1895
1896 d = _mm_sub_ps(r00,rswitch);
1897 d = _mm_max_ps(d,_mm_setzero_ps());
1898 d2 = _mm_mul_ps(d,d);
1899 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1900
1901 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1902
1903 /* Evaluate switch function */
1904 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1905 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1906 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1907
1908 fscal = _mm_add_ps(felec,fvdw);
1909
1910 fscal = _mm_and_ps(fscal,cutoff_mask);
1911
1912 fscal = _mm_andnot_ps(dummy_mask,fscal);
1913
1914 /* Calculate temporary vectorial force */
1915 tx = _mm_mul_ps(fscal,dx00);
1916 ty = _mm_mul_ps(fscal,dy00);
1917 tz = _mm_mul_ps(fscal,dz00);
1918
1919 /* Update vectorial force */
1920 fix0 = _mm_add_ps(fix0,tx);
1921 fiy0 = _mm_add_ps(fiy0,ty);
1922 fiz0 = _mm_add_ps(fiz0,tz);
1923
1924 fjx0 = _mm_add_ps(fjx0,tx);
1925 fjy0 = _mm_add_ps(fjy0,ty);
1926 fjz0 = _mm_add_ps(fjz0,tz);
1927
1928 }
1929
1930 /**************************
1931 * CALCULATE INTERACTIONS *
1932 **************************/
1933
1934 if (gmx_mm_any_lt(rsq01,rcutoff2))
1935 {
1936
1937 /* REACTION-FIELD ELECTROSTATICS */
1938 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1939
1940 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1941
1942 fscal = felec;
1943
1944 fscal = _mm_and_ps(fscal,cutoff_mask);
1945
1946 fscal = _mm_andnot_ps(dummy_mask,fscal);
1947
1948 /* Calculate temporary vectorial force */
1949 tx = _mm_mul_ps(fscal,dx01);
1950 ty = _mm_mul_ps(fscal,dy01);
1951 tz = _mm_mul_ps(fscal,dz01);
1952
1953 /* Update vectorial force */
1954 fix0 = _mm_add_ps(fix0,tx);
1955 fiy0 = _mm_add_ps(fiy0,ty);
1956 fiz0 = _mm_add_ps(fiz0,tz);
1957
1958 fjx1 = _mm_add_ps(fjx1,tx);
1959 fjy1 = _mm_add_ps(fjy1,ty);
1960 fjz1 = _mm_add_ps(fjz1,tz);
1961
1962 }
1963
1964 /**************************
1965 * CALCULATE INTERACTIONS *
1966 **************************/
1967
1968 if (gmx_mm_any_lt(rsq02,rcutoff2))
1969 {
1970
1971 /* REACTION-FIELD ELECTROSTATICS */
1972 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1973
1974 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1975
1976 fscal = felec;
1977
1978 fscal = _mm_and_ps(fscal,cutoff_mask);
1979
1980 fscal = _mm_andnot_ps(dummy_mask,fscal);
1981
1982 /* Calculate temporary vectorial force */
1983 tx = _mm_mul_ps(fscal,dx02);
1984 ty = _mm_mul_ps(fscal,dy02);
1985 tz = _mm_mul_ps(fscal,dz02);
1986
1987 /* Update vectorial force */
1988 fix0 = _mm_add_ps(fix0,tx);
1989 fiy0 = _mm_add_ps(fiy0,ty);
1990 fiz0 = _mm_add_ps(fiz0,tz);
1991
1992 fjx2 = _mm_add_ps(fjx2,tx);
1993 fjy2 = _mm_add_ps(fjy2,ty);
1994 fjz2 = _mm_add_ps(fjz2,tz);
1995
1996 }
1997
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2001
2002 if (gmx_mm_any_lt(rsq10,rcutoff2))
2003 {
2004
2005 /* REACTION-FIELD ELECTROSTATICS */
2006 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
2007
2008 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2009
2010 fscal = felec;
2011
2012 fscal = _mm_and_ps(fscal,cutoff_mask);
2013
2014 fscal = _mm_andnot_ps(dummy_mask,fscal);
2015
2016 /* Calculate temporary vectorial force */
2017 tx = _mm_mul_ps(fscal,dx10);
2018 ty = _mm_mul_ps(fscal,dy10);
2019 tz = _mm_mul_ps(fscal,dz10);
2020
2021 /* Update vectorial force */
2022 fix1 = _mm_add_ps(fix1,tx);
2023 fiy1 = _mm_add_ps(fiy1,ty);
2024 fiz1 = _mm_add_ps(fiz1,tz);
2025
2026 fjx0 = _mm_add_ps(fjx0,tx);
2027 fjy0 = _mm_add_ps(fjy0,ty);
2028 fjz0 = _mm_add_ps(fjz0,tz);
2029
2030 }
2031
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2035
2036 if (gmx_mm_any_lt(rsq11,rcutoff2))
2037 {
2038
2039 /* REACTION-FIELD ELECTROSTATICS */
2040 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2041
2042 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2043
2044 fscal = felec;
2045
2046 fscal = _mm_and_ps(fscal,cutoff_mask);
2047
2048 fscal = _mm_andnot_ps(dummy_mask,fscal);
2049
2050 /* Calculate temporary vectorial force */
2051 tx = _mm_mul_ps(fscal,dx11);
2052 ty = _mm_mul_ps(fscal,dy11);
2053 tz = _mm_mul_ps(fscal,dz11);
2054
2055 /* Update vectorial force */
2056 fix1 = _mm_add_ps(fix1,tx);
2057 fiy1 = _mm_add_ps(fiy1,ty);
2058 fiz1 = _mm_add_ps(fiz1,tz);
2059
2060 fjx1 = _mm_add_ps(fjx1,tx);
2061 fjy1 = _mm_add_ps(fjy1,ty);
2062 fjz1 = _mm_add_ps(fjz1,tz);
2063
2064 }
2065
2066 /**************************
2067 * CALCULATE INTERACTIONS *
2068 **************************/
2069
2070 if (gmx_mm_any_lt(rsq12,rcutoff2))
2071 {
2072
2073 /* REACTION-FIELD ELECTROSTATICS */
2074 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2075
2076 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2077
2078 fscal = felec;
2079
2080 fscal = _mm_and_ps(fscal,cutoff_mask);
2081
2082 fscal = _mm_andnot_ps(dummy_mask,fscal);
2083
2084 /* Calculate temporary vectorial force */
2085 tx = _mm_mul_ps(fscal,dx12);
2086 ty = _mm_mul_ps(fscal,dy12);
2087 tz = _mm_mul_ps(fscal,dz12);
2088
2089 /* Update vectorial force */
2090 fix1 = _mm_add_ps(fix1,tx);
2091 fiy1 = _mm_add_ps(fiy1,ty);
2092 fiz1 = _mm_add_ps(fiz1,tz);
2093
2094 fjx2 = _mm_add_ps(fjx2,tx);
2095 fjy2 = _mm_add_ps(fjy2,ty);
2096 fjz2 = _mm_add_ps(fjz2,tz);
2097
2098 }
2099
2100 /**************************
2101 * CALCULATE INTERACTIONS *
2102 **************************/
2103
2104 if (gmx_mm_any_lt(rsq20,rcutoff2))
2105 {
2106
2107 /* REACTION-FIELD ELECTROSTATICS */
2108 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2109
2110 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2111
2112 fscal = felec;
2113
2114 fscal = _mm_and_ps(fscal,cutoff_mask);
2115
2116 fscal = _mm_andnot_ps(dummy_mask,fscal);
2117
2118 /* Calculate temporary vectorial force */
2119 tx = _mm_mul_ps(fscal,dx20);
2120 ty = _mm_mul_ps(fscal,dy20);
2121 tz = _mm_mul_ps(fscal,dz20);
2122
2123 /* Update vectorial force */
2124 fix2 = _mm_add_ps(fix2,tx);
2125 fiy2 = _mm_add_ps(fiy2,ty);
2126 fiz2 = _mm_add_ps(fiz2,tz);
2127
2128 fjx0 = _mm_add_ps(fjx0,tx);
2129 fjy0 = _mm_add_ps(fjy0,ty);
2130 fjz0 = _mm_add_ps(fjz0,tz);
2131
2132 }
2133
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2137
2138 if (gmx_mm_any_lt(rsq21,rcutoff2))
2139 {
2140
2141 /* REACTION-FIELD ELECTROSTATICS */
2142 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2143
2144 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2145
2146 fscal = felec;
2147
2148 fscal = _mm_and_ps(fscal,cutoff_mask);
2149
2150 fscal = _mm_andnot_ps(dummy_mask,fscal);
2151
2152 /* Calculate temporary vectorial force */
2153 tx = _mm_mul_ps(fscal,dx21);
2154 ty = _mm_mul_ps(fscal,dy21);
2155 tz = _mm_mul_ps(fscal,dz21);
2156
2157 /* Update vectorial force */
2158 fix2 = _mm_add_ps(fix2,tx);
2159 fiy2 = _mm_add_ps(fiy2,ty);
2160 fiz2 = _mm_add_ps(fiz2,tz);
2161
2162 fjx1 = _mm_add_ps(fjx1,tx);
2163 fjy1 = _mm_add_ps(fjy1,ty);
2164 fjz1 = _mm_add_ps(fjz1,tz);
2165
2166 }
2167
2168 /**************************
2169 * CALCULATE INTERACTIONS *
2170 **************************/
2171
2172 if (gmx_mm_any_lt(rsq22,rcutoff2))
2173 {
2174
2175 /* REACTION-FIELD ELECTROSTATICS */
2176 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2177
2178 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2179
2180 fscal = felec;
2181
2182 fscal = _mm_and_ps(fscal,cutoff_mask);
2183
2184 fscal = _mm_andnot_ps(dummy_mask,fscal);
2185
2186 /* Calculate temporary vectorial force */
2187 tx = _mm_mul_ps(fscal,dx22);
2188 ty = _mm_mul_ps(fscal,dy22);
2189 tz = _mm_mul_ps(fscal,dz22);
2190
2191 /* Update vectorial force */
2192 fix2 = _mm_add_ps(fix2,tx);
2193 fiy2 = _mm_add_ps(fiy2,ty);
2194 fiz2 = _mm_add_ps(fiz2,tz);
2195
2196 fjx2 = _mm_add_ps(fjx2,tx);
2197 fjy2 = _mm_add_ps(fjy2,ty);
2198 fjz2 = _mm_add_ps(fjz2,tz);
2199
2200 }
2201
2202 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2203 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2204 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2205 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2206
2207 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2208 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2209
2210 /* Inner loop uses 302 flops */
2211 }
2212
2213 /* End of innermost loop */
2214
2215 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2216 f+i_coord_offset,fshift+i_shift_offset);
2217
2218 /* Increment number of inner iterations */
2219 inneriter += j_index_end - j_index_start;
2220
2221 /* Outer loop uses 18 flops */
2222 }
2223
2224 /* Increment number of outer iterations */
2225 outeriter += nri;
2226
2227 /* Update outer/inner flops */
2228
2229 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302); /* macro expands to: (nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_F] += outeriter*18 + inneriter*302 */
2230}