2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "kernelutil_sparc64_hpc_ace_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85 int vdwjidx0A,vdwjidx0B;
86 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87 int vdwjidx1A,vdwjidx1B;
88 _fjsp_v2r8 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89 int vdwjidx2A,vdwjidx2B;
90 _fjsp_v2r8 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92 _fjsp_v2r8 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93 _fjsp_v2r8 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94 _fjsp_v2r8 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95 _fjsp_v2r8 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96 _fjsp_v2r8 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97 _fjsp_v2r8 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 _fjsp_v2r8 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99 _fjsp_v2r8 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
103 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
106 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
107 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
108 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
110 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
111 real rswitch_scalar,d_scalar;
113 _fjsp_v2r8 dummy_mask,cutoff_mask;
114 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
115 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
116 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
130 charge = mdatoms->chargeA;
131 nvdwtype = fr->ntype;
133 vdwtype = mdatoms->typeA;
135 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
136 ewtab = fr->ic->tabq_coul_FDV0;
137 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
138 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
143 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
144 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = gmx_fjsp_set1_v2r8(charge[inr+0]);
148 jq1 = gmx_fjsp_set1_v2r8(charge[inr+1]);
149 jq2 = gmx_fjsp_set1_v2r8(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _fjsp_mul_v2r8(iq0,jq0);
152 c6_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _fjsp_mul_v2r8(iq0,jq1);
155 qq02 = _fjsp_mul_v2r8(iq0,jq2);
156 qq10 = _fjsp_mul_v2r8(iq1,jq0);
157 qq11 = _fjsp_mul_v2r8(iq1,jq1);
158 qq12 = _fjsp_mul_v2r8(iq1,jq2);
159 qq20 = _fjsp_mul_v2r8(iq2,jq0);
160 qq21 = _fjsp_mul_v2r8(iq2,jq1);
161 qq22 = _fjsp_mul_v2r8(iq2,jq2);
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar = fr->rcoulomb;
165 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
166 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
168 rswitch_scalar = fr->rcoulomb_switch;
169 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
170 /* Setup switch parameters */
171 d_scalar = rcutoff_scalar-rswitch_scalar;
172 d = gmx_fjsp_set1_v2r8(d_scalar);
173 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
174 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
177 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
180 /* Avoid stupid compiler warnings */
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206 fix0 = _fjsp_setzero_v2r8();
207 fiy0 = _fjsp_setzero_v2r8();
208 fiz0 = _fjsp_setzero_v2r8();
209 fix1 = _fjsp_setzero_v2r8();
210 fiy1 = _fjsp_setzero_v2r8();
211 fiz1 = _fjsp_setzero_v2r8();
212 fix2 = _fjsp_setzero_v2r8();
213 fiy2 = _fjsp_setzero_v2r8();
214 fiz2 = _fjsp_setzero_v2r8();
216 /* Reset potential sums */
217 velecsum = _fjsp_setzero_v2r8();
218 vvdwsum = _fjsp_setzero_v2r8();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
224 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
230 /* load j atom coordinates */
231 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
232 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
234 /* Calculate displacement vector */
235 dx00 = _fjsp_sub_v2r8(ix0,jx0);
236 dy00 = _fjsp_sub_v2r8(iy0,jy0);
237 dz00 = _fjsp_sub_v2r8(iz0,jz0);
238 dx01 = _fjsp_sub_v2r8(ix0,jx1);
239 dy01 = _fjsp_sub_v2r8(iy0,jy1);
240 dz01 = _fjsp_sub_v2r8(iz0,jz1);
241 dx02 = _fjsp_sub_v2r8(ix0,jx2);
242 dy02 = _fjsp_sub_v2r8(iy0,jy2);
243 dz02 = _fjsp_sub_v2r8(iz0,jz2);
244 dx10 = _fjsp_sub_v2r8(ix1,jx0);
245 dy10 = _fjsp_sub_v2r8(iy1,jy0);
246 dz10 = _fjsp_sub_v2r8(iz1,jz0);
247 dx11 = _fjsp_sub_v2r8(ix1,jx1);
248 dy11 = _fjsp_sub_v2r8(iy1,jy1);
249 dz11 = _fjsp_sub_v2r8(iz1,jz1);
250 dx12 = _fjsp_sub_v2r8(ix1,jx2);
251 dy12 = _fjsp_sub_v2r8(iy1,jy2);
252 dz12 = _fjsp_sub_v2r8(iz1,jz2);
253 dx20 = _fjsp_sub_v2r8(ix2,jx0);
254 dy20 = _fjsp_sub_v2r8(iy2,jy0);
255 dz20 = _fjsp_sub_v2r8(iz2,jz0);
256 dx21 = _fjsp_sub_v2r8(ix2,jx1);
257 dy21 = _fjsp_sub_v2r8(iy2,jy1);
258 dz21 = _fjsp_sub_v2r8(iz2,jz1);
259 dx22 = _fjsp_sub_v2r8(ix2,jx2);
260 dy22 = _fjsp_sub_v2r8(iy2,jy2);
261 dz22 = _fjsp_sub_v2r8(iz2,jz2);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
265 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
266 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
267 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
268 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
269 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
270 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
271 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
272 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
274 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
275 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
276 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
277 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
278 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
279 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
280 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
281 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
282 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
284 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
285 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
286 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
287 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
288 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
289 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
290 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
291 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
292 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
294 fjx0 = _fjsp_setzero_v2r8();
295 fjy0 = _fjsp_setzero_v2r8();
296 fjz0 = _fjsp_setzero_v2r8();
297 fjx1 = _fjsp_setzero_v2r8();
298 fjy1 = _fjsp_setzero_v2r8();
299 fjz1 = _fjsp_setzero_v2r8();
300 fjx2 = _fjsp_setzero_v2r8();
301 fjy2 = _fjsp_setzero_v2r8();
302 fjz2 = _fjsp_setzero_v2r8();
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
311 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
313 /* EWALD ELECTROSTATICS */
315 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
316 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
317 itab_tmp = _fjsp_dtox_v2r8(ewrt);
318 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
319 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
321 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
322 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
323 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
324 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
325 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
326 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
327 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
328 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
329 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
330 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
332 /* LENNARD-JONES DISPERSION/REPULSION */
334 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
335 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
336 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
337 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
338 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
340 d = _fjsp_sub_v2r8(r00,rswitch);
341 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
342 d2 = _fjsp_mul_v2r8(d,d);
343 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
345 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
347 /* Evaluate switch function */
348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
349 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
350 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
351 velec = _fjsp_mul_v2r8(velec,sw);
352 vvdw = _fjsp_mul_v2r8(vvdw,sw);
353 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 velec = _fjsp_and_v2r8(velec,cutoff_mask);
357 velecsum = _fjsp_add_v2r8(velecsum,velec);
358 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
359 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
361 fscal = _fjsp_add_v2r8(felec,fvdw);
363 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
365 /* Update vectorial force */
366 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
367 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
368 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
370 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
371 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
372 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
383 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
385 /* EWALD ELECTROSTATICS */
387 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
389 itab_tmp = _fjsp_dtox_v2r8(ewrt);
390 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
391 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
393 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
394 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
395 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
396 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
397 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
398 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
399 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
400 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
401 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
402 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
404 d = _fjsp_sub_v2r8(r01,rswitch);
405 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
406 d2 = _fjsp_mul_v2r8(d,d);
407 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
409 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
411 /* Evaluate switch function */
412 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
413 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
414 velec = _fjsp_mul_v2r8(velec,sw);
415 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velec = _fjsp_and_v2r8(velec,cutoff_mask);
419 velecsum = _fjsp_add_v2r8(velecsum,velec);
423 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
425 /* Update vectorial force */
426 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
427 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
428 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
430 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
431 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
432 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
443 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
445 /* EWALD ELECTROSTATICS */
447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
448 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
449 itab_tmp = _fjsp_dtox_v2r8(ewrt);
450 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
451 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
453 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
454 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
455 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
456 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
457 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
458 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
459 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
460 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
461 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
462 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
464 d = _fjsp_sub_v2r8(r02,rswitch);
465 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
466 d2 = _fjsp_mul_v2r8(d,d);
467 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
469 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
471 /* Evaluate switch function */
472 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
473 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
474 velec = _fjsp_mul_v2r8(velec,sw);
475 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velec = _fjsp_and_v2r8(velec,cutoff_mask);
479 velecsum = _fjsp_add_v2r8(velecsum,velec);
483 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
485 /* Update vectorial force */
486 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
487 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
488 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
490 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
491 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
492 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
496 /**************************
497 * CALCULATE INTERACTIONS *
498 **************************/
500 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
503 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
505 /* EWALD ELECTROSTATICS */
507 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
508 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
509 itab_tmp = _fjsp_dtox_v2r8(ewrt);
510 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
511 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
513 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
514 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
515 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
516 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
517 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
518 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
519 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
520 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
521 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
522 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
524 d = _fjsp_sub_v2r8(r10,rswitch);
525 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
526 d2 = _fjsp_mul_v2r8(d,d);
527 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
529 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
531 /* Evaluate switch function */
532 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
533 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
534 velec = _fjsp_mul_v2r8(velec,sw);
535 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velec = _fjsp_and_v2r8(velec,cutoff_mask);
539 velecsum = _fjsp_add_v2r8(velecsum,velec);
543 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
545 /* Update vectorial force */
546 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
547 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
548 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
550 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
551 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
552 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
563 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
565 /* EWALD ELECTROSTATICS */
567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
568 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
569 itab_tmp = _fjsp_dtox_v2r8(ewrt);
570 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
571 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
573 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
574 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
575 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
576 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
577 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
578 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
579 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
580 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
581 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
582 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
584 d = _fjsp_sub_v2r8(r11,rswitch);
585 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
586 d2 = _fjsp_mul_v2r8(d,d);
587 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
589 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
591 /* Evaluate switch function */
592 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
593 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
594 velec = _fjsp_mul_v2r8(velec,sw);
595 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
597 /* Update potential sum for this i atom from the interaction with this j atom. */
598 velec = _fjsp_and_v2r8(velec,cutoff_mask);
599 velecsum = _fjsp_add_v2r8(velecsum,velec);
603 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
605 /* Update vectorial force */
606 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
607 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
608 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
610 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
611 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
612 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
616 /**************************
617 * CALCULATE INTERACTIONS *
618 **************************/
620 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
623 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
625 /* EWALD ELECTROSTATICS */
627 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
628 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
629 itab_tmp = _fjsp_dtox_v2r8(ewrt);
630 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
631 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
633 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
634 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
635 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
636 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
637 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
638 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
639 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
640 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
641 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
642 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
644 d = _fjsp_sub_v2r8(r12,rswitch);
645 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
646 d2 = _fjsp_mul_v2r8(d,d);
647 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
649 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
651 /* Evaluate switch function */
652 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
653 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
654 velec = _fjsp_mul_v2r8(velec,sw);
655 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velec = _fjsp_and_v2r8(velec,cutoff_mask);
659 velecsum = _fjsp_add_v2r8(velecsum,velec);
663 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
665 /* Update vectorial force */
666 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
667 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
668 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
670 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
671 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
672 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
683 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
685 /* EWALD ELECTROSTATICS */
687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
689 itab_tmp = _fjsp_dtox_v2r8(ewrt);
690 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
691 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
693 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
694 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
695 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
696 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
697 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
698 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
699 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
700 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
701 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
702 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
704 d = _fjsp_sub_v2r8(r20,rswitch);
705 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
706 d2 = _fjsp_mul_v2r8(d,d);
707 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
709 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
711 /* Evaluate switch function */
712 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
713 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
714 velec = _fjsp_mul_v2r8(velec,sw);
715 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
717 /* Update potential sum for this i atom from the interaction with this j atom. */
718 velec = _fjsp_and_v2r8(velec,cutoff_mask);
719 velecsum = _fjsp_add_v2r8(velecsum,velec);
723 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
725 /* Update vectorial force */
726 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
727 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
728 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
730 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
731 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
732 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
736 /**************************
737 * CALCULATE INTERACTIONS *
738 **************************/
740 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
743 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
745 /* EWALD ELECTROSTATICS */
747 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
748 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
749 itab_tmp = _fjsp_dtox_v2r8(ewrt);
750 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
751 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
753 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
754 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
755 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
756 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
757 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
758 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
759 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
760 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
761 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
762 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
764 d = _fjsp_sub_v2r8(r21,rswitch);
765 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
766 d2 = _fjsp_mul_v2r8(d,d);
767 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
769 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
771 /* Evaluate switch function */
772 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
773 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
774 velec = _fjsp_mul_v2r8(velec,sw);
775 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
777 /* Update potential sum for this i atom from the interaction with this j atom. */
778 velec = _fjsp_and_v2r8(velec,cutoff_mask);
779 velecsum = _fjsp_add_v2r8(velecsum,velec);
783 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
785 /* Update vectorial force */
786 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
787 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
788 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
790 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
791 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
792 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
796 /**************************
797 * CALCULATE INTERACTIONS *
798 **************************/
800 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
803 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
805 /* EWALD ELECTROSTATICS */
807 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
808 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
809 itab_tmp = _fjsp_dtox_v2r8(ewrt);
810 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
811 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
813 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
814 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
815 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
816 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
817 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
818 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
819 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
820 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
821 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
822 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
824 d = _fjsp_sub_v2r8(r22,rswitch);
825 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
826 d2 = _fjsp_mul_v2r8(d,d);
827 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
829 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
831 /* Evaluate switch function */
832 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
833 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
834 velec = _fjsp_mul_v2r8(velec,sw);
835 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _fjsp_and_v2r8(velec,cutoff_mask);
839 velecsum = _fjsp_add_v2r8(velecsum,velec);
843 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
845 /* Update vectorial force */
846 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
847 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
848 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
850 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
851 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
852 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
856 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
858 /* Inner loop uses 630 flops */
865 j_coord_offsetA = DIM*jnrA;
867 /* load j atom coordinates */
868 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
869 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
871 /* Calculate displacement vector */
872 dx00 = _fjsp_sub_v2r8(ix0,jx0);
873 dy00 = _fjsp_sub_v2r8(iy0,jy0);
874 dz00 = _fjsp_sub_v2r8(iz0,jz0);
875 dx01 = _fjsp_sub_v2r8(ix0,jx1);
876 dy01 = _fjsp_sub_v2r8(iy0,jy1);
877 dz01 = _fjsp_sub_v2r8(iz0,jz1);
878 dx02 = _fjsp_sub_v2r8(ix0,jx2);
879 dy02 = _fjsp_sub_v2r8(iy0,jy2);
880 dz02 = _fjsp_sub_v2r8(iz0,jz2);
881 dx10 = _fjsp_sub_v2r8(ix1,jx0);
882 dy10 = _fjsp_sub_v2r8(iy1,jy0);
883 dz10 = _fjsp_sub_v2r8(iz1,jz0);
884 dx11 = _fjsp_sub_v2r8(ix1,jx1);
885 dy11 = _fjsp_sub_v2r8(iy1,jy1);
886 dz11 = _fjsp_sub_v2r8(iz1,jz1);
887 dx12 = _fjsp_sub_v2r8(ix1,jx2);
888 dy12 = _fjsp_sub_v2r8(iy1,jy2);
889 dz12 = _fjsp_sub_v2r8(iz1,jz2);
890 dx20 = _fjsp_sub_v2r8(ix2,jx0);
891 dy20 = _fjsp_sub_v2r8(iy2,jy0);
892 dz20 = _fjsp_sub_v2r8(iz2,jz0);
893 dx21 = _fjsp_sub_v2r8(ix2,jx1);
894 dy21 = _fjsp_sub_v2r8(iy2,jy1);
895 dz21 = _fjsp_sub_v2r8(iz2,jz1);
896 dx22 = _fjsp_sub_v2r8(ix2,jx2);
897 dy22 = _fjsp_sub_v2r8(iy2,jy2);
898 dz22 = _fjsp_sub_v2r8(iz2,jz2);
900 /* Calculate squared distance and things based on it */
901 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
902 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
903 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
904 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
905 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
906 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
907 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
908 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
909 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
911 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
912 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
913 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
914 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
915 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
916 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
917 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
918 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
919 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
921 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
922 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
923 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
924 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
925 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
926 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
927 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
928 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
929 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
931 fjx0 = _fjsp_setzero_v2r8();
932 fjy0 = _fjsp_setzero_v2r8();
933 fjz0 = _fjsp_setzero_v2r8();
934 fjx1 = _fjsp_setzero_v2r8();
935 fjy1 = _fjsp_setzero_v2r8();
936 fjz1 = _fjsp_setzero_v2r8();
937 fjx2 = _fjsp_setzero_v2r8();
938 fjy2 = _fjsp_setzero_v2r8();
939 fjz2 = _fjsp_setzero_v2r8();
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
948 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
950 /* EWALD ELECTROSTATICS */
952 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
953 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
954 itab_tmp = _fjsp_dtox_v2r8(ewrt);
955 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
956 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
958 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
959 ewtabD = _fjsp_setzero_v2r8();
960 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
961 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
962 ewtabFn = _fjsp_setzero_v2r8();
963 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
964 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
965 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
966 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
967 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
969 /* LENNARD-JONES DISPERSION/REPULSION */
971 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
972 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
973 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
974 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
975 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
977 d = _fjsp_sub_v2r8(r00,rswitch);
978 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
979 d2 = _fjsp_mul_v2r8(d,d);
980 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
982 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
984 /* Evaluate switch function */
985 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
986 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
987 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
988 velec = _fjsp_mul_v2r8(velec,sw);
989 vvdw = _fjsp_mul_v2r8(vvdw,sw);
990 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _fjsp_and_v2r8(velec,cutoff_mask);
994 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
995 velecsum = _fjsp_add_v2r8(velecsum,velec);
996 vvdw = _fjsp_and_v2r8(vvdw,cutoff_mask);
997 vvdw = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
998 vvdwsum = _fjsp_add_v2r8(vvdwsum,vvdw);
1000 fscal = _fjsp_add_v2r8(felec,fvdw);
1002 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1004 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1006 /* Update vectorial force */
1007 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1008 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1009 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1011 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1012 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1013 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
1024 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
1026 /* EWALD ELECTROSTATICS */
1028 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1029 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
1030 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1031 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1032 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1034 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1035 ewtabD = _fjsp_setzero_v2r8();
1036 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1037 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1038 ewtabFn = _fjsp_setzero_v2r8();
1039 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1040 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1041 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1042 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
1043 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1045 d = _fjsp_sub_v2r8(r01,rswitch);
1046 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1047 d2 = _fjsp_mul_v2r8(d,d);
1048 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1050 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1052 /* Evaluate switch function */
1053 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1054 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
1055 velec = _fjsp_mul_v2r8(velec,sw);
1056 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1060 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1061 velecsum = _fjsp_add_v2r8(velecsum,velec);
1065 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1067 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1069 /* Update vectorial force */
1070 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
1071 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1072 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1074 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1075 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1076 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
1087 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
1089 /* EWALD ELECTROSTATICS */
1091 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1092 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
1093 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1094 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1095 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1097 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1098 ewtabD = _fjsp_setzero_v2r8();
1099 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1100 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1101 ewtabFn = _fjsp_setzero_v2r8();
1102 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1103 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1104 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1105 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
1106 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1108 d = _fjsp_sub_v2r8(r02,rswitch);
1109 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1110 d2 = _fjsp_mul_v2r8(d,d);
1111 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1113 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1115 /* Evaluate switch function */
1116 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1117 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
1118 velec = _fjsp_mul_v2r8(velec,sw);
1119 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1123 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1124 velecsum = _fjsp_add_v2r8(velecsum,velec);
1128 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1130 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1132 /* Update vectorial force */
1133 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
1134 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1135 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1137 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1138 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1139 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1150 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
1152 /* EWALD ELECTROSTATICS */
1154 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1155 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
1156 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1157 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1158 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1160 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1161 ewtabD = _fjsp_setzero_v2r8();
1162 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1163 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1164 ewtabFn = _fjsp_setzero_v2r8();
1165 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1166 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1167 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1168 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
1169 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
1171 d = _fjsp_sub_v2r8(r10,rswitch);
1172 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1173 d2 = _fjsp_mul_v2r8(d,d);
1174 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1176 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1178 /* Evaluate switch function */
1179 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1180 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
1181 velec = _fjsp_mul_v2r8(velec,sw);
1182 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
1184 /* Update potential sum for this i atom from the interaction with this j atom. */
1185 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1186 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1187 velecsum = _fjsp_add_v2r8(velecsum,velec);
1191 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1193 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1195 /* Update vectorial force */
1196 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
1197 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
1198 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
1200 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
1201 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
1202 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
1213 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
1215 /* EWALD ELECTROSTATICS */
1217 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1218 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
1219 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1220 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1221 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1223 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1224 ewtabD = _fjsp_setzero_v2r8();
1225 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1226 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1227 ewtabFn = _fjsp_setzero_v2r8();
1228 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1229 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1230 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1231 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
1232 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
1234 d = _fjsp_sub_v2r8(r11,rswitch);
1235 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1236 d2 = _fjsp_mul_v2r8(d,d);
1237 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1239 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1241 /* Evaluate switch function */
1242 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1243 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
1244 velec = _fjsp_mul_v2r8(velec,sw);
1245 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
1247 /* Update potential sum for this i atom from the interaction with this j atom. */
1248 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1249 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1250 velecsum = _fjsp_add_v2r8(velecsum,velec);
1254 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1256 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1258 /* Update vectorial force */
1259 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
1260 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
1261 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
1263 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
1264 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
1265 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
1276 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
1278 /* EWALD ELECTROSTATICS */
1280 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1281 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
1282 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1283 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1284 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1286 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1287 ewtabD = _fjsp_setzero_v2r8();
1288 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1289 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1290 ewtabFn = _fjsp_setzero_v2r8();
1291 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1292 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1293 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1294 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
1295 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
1297 d = _fjsp_sub_v2r8(r12,rswitch);
1298 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1299 d2 = _fjsp_mul_v2r8(d,d);
1300 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1302 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1304 /* Evaluate switch function */
1305 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1306 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
1307 velec = _fjsp_mul_v2r8(velec,sw);
1308 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
1310 /* Update potential sum for this i atom from the interaction with this j atom. */
1311 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1312 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1313 velecsum = _fjsp_add_v2r8(velecsum,velec);
1317 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1319 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1321 /* Update vectorial force */
1322 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
1323 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
1324 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
1326 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
1327 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
1328 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
1332 /**************************
1333 * CALCULATE INTERACTIONS *
1334 **************************/
1336 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
1339 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
1341 /* EWALD ELECTROSTATICS */
1343 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1344 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
1345 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1346 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1347 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1349 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1350 ewtabD = _fjsp_setzero_v2r8();
1351 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1352 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1353 ewtabFn = _fjsp_setzero_v2r8();
1354 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1355 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1356 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1357 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
1358 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
1360 d = _fjsp_sub_v2r8(r20,rswitch);
1361 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1362 d2 = _fjsp_mul_v2r8(d,d);
1363 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1365 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1367 /* Evaluate switch function */
1368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1369 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
1370 velec = _fjsp_mul_v2r8(velec,sw);
1371 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
1373 /* Update potential sum for this i atom from the interaction with this j atom. */
1374 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1375 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1376 velecsum = _fjsp_add_v2r8(velecsum,velec);
1380 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1382 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1384 /* Update vectorial force */
1385 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
1386 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
1387 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
1389 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
1390 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
1391 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
1395 /**************************
1396 * CALCULATE INTERACTIONS *
1397 **************************/
1399 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
1402 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
1404 /* EWALD ELECTROSTATICS */
1406 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1407 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
1408 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1409 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1410 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1412 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1413 ewtabD = _fjsp_setzero_v2r8();
1414 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1415 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1416 ewtabFn = _fjsp_setzero_v2r8();
1417 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1418 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1419 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1420 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
1421 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
1423 d = _fjsp_sub_v2r8(r21,rswitch);
1424 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1425 d2 = _fjsp_mul_v2r8(d,d);
1426 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1428 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1430 /* Evaluate switch function */
1431 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1432 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
1433 velec = _fjsp_mul_v2r8(velec,sw);
1434 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
1436 /* Update potential sum for this i atom from the interaction with this j atom. */
1437 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1438 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1439 velecsum = _fjsp_add_v2r8(velecsum,velec);
1443 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1445 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1447 /* Update vectorial force */
1448 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
1449 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
1450 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
1452 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
1453 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
1454 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
1458 /**************************
1459 * CALCULATE INTERACTIONS *
1460 **************************/
1462 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
1465 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
1467 /* EWALD ELECTROSTATICS */
1469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1470 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
1471 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1472 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1473 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1475 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1476 ewtabD = _fjsp_setzero_v2r8();
1477 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1478 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1479 ewtabFn = _fjsp_setzero_v2r8();
1480 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1481 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1482 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1483 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
1484 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
1486 d = _fjsp_sub_v2r8(r22,rswitch);
1487 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1488 d2 = _fjsp_mul_v2r8(d,d);
1489 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1491 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1493 /* Evaluate switch function */
1494 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1495 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
1496 velec = _fjsp_mul_v2r8(velec,sw);
1497 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
1499 /* Update potential sum for this i atom from the interaction with this j atom. */
1500 velec = _fjsp_and_v2r8(velec,cutoff_mask);
1501 velec = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
1502 velecsum = _fjsp_add_v2r8(velecsum,velec);
1506 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1508 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
1510 /* Update vectorial force */
1511 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
1512 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
1513 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
1515 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
1516 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
1517 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
1521 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1523 /* Inner loop uses 630 flops */
1526 /* End of innermost loop */
1528 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1529 f+i_coord_offset,fshift+i_shift_offset);
1532 /* Update potential energies */
1533 gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
1534 gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
1536 /* Increment number of inner iterations */
1537 inneriter += j_index_end - j_index_start;
1539 /* Outer loop uses 20 flops */
1542 /* Increment number of outer iterations */
1545 /* Update outer/inner flops */
1547 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*630);
1550 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
1551 * Electrostatics interaction: Ewald
1552 * VdW interaction: LennardJones
1553 * Geometry: Water3-Water3
1554 * Calculate force/pot: Force
1557 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
1558 (t_nblist * gmx_restrict nlist,
1559 rvec * gmx_restrict xx,
1560 rvec * gmx_restrict ff,
1561 t_forcerec * gmx_restrict fr,
1562 t_mdatoms * gmx_restrict mdatoms,
1563 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1564 t_nrnb * gmx_restrict nrnb)
1566 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1567 * just 0 for non-waters.
1568 * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
1569 * jnr indices corresponding to data put in the two positions in the SIMD register.
1571 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1572 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1574 int j_coord_offsetA,j_coord_offsetB;
1575 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1576 real rcutoff_scalar;
1577 real *shiftvec,*fshift,*x,*f;
1578 _fjsp_v2r8 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1580 _fjsp_v2r8 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1582 _fjsp_v2r8 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1584 _fjsp_v2r8 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1585 int vdwjidx0A,vdwjidx0B;
1586 _fjsp_v2r8 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1587 int vdwjidx1A,vdwjidx1B;
1588 _fjsp_v2r8 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1589 int vdwjidx2A,vdwjidx2B;
1590 _fjsp_v2r8 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1591 _fjsp_v2r8 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1592 _fjsp_v2r8 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1593 _fjsp_v2r8 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1594 _fjsp_v2r8 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1595 _fjsp_v2r8 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1596 _fjsp_v2r8 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1597 _fjsp_v2r8 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1598 _fjsp_v2r8 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1599 _fjsp_v2r8 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1600 _fjsp_v2r8 velec,felec,velecsum,facel,crf,krf,krf2;
1603 _fjsp_v2r8 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1606 _fjsp_v2r8 one_sixth = gmx_fjsp_set1_v2r8(1.0/6.0);
1607 _fjsp_v2r8 one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
1608 _fjsp_v2r8 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1610 _fjsp_v2r8 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1611 real rswitch_scalar,d_scalar;
1612 _fjsp_v2r8 itab_tmp;
1613 _fjsp_v2r8 dummy_mask,cutoff_mask;
1614 _fjsp_v2r8 one = gmx_fjsp_set1_v2r8(1.0);
1615 _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
1616 union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
1623 jindex = nlist->jindex;
1625 shiftidx = nlist->shift;
1627 shiftvec = fr->shift_vec[0];
1628 fshift = fr->fshift[0];
1629 facel = gmx_fjsp_set1_v2r8(fr->epsfac);
1630 charge = mdatoms->chargeA;
1631 nvdwtype = fr->ntype;
1632 vdwparam = fr->nbfp;
1633 vdwtype = mdatoms->typeA;
1635 sh_ewald = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
1636 ewtab = fr->ic->tabq_coul_FDV0;
1637 ewtabscale = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
1638 ewtabhalfspace = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
1640 /* Setup water-specific parameters */
1641 inr = nlist->iinr[0];
1642 iq0 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
1643 iq1 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
1644 iq2 = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
1645 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1647 jq0 = gmx_fjsp_set1_v2r8(charge[inr+0]);
1648 jq1 = gmx_fjsp_set1_v2r8(charge[inr+1]);
1649 jq2 = gmx_fjsp_set1_v2r8(charge[inr+2]);
1650 vdwjidx0A = 2*vdwtype[inr+0];
1651 qq00 = _fjsp_mul_v2r8(iq0,jq0);
1652 c6_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
1653 c12_00 = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
1654 qq01 = _fjsp_mul_v2r8(iq0,jq1);
1655 qq02 = _fjsp_mul_v2r8(iq0,jq2);
1656 qq10 = _fjsp_mul_v2r8(iq1,jq0);
1657 qq11 = _fjsp_mul_v2r8(iq1,jq1);
1658 qq12 = _fjsp_mul_v2r8(iq1,jq2);
1659 qq20 = _fjsp_mul_v2r8(iq2,jq0);
1660 qq21 = _fjsp_mul_v2r8(iq2,jq1);
1661 qq22 = _fjsp_mul_v2r8(iq2,jq2);
1663 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1664 rcutoff_scalar = fr->rcoulomb;
1665 rcutoff = gmx_fjsp_set1_v2r8(rcutoff_scalar);
1666 rcutoff2 = _fjsp_mul_v2r8(rcutoff,rcutoff);
1668 rswitch_scalar = fr->rcoulomb_switch;
1669 rswitch = gmx_fjsp_set1_v2r8(rswitch_scalar);
1670 /* Setup switch parameters */
1671 d_scalar = rcutoff_scalar-rswitch_scalar;
1672 d = gmx_fjsp_set1_v2r8(d_scalar);
1673 swV3 = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
1674 swV4 = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1675 swV5 = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1676 swF2 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
1677 swF3 = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1678 swF4 = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1680 /* Avoid stupid compiler warnings */
1682 j_coord_offsetA = 0;
1683 j_coord_offsetB = 0;
1688 /* Start outer loop over neighborlists */
1689 for(iidx=0; iidx<nri; iidx++)
1691 /* Load shift vector for this list */
1692 i_shift_offset = DIM*shiftidx[iidx];
1694 /* Load limits for loop over neighbors */
1695 j_index_start = jindex[iidx];
1696 j_index_end = jindex[iidx+1];
1698 /* Get outer coordinate index */
1700 i_coord_offset = DIM*inr;
1702 /* Load i particle coords and add shift vector */
1703 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
1704 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1706 fix0 = _fjsp_setzero_v2r8();
1707 fiy0 = _fjsp_setzero_v2r8();
1708 fiz0 = _fjsp_setzero_v2r8();
1709 fix1 = _fjsp_setzero_v2r8();
1710 fiy1 = _fjsp_setzero_v2r8();
1711 fiz1 = _fjsp_setzero_v2r8();
1712 fix2 = _fjsp_setzero_v2r8();
1713 fiy2 = _fjsp_setzero_v2r8();
1714 fiz2 = _fjsp_setzero_v2r8();
1716 /* Start inner kernel loop */
1717 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1720 /* Get j neighbor index, and coordinate index */
1722 jnrB = jjnr[jidx+1];
1723 j_coord_offsetA = DIM*jnrA;
1724 j_coord_offsetB = DIM*jnrB;
1726 /* load j atom coordinates */
1727 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
1728 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1730 /* Calculate displacement vector */
1731 dx00 = _fjsp_sub_v2r8(ix0,jx0);
1732 dy00 = _fjsp_sub_v2r8(iy0,jy0);
1733 dz00 = _fjsp_sub_v2r8(iz0,jz0);
1734 dx01 = _fjsp_sub_v2r8(ix0,jx1);
1735 dy01 = _fjsp_sub_v2r8(iy0,jy1);
1736 dz01 = _fjsp_sub_v2r8(iz0,jz1);
1737 dx02 = _fjsp_sub_v2r8(ix0,jx2);
1738 dy02 = _fjsp_sub_v2r8(iy0,jy2);
1739 dz02 = _fjsp_sub_v2r8(iz0,jz2);
1740 dx10 = _fjsp_sub_v2r8(ix1,jx0);
1741 dy10 = _fjsp_sub_v2r8(iy1,jy0);
1742 dz10 = _fjsp_sub_v2r8(iz1,jz0);
1743 dx11 = _fjsp_sub_v2r8(ix1,jx1);
1744 dy11 = _fjsp_sub_v2r8(iy1,jy1);
1745 dz11 = _fjsp_sub_v2r8(iz1,jz1);
1746 dx12 = _fjsp_sub_v2r8(ix1,jx2);
1747 dy12 = _fjsp_sub_v2r8(iy1,jy2);
1748 dz12 = _fjsp_sub_v2r8(iz1,jz2);
1749 dx20 = _fjsp_sub_v2r8(ix2,jx0);
1750 dy20 = _fjsp_sub_v2r8(iy2,jy0);
1751 dz20 = _fjsp_sub_v2r8(iz2,jz0);
1752 dx21 = _fjsp_sub_v2r8(ix2,jx1);
1753 dy21 = _fjsp_sub_v2r8(iy2,jy1);
1754 dz21 = _fjsp_sub_v2r8(iz2,jz1);
1755 dx22 = _fjsp_sub_v2r8(ix2,jx2);
1756 dy22 = _fjsp_sub_v2r8(iy2,jy2);
1757 dz22 = _fjsp_sub_v2r8(iz2,jz2);
1759 /* Calculate squared distance and things based on it */
1760 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
1761 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
1762 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
1763 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
1764 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
1765 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
1766 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
1767 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
1768 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
1770 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
1771 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
1772 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
1773 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
1774 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
1775 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
1776 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
1777 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
1778 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
1780 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
1781 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
1782 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
1783 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
1784 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
1785 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
1786 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
1787 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
1788 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
1790 fjx0 = _fjsp_setzero_v2r8();
1791 fjy0 = _fjsp_setzero_v2r8();
1792 fjz0 = _fjsp_setzero_v2r8();
1793 fjx1 = _fjsp_setzero_v2r8();
1794 fjy1 = _fjsp_setzero_v2r8();
1795 fjz1 = _fjsp_setzero_v2r8();
1796 fjx2 = _fjsp_setzero_v2r8();
1797 fjy2 = _fjsp_setzero_v2r8();
1798 fjz2 = _fjsp_setzero_v2r8();
1800 /**************************
1801 * CALCULATE INTERACTIONS *
1802 **************************/
1804 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
1807 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
1809 /* EWALD ELECTROSTATICS */
1811 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1812 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
1813 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1814 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1815 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1817 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1818 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1819 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1820 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1821 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1822 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1823 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1824 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1825 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
1826 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
1828 /* LENNARD-JONES DISPERSION/REPULSION */
1830 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
1831 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
1832 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
1833 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
1834 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
1836 d = _fjsp_sub_v2r8(r00,rswitch);
1837 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1838 d2 = _fjsp_mul_v2r8(d,d);
1839 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1841 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1843 /* Evaluate switch function */
1844 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1845 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
1846 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
1847 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
1849 fscal = _fjsp_add_v2r8(felec,fvdw);
1851 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1853 /* Update vectorial force */
1854 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
1855 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
1856 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
1858 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
1859 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
1860 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
1864 /**************************
1865 * CALCULATE INTERACTIONS *
1866 **************************/
1868 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
1871 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
1873 /* EWALD ELECTROSTATICS */
1875 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1876 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
1877 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1878 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1879 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1881 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1882 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1883 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1884 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1885 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1886 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1887 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1888 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1889 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
1890 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
1892 d = _fjsp_sub_v2r8(r01,rswitch);
1893 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1894 d2 = _fjsp_mul_v2r8(d,d);
1895 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1897 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1899 /* Evaluate switch function */
1900 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1901 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
1902 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
1906 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1908 /* Update vectorial force */
1909 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
1910 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
1911 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
1913 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
1914 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
1915 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
1919 /**************************
1920 * CALCULATE INTERACTIONS *
1921 **************************/
1923 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
1926 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
1928 /* EWALD ELECTROSTATICS */
1930 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1931 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
1932 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1933 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1934 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1936 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1937 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1938 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1939 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1940 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1941 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1942 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1943 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1944 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
1945 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
1947 d = _fjsp_sub_v2r8(r02,rswitch);
1948 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
1949 d2 = _fjsp_mul_v2r8(d,d);
1950 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
1952 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
1954 /* Evaluate switch function */
1955 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1956 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
1957 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
1961 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
1963 /* Update vectorial force */
1964 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
1965 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
1966 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
1968 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
1969 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
1970 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
1974 /**************************
1975 * CALCULATE INTERACTIONS *
1976 **************************/
1978 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
1981 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
1983 /* EWALD ELECTROSTATICS */
1985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
1987 itab_tmp = _fjsp_dtox_v2r8(ewrt);
1988 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
1989 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
1991 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
1992 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
1993 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
1994 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
1995 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
1996 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
1997 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
1998 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
1999 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
2000 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
2002 d = _fjsp_sub_v2r8(r10,rswitch);
2003 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2004 d2 = _fjsp_mul_v2r8(d,d);
2005 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2007 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2009 /* Evaluate switch function */
2010 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2011 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
2012 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
2016 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2018 /* Update vectorial force */
2019 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
2020 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
2021 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
2023 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
2024 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
2025 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
2029 /**************************
2030 * CALCULATE INTERACTIONS *
2031 **************************/
2033 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2036 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
2038 /* EWALD ELECTROSTATICS */
2040 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2041 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
2042 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2043 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2044 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2046 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2047 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2048 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2049 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2050 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2051 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2052 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2053 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2054 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
2055 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2057 d = _fjsp_sub_v2r8(r11,rswitch);
2058 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2059 d2 = _fjsp_mul_v2r8(d,d);
2060 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2062 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2064 /* Evaluate switch function */
2065 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2066 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2067 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2071 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2073 /* Update vectorial force */
2074 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
2075 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2076 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2078 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2079 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2080 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2084 /**************************
2085 * CALCULATE INTERACTIONS *
2086 **************************/
2088 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2091 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
2093 /* EWALD ELECTROSTATICS */
2095 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2096 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
2097 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2098 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2099 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2101 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2102 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2103 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2104 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2105 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2106 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2107 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2108 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2109 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2110 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2112 d = _fjsp_sub_v2r8(r12,rswitch);
2113 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2114 d2 = _fjsp_mul_v2r8(d,d);
2115 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2117 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2119 /* Evaluate switch function */
2120 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2121 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2122 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2126 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2128 /* Update vectorial force */
2129 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
2130 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2131 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2133 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2134 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2135 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2139 /**************************
2140 * CALCULATE INTERACTIONS *
2141 **************************/
2143 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
2146 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
2148 /* EWALD ELECTROSTATICS */
2150 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2151 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
2152 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2153 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2154 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2156 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2157 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2158 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2159 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2160 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2161 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2162 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2163 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2164 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
2165 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2167 d = _fjsp_sub_v2r8(r20,rswitch);
2168 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2169 d2 = _fjsp_mul_v2r8(d,d);
2170 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2172 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2174 /* Evaluate switch function */
2175 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2176 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
2177 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
2181 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2183 /* Update vectorial force */
2184 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
2185 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2186 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2188 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2189 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2190 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2194 /**************************
2195 * CALCULATE INTERACTIONS *
2196 **************************/
2198 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2201 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
2203 /* EWALD ELECTROSTATICS */
2205 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2206 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
2207 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2208 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2209 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2211 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2212 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2213 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2214 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2215 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2216 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2217 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2218 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2219 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2220 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2222 d = _fjsp_sub_v2r8(r21,rswitch);
2223 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2224 d2 = _fjsp_mul_v2r8(d,d);
2225 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2227 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2229 /* Evaluate switch function */
2230 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2231 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2232 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2236 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2238 /* Update vectorial force */
2239 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
2240 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2241 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2243 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2244 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2245 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2249 /**************************
2250 * CALCULATE INTERACTIONS *
2251 **************************/
2253 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2256 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
2258 /* EWALD ELECTROSTATICS */
2260 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2261 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
2262 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2263 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2264 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2266 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2267 ewtabD = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
2268 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2269 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2270 ewtabFn = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
2271 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2272 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2273 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2274 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2275 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2277 d = _fjsp_sub_v2r8(r22,rswitch);
2278 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2279 d2 = _fjsp_mul_v2r8(d,d);
2280 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2282 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2284 /* Evaluate switch function */
2285 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2286 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2287 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2291 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2293 /* Update vectorial force */
2294 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
2295 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2296 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2298 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2299 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2300 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2304 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2306 /* Inner loop uses 600 flops */
2309 if(jidx<j_index_end)
2313 j_coord_offsetA = DIM*jnrA;
2315 /* load j atom coordinates */
2316 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
2317 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2319 /* Calculate displacement vector */
2320 dx00 = _fjsp_sub_v2r8(ix0,jx0);
2321 dy00 = _fjsp_sub_v2r8(iy0,jy0);
2322 dz00 = _fjsp_sub_v2r8(iz0,jz0);
2323 dx01 = _fjsp_sub_v2r8(ix0,jx1);
2324 dy01 = _fjsp_sub_v2r8(iy0,jy1);
2325 dz01 = _fjsp_sub_v2r8(iz0,jz1);
2326 dx02 = _fjsp_sub_v2r8(ix0,jx2);
2327 dy02 = _fjsp_sub_v2r8(iy0,jy2);
2328 dz02 = _fjsp_sub_v2r8(iz0,jz2);
2329 dx10 = _fjsp_sub_v2r8(ix1,jx0);
2330 dy10 = _fjsp_sub_v2r8(iy1,jy0);
2331 dz10 = _fjsp_sub_v2r8(iz1,jz0);
2332 dx11 = _fjsp_sub_v2r8(ix1,jx1);
2333 dy11 = _fjsp_sub_v2r8(iy1,jy1);
2334 dz11 = _fjsp_sub_v2r8(iz1,jz1);
2335 dx12 = _fjsp_sub_v2r8(ix1,jx2);
2336 dy12 = _fjsp_sub_v2r8(iy1,jy2);
2337 dz12 = _fjsp_sub_v2r8(iz1,jz2);
2338 dx20 = _fjsp_sub_v2r8(ix2,jx0);
2339 dy20 = _fjsp_sub_v2r8(iy2,jy0);
2340 dz20 = _fjsp_sub_v2r8(iz2,jz0);
2341 dx21 = _fjsp_sub_v2r8(ix2,jx1);
2342 dy21 = _fjsp_sub_v2r8(iy2,jy1);
2343 dz21 = _fjsp_sub_v2r8(iz2,jz1);
2344 dx22 = _fjsp_sub_v2r8(ix2,jx2);
2345 dy22 = _fjsp_sub_v2r8(iy2,jy2);
2346 dz22 = _fjsp_sub_v2r8(iz2,jz2);
2348 /* Calculate squared distance and things based on it */
2349 rsq00 = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
2350 rsq01 = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
2351 rsq02 = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
2352 rsq10 = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
2353 rsq11 = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
2354 rsq12 = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
2355 rsq20 = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
2356 rsq21 = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
2357 rsq22 = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
2359 rinv00 = gmx_fjsp_invsqrt_v2r8(rsq00);
2360 rinv01 = gmx_fjsp_invsqrt_v2r8(rsq01);
2361 rinv02 = gmx_fjsp_invsqrt_v2r8(rsq02);
2362 rinv10 = gmx_fjsp_invsqrt_v2r8(rsq10);
2363 rinv11 = gmx_fjsp_invsqrt_v2r8(rsq11);
2364 rinv12 = gmx_fjsp_invsqrt_v2r8(rsq12);
2365 rinv20 = gmx_fjsp_invsqrt_v2r8(rsq20);
2366 rinv21 = gmx_fjsp_invsqrt_v2r8(rsq21);
2367 rinv22 = gmx_fjsp_invsqrt_v2r8(rsq22);
2369 rinvsq00 = _fjsp_mul_v2r8(rinv00,rinv00);
2370 rinvsq01 = _fjsp_mul_v2r8(rinv01,rinv01);
2371 rinvsq02 = _fjsp_mul_v2r8(rinv02,rinv02);
2372 rinvsq10 = _fjsp_mul_v2r8(rinv10,rinv10);
2373 rinvsq11 = _fjsp_mul_v2r8(rinv11,rinv11);
2374 rinvsq12 = _fjsp_mul_v2r8(rinv12,rinv12);
2375 rinvsq20 = _fjsp_mul_v2r8(rinv20,rinv20);
2376 rinvsq21 = _fjsp_mul_v2r8(rinv21,rinv21);
2377 rinvsq22 = _fjsp_mul_v2r8(rinv22,rinv22);
2379 fjx0 = _fjsp_setzero_v2r8();
2380 fjy0 = _fjsp_setzero_v2r8();
2381 fjz0 = _fjsp_setzero_v2r8();
2382 fjx1 = _fjsp_setzero_v2r8();
2383 fjy1 = _fjsp_setzero_v2r8();
2384 fjz1 = _fjsp_setzero_v2r8();
2385 fjx2 = _fjsp_setzero_v2r8();
2386 fjy2 = _fjsp_setzero_v2r8();
2387 fjz2 = _fjsp_setzero_v2r8();
2389 /**************************
2390 * CALCULATE INTERACTIONS *
2391 **************************/
2393 if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
2396 r00 = _fjsp_mul_v2r8(rsq00,rinv00);
2398 /* EWALD ELECTROSTATICS */
2400 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2401 ewrt = _fjsp_mul_v2r8(r00,ewtabscale);
2402 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2403 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2404 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2406 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2407 ewtabD = _fjsp_setzero_v2r8();
2408 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2409 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2410 ewtabFn = _fjsp_setzero_v2r8();
2411 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2412 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2413 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2414 velec = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
2415 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
2417 /* LENNARD-JONES DISPERSION/REPULSION */
2419 rinvsix = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
2420 vvdw6 = _fjsp_mul_v2r8(c6_00,rinvsix);
2421 vvdw12 = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
2422 vvdw = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
2423 fvdw = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
2425 d = _fjsp_sub_v2r8(r00,rswitch);
2426 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2427 d2 = _fjsp_mul_v2r8(d,d);
2428 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2430 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2432 /* Evaluate switch function */
2433 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2434 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
2435 fvdw = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
2436 cutoff_mask = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
2438 fscal = _fjsp_add_v2r8(felec,fvdw);
2440 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2442 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2444 /* Update vectorial force */
2445 fix0 = _fjsp_madd_v2r8(dx00,fscal,fix0);
2446 fiy0 = _fjsp_madd_v2r8(dy00,fscal,fiy0);
2447 fiz0 = _fjsp_madd_v2r8(dz00,fscal,fiz0);
2449 fjx0 = _fjsp_madd_v2r8(dx00,fscal,fjx0);
2450 fjy0 = _fjsp_madd_v2r8(dy00,fscal,fjy0);
2451 fjz0 = _fjsp_madd_v2r8(dz00,fscal,fjz0);
2455 /**************************
2456 * CALCULATE INTERACTIONS *
2457 **************************/
2459 if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
2462 r01 = _fjsp_mul_v2r8(rsq01,rinv01);
2464 /* EWALD ELECTROSTATICS */
2466 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2467 ewrt = _fjsp_mul_v2r8(r01,ewtabscale);
2468 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2469 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2470 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2472 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2473 ewtabD = _fjsp_setzero_v2r8();
2474 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2475 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2476 ewtabFn = _fjsp_setzero_v2r8();
2477 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2478 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2479 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2480 velec = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
2481 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
2483 d = _fjsp_sub_v2r8(r01,rswitch);
2484 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2485 d2 = _fjsp_mul_v2r8(d,d);
2486 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2488 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2490 /* Evaluate switch function */
2491 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2492 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
2493 cutoff_mask = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
2497 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2499 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2501 /* Update vectorial force */
2502 fix0 = _fjsp_madd_v2r8(dx01,fscal,fix0);
2503 fiy0 = _fjsp_madd_v2r8(dy01,fscal,fiy0);
2504 fiz0 = _fjsp_madd_v2r8(dz01,fscal,fiz0);
2506 fjx1 = _fjsp_madd_v2r8(dx01,fscal,fjx1);
2507 fjy1 = _fjsp_madd_v2r8(dy01,fscal,fjy1);
2508 fjz1 = _fjsp_madd_v2r8(dz01,fscal,fjz1);
2512 /**************************
2513 * CALCULATE INTERACTIONS *
2514 **************************/
2516 if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
2519 r02 = _fjsp_mul_v2r8(rsq02,rinv02);
2521 /* EWALD ELECTROSTATICS */
2523 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2524 ewrt = _fjsp_mul_v2r8(r02,ewtabscale);
2525 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2526 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2527 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2529 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2530 ewtabD = _fjsp_setzero_v2r8();
2531 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2532 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2533 ewtabFn = _fjsp_setzero_v2r8();
2534 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2535 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2536 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2537 velec = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
2538 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
2540 d = _fjsp_sub_v2r8(r02,rswitch);
2541 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2542 d2 = _fjsp_mul_v2r8(d,d);
2543 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2545 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2547 /* Evaluate switch function */
2548 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2549 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
2550 cutoff_mask = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
2554 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2556 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2558 /* Update vectorial force */
2559 fix0 = _fjsp_madd_v2r8(dx02,fscal,fix0);
2560 fiy0 = _fjsp_madd_v2r8(dy02,fscal,fiy0);
2561 fiz0 = _fjsp_madd_v2r8(dz02,fscal,fiz0);
2563 fjx2 = _fjsp_madd_v2r8(dx02,fscal,fjx2);
2564 fjy2 = _fjsp_madd_v2r8(dy02,fscal,fjy2);
2565 fjz2 = _fjsp_madd_v2r8(dz02,fscal,fjz2);
2569 /**************************
2570 * CALCULATE INTERACTIONS *
2571 **************************/
2573 if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
2576 r10 = _fjsp_mul_v2r8(rsq10,rinv10);
2578 /* EWALD ELECTROSTATICS */
2580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2581 ewrt = _fjsp_mul_v2r8(r10,ewtabscale);
2582 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2583 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2584 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2586 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2587 ewtabD = _fjsp_setzero_v2r8();
2588 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2589 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2590 ewtabFn = _fjsp_setzero_v2r8();
2591 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2592 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2593 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2594 velec = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
2595 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
2597 d = _fjsp_sub_v2r8(r10,rswitch);
2598 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2599 d2 = _fjsp_mul_v2r8(d,d);
2600 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2602 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2604 /* Evaluate switch function */
2605 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2606 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
2607 cutoff_mask = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
2611 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2613 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2615 /* Update vectorial force */
2616 fix1 = _fjsp_madd_v2r8(dx10,fscal,fix1);
2617 fiy1 = _fjsp_madd_v2r8(dy10,fscal,fiy1);
2618 fiz1 = _fjsp_madd_v2r8(dz10,fscal,fiz1);
2620 fjx0 = _fjsp_madd_v2r8(dx10,fscal,fjx0);
2621 fjy0 = _fjsp_madd_v2r8(dy10,fscal,fjy0);
2622 fjz0 = _fjsp_madd_v2r8(dz10,fscal,fjz0);
2626 /**************************
2627 * CALCULATE INTERACTIONS *
2628 **************************/
2630 if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
2633 r11 = _fjsp_mul_v2r8(rsq11,rinv11);
2635 /* EWALD ELECTROSTATICS */
2637 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2638 ewrt = _fjsp_mul_v2r8(r11,ewtabscale);
2639 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2640 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2641 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2643 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2644 ewtabD = _fjsp_setzero_v2r8();
2645 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2646 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2647 ewtabFn = _fjsp_setzero_v2r8();
2648 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2649 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2650 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2651 velec = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
2652 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
2654 d = _fjsp_sub_v2r8(r11,rswitch);
2655 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2656 d2 = _fjsp_mul_v2r8(d,d);
2657 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2659 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2661 /* Evaluate switch function */
2662 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2663 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
2664 cutoff_mask = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
2668 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2670 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2672 /* Update vectorial force */
2673 fix1 = _fjsp_madd_v2r8(dx11,fscal,fix1);
2674 fiy1 = _fjsp_madd_v2r8(dy11,fscal,fiy1);
2675 fiz1 = _fjsp_madd_v2r8(dz11,fscal,fiz1);
2677 fjx1 = _fjsp_madd_v2r8(dx11,fscal,fjx1);
2678 fjy1 = _fjsp_madd_v2r8(dy11,fscal,fjy1);
2679 fjz1 = _fjsp_madd_v2r8(dz11,fscal,fjz1);
2683 /**************************
2684 * CALCULATE INTERACTIONS *
2685 **************************/
2687 if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
2690 r12 = _fjsp_mul_v2r8(rsq12,rinv12);
2692 /* EWALD ELECTROSTATICS */
2694 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2695 ewrt = _fjsp_mul_v2r8(r12,ewtabscale);
2696 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2697 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2698 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2700 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2701 ewtabD = _fjsp_setzero_v2r8();
2702 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2703 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2704 ewtabFn = _fjsp_setzero_v2r8();
2705 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2706 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2707 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2708 velec = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
2709 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
2711 d = _fjsp_sub_v2r8(r12,rswitch);
2712 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2713 d2 = _fjsp_mul_v2r8(d,d);
2714 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2716 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2718 /* Evaluate switch function */
2719 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2720 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
2721 cutoff_mask = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
2725 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2727 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2729 /* Update vectorial force */
2730 fix1 = _fjsp_madd_v2r8(dx12,fscal,fix1);
2731 fiy1 = _fjsp_madd_v2r8(dy12,fscal,fiy1);
2732 fiz1 = _fjsp_madd_v2r8(dz12,fscal,fiz1);
2734 fjx2 = _fjsp_madd_v2r8(dx12,fscal,fjx2);
2735 fjy2 = _fjsp_madd_v2r8(dy12,fscal,fjy2);
2736 fjz2 = _fjsp_madd_v2r8(dz12,fscal,fjz2);
2740 /**************************
2741 * CALCULATE INTERACTIONS *
2742 **************************/
2744 if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
2747 r20 = _fjsp_mul_v2r8(rsq20,rinv20);
2749 /* EWALD ELECTROSTATICS */
2751 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2752 ewrt = _fjsp_mul_v2r8(r20,ewtabscale);
2753 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2754 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2755 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2757 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2758 ewtabD = _fjsp_setzero_v2r8();
2759 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2760 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2761 ewtabFn = _fjsp_setzero_v2r8();
2762 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2763 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2764 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2765 velec = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
2766 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
2768 d = _fjsp_sub_v2r8(r20,rswitch);
2769 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2770 d2 = _fjsp_mul_v2r8(d,d);
2771 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2773 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2775 /* Evaluate switch function */
2776 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2777 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
2778 cutoff_mask = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
2782 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2784 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2786 /* Update vectorial force */
2787 fix2 = _fjsp_madd_v2r8(dx20,fscal,fix2);
2788 fiy2 = _fjsp_madd_v2r8(dy20,fscal,fiy2);
2789 fiz2 = _fjsp_madd_v2r8(dz20,fscal,fiz2);
2791 fjx0 = _fjsp_madd_v2r8(dx20,fscal,fjx0);
2792 fjy0 = _fjsp_madd_v2r8(dy20,fscal,fjy0);
2793 fjz0 = _fjsp_madd_v2r8(dz20,fscal,fjz0);
2797 /**************************
2798 * CALCULATE INTERACTIONS *
2799 **************************/
2801 if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
2804 r21 = _fjsp_mul_v2r8(rsq21,rinv21);
2806 /* EWALD ELECTROSTATICS */
2808 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2809 ewrt = _fjsp_mul_v2r8(r21,ewtabscale);
2810 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2811 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2812 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2814 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2815 ewtabD = _fjsp_setzero_v2r8();
2816 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2817 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2818 ewtabFn = _fjsp_setzero_v2r8();
2819 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2820 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2821 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2822 velec = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
2823 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
2825 d = _fjsp_sub_v2r8(r21,rswitch);
2826 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2827 d2 = _fjsp_mul_v2r8(d,d);
2828 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2830 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2832 /* Evaluate switch function */
2833 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2834 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
2835 cutoff_mask = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
2839 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2841 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2843 /* Update vectorial force */
2844 fix2 = _fjsp_madd_v2r8(dx21,fscal,fix2);
2845 fiy2 = _fjsp_madd_v2r8(dy21,fscal,fiy2);
2846 fiz2 = _fjsp_madd_v2r8(dz21,fscal,fiz2);
2848 fjx1 = _fjsp_madd_v2r8(dx21,fscal,fjx1);
2849 fjy1 = _fjsp_madd_v2r8(dy21,fscal,fjy1);
2850 fjz1 = _fjsp_madd_v2r8(dz21,fscal,fjz1);
2854 /**************************
2855 * CALCULATE INTERACTIONS *
2856 **************************/
2858 if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
2861 r22 = _fjsp_mul_v2r8(rsq22,rinv22);
2863 /* EWALD ELECTROSTATICS */
2865 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2866 ewrt = _fjsp_mul_v2r8(r22,ewtabscale);
2867 itab_tmp = _fjsp_dtox_v2r8(ewrt);
2868 eweps = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
2869 _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
2871 ewtabF = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
2872 ewtabD = _fjsp_setzero_v2r8();
2873 GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
2874 ewtabV = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
2875 ewtabFn = _fjsp_setzero_v2r8();
2876 GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
2877 felec = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
2878 velec = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
2879 velec = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
2880 felec = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
2882 d = _fjsp_sub_v2r8(r22,rswitch);
2883 d = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
2884 d2 = _fjsp_mul_v2r8(d,d);
2885 sw = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
2887 dsw = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
2889 /* Evaluate switch function */
2890 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2891 felec = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
2892 cutoff_mask = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
2896 fscal = _fjsp_and_v2r8(fscal,cutoff_mask);
2898 fscal = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
2900 /* Update vectorial force */
2901 fix2 = _fjsp_madd_v2r8(dx22,fscal,fix2);
2902 fiy2 = _fjsp_madd_v2r8(dy22,fscal,fiy2);
2903 fiz2 = _fjsp_madd_v2r8(dz22,fscal,fiz2);
2905 fjx2 = _fjsp_madd_v2r8(dx22,fscal,fjx2);
2906 fjy2 = _fjsp_madd_v2r8(dy22,fscal,fjy2);
2907 fjz2 = _fjsp_madd_v2r8(dz22,fscal,fjz2);
2911 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2913 /* Inner loop uses 600 flops */
2916 /* End of innermost loop */
2918 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2919 f+i_coord_offset,fshift+i_shift_offset);
2921 /* Increment number of inner iterations */
2922 inneriter += j_index_end - j_index_start;
2924 /* Outer loop uses 18 flops */
2927 /* Increment number of outer iterations */
2930 /* Update outer/inner flops */
2932 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*600);