Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse4_1_single.c
Location:line 128, column 5
Description:Value stored to 'j_coord_offsetC' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single
54 * Electrostatics interaction: GeneralizedBorn
55 * VdW interaction: None
56 * Geometry: Particle-Particle
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
88 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
91 real *charge;
92 __m128i gbitab;
93 __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
94 __m128 minushalf = _mm_set1_ps(-0.5);
95 real *invsqrta,*dvda,*gbtab;
96 __m128i vfitab;
97 __m128i ifour = _mm_set1_epi32(4);
98 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
99 real *vftab;
100 __m128 dummy_mask,cutoff_mask;
101 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
102 __m128 one = _mm_set1_ps(1.0);
103 __m128 two = _mm_set1_ps(2.0);
104 x = xx[0];
105 f = ff[0];
106
107 nri = nlist->nri;
108 iinr = nlist->iinr;
109 jindex = nlist->jindex;
110 jjnr = nlist->jjnr;
111 shiftidx = nlist->shift;
112 gid = nlist->gid;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_ps(fr->epsfac);
116 charge = mdatoms->chargeA;
117
118 invsqrta = fr->invsqrta;
119 dvda = fr->dvda;
120 gbtabscale = _mm_set1_ps(fr->gbtab.scale);
121 gbtab = fr->gbtab.data;
122 gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
123
124 /* Avoid stupid compiler warnings */
125 jnrA = jnrB = jnrC = jnrD = 0;
126 j_coord_offsetA = 0;
127 j_coord_offsetB = 0;
128 j_coord_offsetC = 0;
Value stored to 'j_coord_offsetC' is never read
129 j_coord_offsetD = 0;
130
131 outeriter = 0;
132 inneriter = 0;
133
134 for(iidx=0;iidx<4*DIM3;iidx++)
135 {
136 scratch[iidx] = 0.0;
137 }
138
139 /* Start outer loop over neighborlists */
140 for(iidx=0; iidx<nri; iidx++)
141 {
142 /* Load shift vector for this list */
143 i_shift_offset = DIM3*shiftidx[iidx];
144
145 /* Load limits for loop over neighbors */
146 j_index_start = jindex[iidx];
147 j_index_end = jindex[iidx+1];
148
149 /* Get outer coordinate index */
150 inr = iinr[iidx];
151 i_coord_offset = DIM3*inr;
152
153 /* Load i particle coords and add shift vector */
154 gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
155
156 fix0 = _mm_setzero_ps();
157 fiy0 = _mm_setzero_ps();
158 fiz0 = _mm_setzero_ps();
159
160 /* Load parameters for i particles */
161 iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
162 isai0 = _mm_load1_ps(invsqrta+inr+0);
163
164 /* Reset potential sums */
165 velecsum = _mm_setzero_ps();
166 vgbsum = _mm_setzero_ps();
167 dvdasum = _mm_setzero_ps();
168
169 /* Start inner kernel loop */
170 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
171 {
172
173 /* Get j neighbor index, and coordinate index */
174 jnrA = jjnr[jidx];
175 jnrB = jjnr[jidx+1];
176 jnrC = jjnr[jidx+2];
177 jnrD = jjnr[jidx+3];
178 j_coord_offsetA = DIM3*jnrA;
179 j_coord_offsetB = DIM3*jnrB;
180 j_coord_offsetC = DIM3*jnrC;
181 j_coord_offsetD = DIM3*jnrD;
182
183 /* load j atom coordinates */
184 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
185 x+j_coord_offsetC,x+j_coord_offsetD,
186 &jx0,&jy0,&jz0);
187
188 /* Calculate displacement vector */
189 dx00 = _mm_sub_ps(ix0,jx0);
190 dy00 = _mm_sub_ps(iy0,jy0);
191 dz00 = _mm_sub_ps(iz0,jz0);
192
193 /* Calculate squared distance and things based on it */
194 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
195
196 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
197
198 /* Load parameters for j particles */
199 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
200 charge+jnrC+0,charge+jnrD+0);
201 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
202 invsqrta+jnrC+0,invsqrta+jnrD+0);
203
204 /**************************
205 * CALCULATE INTERACTIONS *
206 **************************/
207
208 r00 = _mm_mul_ps(rsq00,rinv00);
209
210 /* Compute parameters for interactions between i and j atoms */
211 qq00 = _mm_mul_ps(iq0,jq0);
212
213 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
214 isaprod = _mm_mul_ps(isai0,isaj0);
215 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
216 gbscale = _mm_mul_ps(isaprod,gbtabscale);
217
218 /* Calculate generalized born table index - this is a separate table from the normal one,
219 * but we use the same procedure by multiplying r with scale and truncating to integer.
220 */
221 rt = _mm_mul_ps(r00,gbscale);
222 gbitab = _mm_cvttps_epi32(rt);
223 gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
224 gbitab = _mm_slli_epi32(gbitab,2);
225 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(0) &
3];}))
);
226 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(1) &
3];}))
);
227 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(2) &
3];}))
);
228 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(3) &
3];}))
);
229 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
230 Heps = _mm_mul_ps(gbeps,H);
231 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
232 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
233 vgb = _mm_mul_ps(gbqqfactor,VV);
234
235 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
236 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
237 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
238 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
239 fjptrA = dvda+jnrA;
240 fjptrB = dvda+jnrB;
241 fjptrC = dvda+jnrC;
242 fjptrD = dvda+jnrD;
243 gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
244 velec = _mm_mul_ps(qq00,rinv00);
245 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
246
247 /* Update potential sum for this i atom from the interaction with this j atom. */
248 velecsum = _mm_add_ps(velecsum,velec);
249 vgbsum = _mm_add_ps(vgbsum,vgb);
250
251 fscal = felec;
252
253 /* Calculate temporary vectorial force */
254 tx = _mm_mul_ps(fscal,dx00);
255 ty = _mm_mul_ps(fscal,dy00);
256 tz = _mm_mul_ps(fscal,dz00);
257
258 /* Update vectorial force */
259 fix0 = _mm_add_ps(fix0,tx);
260 fiy0 = _mm_add_ps(fiy0,ty);
261 fiz0 = _mm_add_ps(fiz0,tz);
262
263 fjptrA = f+j_coord_offsetA;
264 fjptrB = f+j_coord_offsetB;
265 fjptrC = f+j_coord_offsetC;
266 fjptrD = f+j_coord_offsetD;
267 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
268
269 /* Inner loop uses 58 flops */
270 }
271
272 if(jidx<j_index_end)
273 {
274
275 /* Get j neighbor index, and coordinate index */
276 jnrlistA = jjnr[jidx];
277 jnrlistB = jjnr[jidx+1];
278 jnrlistC = jjnr[jidx+2];
279 jnrlistD = jjnr[jidx+3];
280 /* Sign of each element will be negative for non-real atoms.
281 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
282 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
283 */
284 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
285 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
286 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
287 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
288 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
289 j_coord_offsetA = DIM3*jnrA;
290 j_coord_offsetB = DIM3*jnrB;
291 j_coord_offsetC = DIM3*jnrC;
292 j_coord_offsetD = DIM3*jnrD;
293
294 /* load j atom coordinates */
295 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
296 x+j_coord_offsetC,x+j_coord_offsetD,
297 &jx0,&jy0,&jz0);
298
299 /* Calculate displacement vector */
300 dx00 = _mm_sub_ps(ix0,jx0);
301 dy00 = _mm_sub_ps(iy0,jy0);
302 dz00 = _mm_sub_ps(iz0,jz0);
303
304 /* Calculate squared distance and things based on it */
305 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
306
307 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
308
309 /* Load parameters for j particles */
310 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
311 charge+jnrC+0,charge+jnrD+0);
312 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
313 invsqrta+jnrC+0,invsqrta+jnrD+0);
314
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
318
319 r00 = _mm_mul_ps(rsq00,rinv00);
320 r00 = _mm_andnot_ps(dummy_mask,r00);
321
322 /* Compute parameters for interactions between i and j atoms */
323 qq00 = _mm_mul_ps(iq0,jq0);
324
325 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
326 isaprod = _mm_mul_ps(isai0,isaj0);
327 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
328 gbscale = _mm_mul_ps(isaprod,gbtabscale);
329
330 /* Calculate generalized born table index - this is a separate table from the normal one,
331 * but we use the same procedure by multiplying r with scale and truncating to integer.
332 */
333 rt = _mm_mul_ps(r00,gbscale);
334 gbitab = _mm_cvttps_epi32(rt);
335 gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
336 gbitab = _mm_slli_epi32(gbitab,2);
337 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(0) &
3];}))
);
338 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(1) &
3];}))
);
339 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(2) &
3];}))
);
340 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(3) &
3];}))
);
341 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
342 Heps = _mm_mul_ps(gbeps,H);
343 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
344 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
345 vgb = _mm_mul_ps(gbqqfactor,VV);
346
347 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
348 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
349 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
350 dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp);
351 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
352 /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
353 fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
354 fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
355 fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
356 fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
357 gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
358 velec = _mm_mul_ps(qq00,rinv00);
359 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
360
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velec = _mm_andnot_ps(dummy_mask,velec);
363 velecsum = _mm_add_ps(velecsum,velec);
364 vgb = _mm_andnot_ps(dummy_mask,vgb);
365 vgbsum = _mm_add_ps(vgbsum,vgb);
366
367 fscal = felec;
368
369 fscal = _mm_andnot_ps(dummy_mask,fscal);
370
371 /* Calculate temporary vectorial force */
372 tx = _mm_mul_ps(fscal,dx00);
373 ty = _mm_mul_ps(fscal,dy00);
374 tz = _mm_mul_ps(fscal,dz00);
375
376 /* Update vectorial force */
377 fix0 = _mm_add_ps(fix0,tx);
378 fiy0 = _mm_add_ps(fiy0,ty);
379 fiz0 = _mm_add_ps(fiz0,tz);
380
381 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
382 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
383 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
384 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
385 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
386
387 /* Inner loop uses 59 flops */
388 }
389
390 /* End of innermost loop */
391
392 gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
393 f+i_coord_offset,fshift+i_shift_offset);
394
395 ggid = gid[iidx];
396 /* Update potential energies */
397 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
398 gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
399 dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
400 gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
401
402 /* Increment number of inner iterations */
403 inneriter += j_index_end - j_index_start;
404
405 /* Outer loop uses 9 flops */
406 }
407
408 /* Increment number of outer iterations */
409 outeriter += nri;
410
411 /* Update outer/inner flops */
412
413 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*59)(nrnb)->n[eNR_NBKERNEL_ELEC_VF] += outeriter*9 + inneriter
*59
;
414}
415/*
416 * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single
417 * Electrostatics interaction: GeneralizedBorn
418 * VdW interaction: None
419 * Geometry: Particle-Particle
420 * Calculate force/pot: Force
421 */
422void
423nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single
424 (t_nblist * gmx_restrict nlist,
425 rvec * gmx_restrict xx,
426 rvec * gmx_restrict ff,
427 t_forcerec * gmx_restrict fr,
428 t_mdatoms * gmx_restrict mdatoms,
429 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
430 t_nrnb * gmx_restrict nrnb)
431{
432 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
433 * just 0 for non-waters.
434 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
435 * jnr indices corresponding to data put in the four positions in the SIMD register.
436 */
437 int i_shift_offset,i_coord_offset,outeriter,inneriter;
438 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
439 int jnrA,jnrB,jnrC,jnrD;
440 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
441 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
442 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
443 real rcutoff_scalar;
444 real *shiftvec,*fshift,*x,*f;
445 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
446 real scratch[4*DIM3];
447 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
448 int vdwioffset0;
449 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
450 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
451 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
452 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
453 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
454 real *charge;
455 __m128i gbitab;
456 __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
457 __m128 minushalf = _mm_set1_ps(-0.5);
458 real *invsqrta,*dvda,*gbtab;
459 __m128i vfitab;
460 __m128i ifour = _mm_set1_epi32(4);
461 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
462 real *vftab;
463 __m128 dummy_mask,cutoff_mask;
464 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
465 __m128 one = _mm_set1_ps(1.0);
466 __m128 two = _mm_set1_ps(2.0);
467 x = xx[0];
468 f = ff[0];
469
470 nri = nlist->nri;
471 iinr = nlist->iinr;
472 jindex = nlist->jindex;
473 jjnr = nlist->jjnr;
474 shiftidx = nlist->shift;
475 gid = nlist->gid;
476 shiftvec = fr->shift_vec[0];
477 fshift = fr->fshift[0];
478 facel = _mm_set1_ps(fr->epsfac);
479 charge = mdatoms->chargeA;
480
481 invsqrta = fr->invsqrta;
482 dvda = fr->dvda;
483 gbtabscale = _mm_set1_ps(fr->gbtab.scale);
484 gbtab = fr->gbtab.data;
485 gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
486
487 /* Avoid stupid compiler warnings */
488 jnrA = jnrB = jnrC = jnrD = 0;
489 j_coord_offsetA = 0;
490 j_coord_offsetB = 0;
491 j_coord_offsetC = 0;
492 j_coord_offsetD = 0;
493
494 outeriter = 0;
495 inneriter = 0;
496
497 for(iidx=0;iidx<4*DIM3;iidx++)
498 {
499 scratch[iidx] = 0.0;
500 }
501
502 /* Start outer loop over neighborlists */
503 for(iidx=0; iidx<nri; iidx++)
504 {
505 /* Load shift vector for this list */
506 i_shift_offset = DIM3*shiftidx[iidx];
507
508 /* Load limits for loop over neighbors */
509 j_index_start = jindex[iidx];
510 j_index_end = jindex[iidx+1];
511
512 /* Get outer coordinate index */
513 inr = iinr[iidx];
514 i_coord_offset = DIM3*inr;
515
516 /* Load i particle coords and add shift vector */
517 gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
518
519 fix0 = _mm_setzero_ps();
520 fiy0 = _mm_setzero_ps();
521 fiz0 = _mm_setzero_ps();
522
523 /* Load parameters for i particles */
524 iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
525 isai0 = _mm_load1_ps(invsqrta+inr+0);
526
527 dvdasum = _mm_setzero_ps();
528
529 /* Start inner kernel loop */
530 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
531 {
532
533 /* Get j neighbor index, and coordinate index */
534 jnrA = jjnr[jidx];
535 jnrB = jjnr[jidx+1];
536 jnrC = jjnr[jidx+2];
537 jnrD = jjnr[jidx+3];
538 j_coord_offsetA = DIM3*jnrA;
539 j_coord_offsetB = DIM3*jnrB;
540 j_coord_offsetC = DIM3*jnrC;
541 j_coord_offsetD = DIM3*jnrD;
542
543 /* load j atom coordinates */
544 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
545 x+j_coord_offsetC,x+j_coord_offsetD,
546 &jx0,&jy0,&jz0);
547
548 /* Calculate displacement vector */
549 dx00 = _mm_sub_ps(ix0,jx0);
550 dy00 = _mm_sub_ps(iy0,jy0);
551 dz00 = _mm_sub_ps(iz0,jz0);
552
553 /* Calculate squared distance and things based on it */
554 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
555
556 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
557
558 /* Load parameters for j particles */
559 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
560 charge+jnrC+0,charge+jnrD+0);
561 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
562 invsqrta+jnrC+0,invsqrta+jnrD+0);
563
564 /**************************
565 * CALCULATE INTERACTIONS *
566 **************************/
567
568 r00 = _mm_mul_ps(rsq00,rinv00);
569
570 /* Compute parameters for interactions between i and j atoms */
571 qq00 = _mm_mul_ps(iq0,jq0);
572
573 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
574 isaprod = _mm_mul_ps(isai0,isaj0);
575 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
576 gbscale = _mm_mul_ps(isaprod,gbtabscale);
577
578 /* Calculate generalized born table index - this is a separate table from the normal one,
579 * but we use the same procedure by multiplying r with scale and truncating to integer.
580 */
581 rt = _mm_mul_ps(r00,gbscale);
582 gbitab = _mm_cvttps_epi32(rt);
583 gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
584 gbitab = _mm_slli_epi32(gbitab,2);
585 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(0) &
3];}))
);
586 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(1) &
3];}))
);
587 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(2) &
3];}))
);
588 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(3) &
3];}))
);
589 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
590 Heps = _mm_mul_ps(gbeps,H);
591 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
592 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
593 vgb = _mm_mul_ps(gbqqfactor,VV);
594
595 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
596 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
597 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
598 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
599 fjptrA = dvda+jnrA;
600 fjptrB = dvda+jnrB;
601 fjptrC = dvda+jnrC;
602 fjptrD = dvda+jnrD;
603 gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
604 velec = _mm_mul_ps(qq00,rinv00);
605 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
606
607 fscal = felec;
608
609 /* Calculate temporary vectorial force */
610 tx = _mm_mul_ps(fscal,dx00);
611 ty = _mm_mul_ps(fscal,dy00);
612 tz = _mm_mul_ps(fscal,dz00);
613
614 /* Update vectorial force */
615 fix0 = _mm_add_ps(fix0,tx);
616 fiy0 = _mm_add_ps(fiy0,ty);
617 fiz0 = _mm_add_ps(fiz0,tz);
618
619 fjptrA = f+j_coord_offsetA;
620 fjptrB = f+j_coord_offsetB;
621 fjptrC = f+j_coord_offsetC;
622 fjptrD = f+j_coord_offsetD;
623 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
624
625 /* Inner loop uses 56 flops */
626 }
627
628 if(jidx<j_index_end)
629 {
630
631 /* Get j neighbor index, and coordinate index */
632 jnrlistA = jjnr[jidx];
633 jnrlistB = jjnr[jidx+1];
634 jnrlistC = jjnr[jidx+2];
635 jnrlistD = jjnr[jidx+3];
636 /* Sign of each element will be negative for non-real atoms.
637 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
638 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
639 */
640 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
641 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
642 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
643 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
644 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
645 j_coord_offsetA = DIM3*jnrA;
646 j_coord_offsetB = DIM3*jnrB;
647 j_coord_offsetC = DIM3*jnrC;
648 j_coord_offsetD = DIM3*jnrD;
649
650 /* load j atom coordinates */
651 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
652 x+j_coord_offsetC,x+j_coord_offsetD,
653 &jx0,&jy0,&jz0);
654
655 /* Calculate displacement vector */
656 dx00 = _mm_sub_ps(ix0,jx0);
657 dy00 = _mm_sub_ps(iy0,jy0);
658 dz00 = _mm_sub_ps(iz0,jz0);
659
660 /* Calculate squared distance and things based on it */
661 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
662
663 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
664
665 /* Load parameters for j particles */
666 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
667 charge+jnrC+0,charge+jnrD+0);
668 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
669 invsqrta+jnrC+0,invsqrta+jnrD+0);
670
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
674
675 r00 = _mm_mul_ps(rsq00,rinv00);
676 r00 = _mm_andnot_ps(dummy_mask,r00);
677
678 /* Compute parameters for interactions between i and j atoms */
679 qq00 = _mm_mul_ps(iq0,jq0);
680
681 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
682 isaprod = _mm_mul_ps(isai0,isaj0);
683 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
684 gbscale = _mm_mul_ps(isaprod,gbtabscale);
685
686 /* Calculate generalized born table index - this is a separate table from the normal one,
687 * but we use the same procedure by multiplying r with scale and truncating to integer.
688 */
689 rt = _mm_mul_ps(r00,gbscale);
690 gbitab = _mm_cvttps_epi32(rt);
691 gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
692 gbitab = _mm_slli_epi32(gbitab,2);
693 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(0) &
3];}))
);
694 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(1) &
3];}))
);
695 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(2) &
3];}))
);
696 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3)(__extension__ ({ __v4si __a = (__v4si)(gbitab); __a[(3) &
3];}))
);
697 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
698 Heps = _mm_mul_ps(gbeps,H);
699 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
700 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
701 vgb = _mm_mul_ps(gbqqfactor,VV);
702
703 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
704 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
705 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
706 dvdatmp = _mm_andnot_ps(dummy_mask,dvdatmp);
707 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
708 /* The pointers to scratch make sure that this code with compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
709 fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
710 fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
711 fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
712 fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
713 gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
714 velec = _mm_mul_ps(qq00,rinv00);
715 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
716
717 fscal = felec;
718
719 fscal = _mm_andnot_ps(dummy_mask,fscal);
720
721 /* Calculate temporary vectorial force */
722 tx = _mm_mul_ps(fscal,dx00);
723 ty = _mm_mul_ps(fscal,dy00);
724 tz = _mm_mul_ps(fscal,dz00);
725
726 /* Update vectorial force */
727 fix0 = _mm_add_ps(fix0,tx);
728 fiy0 = _mm_add_ps(fiy0,ty);
729 fiz0 = _mm_add_ps(fiz0,tz);
730
731 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
732 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
733 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
734 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
735 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
736
737 /* Inner loop uses 57 flops */
738 }
739
740 /* End of innermost loop */
741
742 gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
743 f+i_coord_offset,fshift+i_shift_offset);
744
745 dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
746 gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
747
748 /* Increment number of inner iterations */
749 inneriter += j_index_end - j_index_start;
750
751 /* Outer loop uses 7 flops */
752 }
753
754 /* Increment number of outer iterations */
755 outeriter += nri;
756
757 /* Update outer/inner flops */
758
759 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*57)(nrnb)->n[eNR_NBKERNEL_ELEC_F] += outeriter*7 + inneriter*
57
;
760}