Bug Summary

File: gromacs/mdlib/nbnxn_search.c
Location: line 4095, column 39
Description: Array access (from variable 'bb') results in a null pointer dereference
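The report flags a read through 'bb' on a path where the pointer can still be NULL. As a rough illustration only (the guard and its message below are hypothetical, not the project's actual fix), a defensive check before the access could look like:

    /* Hypothetical guard; grid->bb is normally allocated in
     * set_grid_size_xy(), so reaching this point with bb == NULL would
     * indicate an inconsistent simple/supersub grid setup. */
    if (bb == NULL)
    {
        gmx_incons("bounding-box array used before allocation");
    }
    /* ... reads of bb[ci].lower[...] / bb[ci].upper[...] are safe below ... */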

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35
36#ifdef HAVE_CONFIG_H
37#include <config.h>
38#endif
39
40#include <math.h>
41#include <string.h>
42#include <assert.h>
43
44#include "gromacs/utility/smalloc.h"
45#include "types/commrec.h"
46#include "macros.h"
47#include "gromacs/math/utilities.h"
48#include "gromacs/math/vec.h"
49#include "pbc.h"
50#include "nbnxn_consts.h"
51/* nbnxn_internal.h includes gromacs/simd/macros.h */
52#include "nbnxn_internal.h"
53#ifdef GMX_NBNXN_SIMD
54#include "gromacs/simd/vector_operations.h"
55#endif
56#include "nbnxn_atomdata.h"
57#include "nbnxn_search.h"
58#include "gmx_omp_nthreads.h"
59#include "nrnb.h"
60#include "ns.h"
61
62#include "gromacs/fileio/gmxfio.h"
63
64#ifdef NBNXN_SEARCH_BB_SIMD4
65/* Always use 4-wide SIMD for bounding box calculations */
66
67# ifndef GMX_DOUBLE
68/* Single precision BBs + coordinates, we can also load coordinates with SIMD */
69# define NBNXN_SEARCH_SIMD4_FLOAT_X_BB
70# endif
71
72# if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
73/* Store bounding boxes with x, y and z coordinates in packs of 4 */
74# define NBNXN_PBB_SIMD4
75# endif
76
77/* The packed bounding box coordinate stride is always set to 4.
78 * With AVX we could use 8, but that turns out not to be faster.
79 */
80# define STRIDE_PBB 4
81# define STRIDE_PBB_2LOG 2
82
83#endif /* NBNXN_SEARCH_BB_SIMD4 */
84
85#ifdef GMX_NBNXN_SIMD
86
87/* The functions below are macros as they are performance sensitive */
88
89/* 4x4 list, pack=4: no complex conversion required */
90/* i-cluster to j-cluster conversion */
91#define CI_TO_CJ_J4(ci) (ci)
92/* cluster index to coordinate array index conversion */
93#define X_IND_CI_J4(ci) ((ci)*STRIDE_P4)
94#define X_IND_CJ_J4(cj) ((cj)*STRIDE_P4)
95
96/* 4x2 list, pack=4: j-cluster size is half the packing width */
97/* i-cluster to j-cluster conversion */
98#define CI_TO_CJ_J2(ci) ((ci)<<1)
99/* cluster index to coordinate array index conversion */
100#define X_IND_CI_J2(ci) ((ci)*STRIDE_P4)
101#define X_IND_CJ_J2(cj) (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
102
103/* 4x8 list, pack=8: i-cluster size is half the packing width */
104/* i-cluster to j-cluster conversion */
105#define CI_TO_CJ_J8(ci) ((ci)>>1)
106/* cluster index to coordinate array index conversion */
107#define X_IND_CI_J8(ci) (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
108#define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
109
110/* The j-cluster size is matched to the SIMD width */
111#if GMX_SIMD_REAL_WIDTH == 2
112#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci)
113#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci)
114#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj)
115#else
116#if GMX_SIMD_REAL_WIDTH == 4
117#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
118#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
119#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
120#else
121#if GMX_SIMD_REAL_WIDTH == 8
122#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci)
123#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci)
124#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj)
125/* Half SIMD with j-cluster size */
126#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
127#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
128#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
129#else
130#if GMX_SIMD_REAL_WIDTH == 16
131#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
132#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
133#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
134#else
135#error "unsupported GMX_SIMD_REAL_WIDTH"
136#endif
137#endif
138#endif
139#endif
140
141#endif /* GMX_NBNXN_SIMD */
142
143
144#ifdef NBNXN_SEARCH_BB_SIMD4
145/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
146#define NBNXN_BBXXXX
147/* Size of bounding box corners quadruplet */
148#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_PBB)
149#endif
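As an illustration of this xxxxyyyyzzzz layout (a hypothetical accessor, not part of the file): with STRIDE_PBB == 4, component c of box b within a quadruplet sits at index c*STRIDE_PBB + b, matching the stores in calc_bounding_box_xxxx() further down.

    /* Illustrative only: components 0-2 are the lower x,y,z corners of
     * four consecutive boxes, components 3-5 the upper ones. */
    static float pbb_get(const float *pbb, int component, int box)
    {
        return pbb[component*4 + box];   /* 4 == STRIDE_PBB */
    }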
150
151/* We shift the i-particles backward for PBC.
152 * This leads to more conditionals than shifting forward.
153 * We do this to get more balanced pair lists.
154 */
155#define NBNXN_SHIFT_BACKWARD
156
157
158/* This define is a lazy way to avoid interdependence of the grid
159 * and searching data structures.
160 */
161#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
162
163
164static void nbs_cycle_clear(nbnxn_cycle_t *cc)
165{
166 int i;
167
168 for (i = 0; i < enbsCCnr; i++)
169 {
170 cc[i].count = 0;
171 cc[i].c = 0;
172 }
173}
174
175static double Mcyc_av(const nbnxn_cycle_t *cc)
176{
177 return (double)cc->c*1e-6/cc->count;
178}
179
180static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
181{
182 int n;
183 int t;
184
185 fprintf(fp, "\n");
186 fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
187 nbs->cc[enbsCCgrid].count,
188 Mcyc_av(&nbs->cc[enbsCCgrid]),
189 Mcyc_av(&nbs->cc[enbsCCsearch]),
190 Mcyc_av(&nbs->cc[enbsCCreducef]));
191
192 if (nbs->nthread_max > 1)
193 {
194 if (nbs->cc[enbsCCcombine].count > 0)
195 {
196 fprintf(fp, " comb %5.2f",
197 Mcyc_av(&nbs->cc[enbsCCcombine]));
198 }
199 fprintf(fp, " s. th");
200 for (t = 0; t < nbs->nthread_max; t++)
201 {
202 fprintf(fp, " %4.1f",
203 Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
204 }
205 }
206 fprintf(fp, "\n");
207}
208
209static void nbnxn_grid_init(nbnxn_grid_t * grid)
210{
211    grid->cxy_na     = NULL;
212    grid->cxy_ind    = NULL;
213    grid->cxy_nalloc = 0;
214    grid->bb         = NULL;
215    grid->bbj        = NULL;
216 grid->nc_nalloc = 0;
217}
218
219static int get_2log(int n)
220{
221 int log2;
222
223 log2 = 0;
224 while ((1<<log2) < n)
225 {
226 log2++;
227 }
228 if ((1<<log2) != n)
229 {
230        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
232
233 return log2;
234}
235
236static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
237{
238 switch (nb_kernel_type)
239 {
240 case nbnxnk4x4_PlainC:
241 case nbnxnk4xN_SIMD_4xN:
242 case nbnxnk4xN_SIMD_2xNN:
243            return NBNXN_CPU_CLUSTER_I_SIZE;
244 case nbnxnk8x8x8_CUDA:
245 case nbnxnk8x8x8_PlainC:
246 /* The cluster size for super/sub lists is only set here.
247 * Any value should work for the pair-search and atomdata code.
248 * The kernels, of course, might require a particular value.
249 */
250            return NBNXN_GPU_CLUSTER_SIZE;
251 default:
252            gmx_incons("unknown kernel type");
253 }
254
255 return 0;
256}
257
258int nbnxn_kernel_to_cj_size(int nb_kernel_type)
259{
260 int nbnxn_simd_width = 0;
261 int cj_size = 0;
262
263#ifdef GMX_NBNXN_SIMD
264    nbnxn_simd_width = GMX_SIMD_REAL_WIDTH;
265#endif
266
267 switch (nb_kernel_type)
268 {
269 case nbnxnk4x4_PlainC:
270            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
271 break;
272 case nbnxnk4xN_SIMD_4xN:
273 cj_size = nbnxn_simd_width;
274 break;
275 case nbnxnk4xN_SIMD_2xNN:
276 cj_size = nbnxn_simd_width/2;
277 break;
278 case nbnxnk8x8x8_CUDA:
279 case nbnxnk8x8x8_PlainC:
280 cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
281 break;
282 default:
283            gmx_incons("unknown kernel type");
284 }
285
286 return cj_size;
287}
288
289static int ci_to_cj(int na_cj_2log, int ci)
290{
291 switch (na_cj_2log)
292 {
293 case 2: return ci; break;
294 case 1: return (ci<<1); break;
295 case 3: return (ci>>1); break;
296 }
297
298 return 0;
299}
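A quick sanity check of this mapping (hypothetical standalone asserts, using the assert.h already included above): with 4-atom i-clusters, na_cj_2log selects the j-cluster size, so the same i-cluster maps to different j-cluster indices:

    assert(ci_to_cj(2, 5) == 5);   /* na_cj = 4: identical indexing      */
    assert(ci_to_cj(1, 5) == 10);  /* na_cj = 2: twice as many j-clusters */
    assert(ci_to_cj(3, 5) == 2);   /* na_cj = 8: half as many j-clusters  */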
300
301gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
302{
303 if (nb_kernel_type == nbnxnkNotSet)
304 {
305        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
306 }
307
308 switch (nb_kernel_type)
309 {
310 case nbnxnk8x8x8_CUDA:
311 case nbnxnk8x8x8_PlainC:
312            return FALSE;
313
314 case nbnxnk4x4_PlainC:
315 case nbnxnk4xN_SIMD_4xN:
316 case nbnxnk4xN_SIMD_2xNN:
317            return TRUE;
318
319 default:
320            gmx_incons("Invalid nonbonded kernel type passed!");
321            return FALSE;
322 }
323}
324
325/* Initializes a single nbnxn_pairlist_t data structure */
326static void nbnxn_init_pairlist_fep(t_nblist *nl)
327{
328 nl->type = GMX_NBLIST_INTERACTION_FREE_ENERGY;
329 nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
330    /* The interaction functions are set in the free energy kernel function */
331 nl->ivdw = -1;
332 nl->ivdwmod = -1;
333 nl->ielec = -1;
334 nl->ielecmod = -1;
335
336 nl->maxnri = 0;
337 nl->maxnrj = 0;
338 nl->nri = 0;
339 nl->nrj = 0;
340    nl->iinr       = NULL;
341    nl->gid        = NULL;
342    nl->shift      = NULL;
343    nl->jindex     = NULL;
344    nl->jjnr       = NULL;
345    nl->excl_fep   = NULL;
346
347}
348
349void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
350 ivec *n_dd_cells,
351 gmx_domdec_zones_t *zones,
352 gmx_bool bFEP,
353 int nthread_max)
354{
355 nbnxn_search_t nbs;
356 int d, g, t;
357
358    snew(nbs, 1);
359 *nbs_ptr = nbs;
360
361 nbs->bFEP = bFEP;
362
363    nbs->DomDec = (n_dd_cells != NULL);
364
365 clear_ivec(nbs->dd_dim);
366 nbs->ngrid = 1;
367 if (nbs->DomDec)
368 {
369 nbs->zones = zones;
370
371        for (d = 0; d < DIM; d++)
372 {
373 if ((*n_dd_cells)[d] > 1)
374 {
375 nbs->dd_dim[d] = 1;
376 /* Each grid matches a DD zone */
377 nbs->ngrid *= 2;
378 }
379 }
380 }
381
382    snew(nbs->grid, nbs->ngrid);
383 for (g = 0; g < nbs->ngrid; g++)
384 {
385 nbnxn_grid_init(&nbs->grid[g]);
386 }
387    nbs->cell        = NULL;
388 nbs->cell_nalloc = 0;
389    nbs->a           = NULL;
390 nbs->a_nalloc = 0;
391
392 nbs->nthread_max = nthread_max;
393
394 /* Initialize the work data structures for each thread */
395    snew(nbs->work, nbs->nthread_max);
396 for (t = 0; t < nbs->nthread_max; t++)
397 {
398        nbs->work[t].cxy_na           = NULL;
399 nbs->work[t].cxy_na_nalloc = 0;
400        nbs->work[t].sort_work        = NULL;
401 nbs->work[t].sort_work_nalloc = 0;
402
403        snew(nbs->work[t].nbl_fep, 1);
404 nbnxn_init_pairlist_fep(nbs->work[t].nbl_fep);
405 }
406
407 /* Initialize detailed nbsearch cycle counting */
408 nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
409 nbs->search_count = 0;
410 nbs_cycle_clear(nbs->cc);
411 for (t = 0; t < nbs->nthread_max; t++)
412 {
413 nbs_cycle_clear(nbs->work[t].cc);
414 }
415}
416
417static real grid_atom_density(int n, rvec corner0, rvec corner1)
418{
419 rvec size;
420
421 rvec_sub(corner1, corner0, size);
422
423    return n/(size[XX]*size[YY]*size[ZZ]);
424}
425
426static int set_grid_size_xy(const nbnxn_search_t nbs,
427 nbnxn_grid_t *grid,
428 int dd_zone,
429 int n, rvec corner0, rvec corner1,
430 real atom_density)
431{
432 rvec size;
433 int na_c;
434 real adens, tlen, tlen_x, tlen_y, nc_max;
435 int t;
436
437 rvec_sub(corner1, corner0, size);
438
439 if (n > grid->na_sc)
440 {
441 /* target cell length */
442 if (grid->bSimple)
443 {
444 /* To minimize the zero interactions, we should make
445 * the largest of the i/j cell cubic.
446 */
447            na_c = max(grid->na_c, grid->na_cj);
448
449 /* Approximately cubic cells */
450 tlen = pow(na_c/atom_density, 1.0/3.0);
451 tlen_x = tlen;
452 tlen_y = tlen;
453 }
454 else
455 {
456 /* Approximately cubic sub cells */
457 tlen = pow(grid->na_c/atom_density, 1.0/3.0);
458            tlen_x = tlen*GPU_NSUBCELL_X;
459            tlen_y = tlen*GPU_NSUBCELL_Y;
460 }
461        /* We round ncx and ncy down, because we get fewer cell pairs
462         * in the pair list when the fixed cell dimensions (x,y) are
463         * larger than the variable one (z) than the other way around.
464         */
465        grid->ncx = max(1, (int)(size[XX]/tlen_x));
466        grid->ncy = max(1, (int)(size[YY]/tlen_y));
467 }
468 else
469 {
470 grid->ncx = 1;
471 grid->ncy = 1;
472 }
473
474    grid->sx     = size[XX]/grid->ncx;
475    grid->sy     = size[YY]/grid->ncy;
476 grid->inv_sx = 1/grid->sx;
477 grid->inv_sy = 1/grid->sy;
478
479 if (dd_zone > 0)
480 {
481 /* This is a non-home zone, add an extra row of cells
482 * for particles communicated for bonded interactions.
483 * These can be beyond the cut-off. It doesn't matter where
484 * they end up on the grid, but for performance it's better
485 * if they don't end up in cells that can be within cut-off range.
486 */
487 grid->ncx++;
488 grid->ncy++;
489 }
490
491 /* We need one additional cell entry for particles moved by DD */
492 if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
493 {
494        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
495        srenew(grid->cxy_na, grid->cxy_nalloc);
496        srenew(grid->cxy_ind, grid->cxy_nalloc+1);
497 }
498 for (t = 0; t < nbs->nthread_max; t++)
499 {
500 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
501 {
502            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
503            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
504 }
505 }
506
507 /* Worst case scenario of 1 atom in each last cell */
508 if (grid->na_cj <= grid->na_c)
509 {
510 nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
511 }
512 else
513 {
514 nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
515 }
516
517 if (nc_max > grid->nc_nalloc)
518 {
519        grid->nc_nalloc = over_alloc_large(nc_max);
520        srenew(grid->nsubc, grid->nc_nalloc);
521        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
522
523        sfree_aligned(grid->bb);
524        /* This snew also zeros the contents, which avoids possible
525         * floating-point exceptions in SIMD with the unused bb elements.
526         */
527 if (grid->bSimple)
528 {
529            snew_aligned(grid->bb, grid->nc_nalloc, 16);
530 }
531 else
532 {
533#ifdef NBNXN_BBXXXX
534 int pbb_nalloc;
535
536            pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
537            snew_aligned(grid->pbb, pbb_nalloc, 16);
538#else
539            snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);
540#endif
541 }
542
543 if (grid->bSimple)
544 {
545 if (grid->na_cj == grid->na_c)
546 {
547 grid->bbj = grid->bb;
548 }
549 else
550 {
551                sfree_aligned(grid->bbj);
552                snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
553 }
554 }
555
556        srenew(grid->flags, grid->nc_nalloc);
557        if (nbs->bFEP)
558        {
559            srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
560 }
561 }
562
563 copy_rvec(corner0, grid->c0);
564 copy_rvec(corner1, grid->c1);
565
566 return nc_max;
567}
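To make the cell sizing above concrete, a hypothetical back-of-the-envelope calculation (numbers invented, not from the file):

    /* 4 atoms per cell at a density of 100 atoms/nm^3 gives a target
     * cell length of (4/100)^(1/3), roughly 0.34 nm. */
    double tlen = pow(4/100.0, 1.0/3.0);  /* ~0.342 nm                    */
    int    ncx  = (int)(3.4/tlen);        /* a 3.4 nm box edge gives ~9-10
                                           * columns; the code clamps the
                                           * result to at least 1         */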
568
569/* We need to sort particles in grid columns on z-coordinate.
570 * As particles are very often distributed homogeneously, we use a sorting
571 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
572 * by a factor, cast to an int and try to store in that hole. If the hole
573 * is full, we move this or another particle. A second pass is needed to make
574 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
575 * 4 is the optimal value for homogeneous particle distribution and allows
576 * for an O(#particles) sort up till distributions where all particles are
577 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
578 * as it can be expensive to detect inhomogeneous particle distributions.
579 * SGSF is the maximum ratio of holes used; in the worst case all particles
580 * end up in the last hole and we need #particles extra holes at the end.
581 */
582#define SORT_GRID_OVERSIZE 4
583#define SGSF (SORT_GRID_OVERSIZE + 1)
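A minimal standalone sketch of this pigeonhole idea (a hypothetical helper, simplified to plain floats in [0,1); the real sort_atoms() below additionally handles index arrays, range clamping and backwards output):

    #include <stdlib.h>

    static void pigeonhole_sort(float *v, int n)
    {
        const int oversize = 4;               /* cf. SORT_GRID_OVERSIZE          */
        int       nslot    = n*oversize + n;  /* cf. SGSF: all may hit last hole */
        float    *slot     = malloc(nslot*sizeof(*slot));
        char     *used     = calloc(nslot, 1);
        int       i, s, c  = 0;

        for (i = 0; i < n; i++)
        {
            float val = v[i];
            s = (int)(val*n*oversize);            /* target hole             */
            while (used[s] && slot[s] <= val)     /* probe past smaller ones */
            {
                s++;
            }
            while (used[s])                       /* shift larger entries up */
            {
                float tmp = slot[s]; slot[s] = val; val = tmp; s++;
            }
            slot[s] = val;
            used[s] = 1;
        }
        for (s = 0; s < nslot; s++)               /* second pass: compact    */
        {
            if (used[s]) { v[c++] = slot[s]; }
        }
        free(slot);
        free(used);
    }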
584
585/* Sort particle index a on coordinates x along dim.
586 * Backwards tells if we want decreasing iso increasing coordinates.
587 * h0 is the minimum of the coordinate range.
588 * invh is the 1/length of the sorting range.
589 * n_per_h (>=n) is the expected average number of particles per 1/invh
590 * sort is the sorting work array.
591 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
592 * or easier, allocate at least n*SGSF elements.
593 */
594static void sort_atoms(int dim, gmx_bool Backwards,
595                       int gmx_unused dd_zone,
596 int *a, int n, rvec *x,
597 real h0, real invh, int n_per_h,
598 int *sort)
599{
600 int nsort, i, c;
601 int zi, zim, zi_min, zi_max;
602 int cp, tmp;
603
604 if (n <= 1)
605 {
606 /* Nothing to do */
607 return;
608 }
609
610#ifndef NDEBUG
611 if (n > n_per_h)
612 {
613        gmx_incons("n > n_per_h");
614 }
615#endif
616
617 /* Transform the inverse range height into the inverse hole height */
618    invh *= n_per_h*SORT_GRID_OVERSIZE;
619
620 /* Set nsort to the maximum possible number of holes used.
621 * In worst case all n elements end up in the last bin.
622 */
623    nsort = n_per_h*SORT_GRID_OVERSIZE + n;
624
625 /* Determine the index range used, so we can limit it for the second pass */
626    zi_min = INT_MAX;
627 zi_max = -1;
628
629 /* Sort the particles using a simple index sort */
630 for (i = 0; i < n; i++)
631 {
632 /* The cast takes care of float-point rounding effects below zero.
633 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
634 * times the box height out of the box.
635 */
636 zi = (int)((x[a[i]][dim] - h0)*invh);
637
638#ifndef NDEBUG
639        /* As we can have rounding effects, we use > instead of >= here */
640        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
641        {
642            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
643                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
644                      n_per_h, SORT_GRID_OVERSIZE);
645        }
646#endif
647
648        /* In a non-local domain, particles communicated for bonded interactions
649         * can be far beyond the grid size, which is set by the non-bonded
650         * cut-off distance. We sort such particles into the last cell.
651         */
652        if (zi > n_per_h*SORT_GRID_OVERSIZE)
653        {
654            zi = n_per_h*SORT_GRID_OVERSIZE;
655        }
656
657 /* Ideally this particle should go in sort cell zi,
658 * but that might already be in use,
659 * in that case find the first empty cell higher up
660 */
661 if (sort[zi] < 0)
662 {
663 sort[zi] = a[i];
664            zi_min   = min(zi_min, zi);
665            zi_max   = max(zi_max, zi);
666 }
667 else
668 {
669 /* We have multiple atoms in the same sorting slot.
670 * Sort on real z for minimal bounding box size.
671 * There is an extra check for identical z to ensure
672 * well-defined output order, independent of input order
673 * to ensure binary reproducibility after restarts.
674 */
675 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
676 (x[a[i]][dim] == x[sort[zi]][dim] &&
677 a[i] > sort[zi])))
678 {
679 zi++;
680 }
681
682 if (sort[zi] >= 0)
683 {
684 /* Shift all elements by one slot until we find an empty slot */
685 cp = sort[zi];
686 zim = zi + 1;
687 while (sort[zim] >= 0)
688 {
689 tmp = sort[zim];
690 sort[zim] = cp;
691 cp = tmp;
692 zim++;
693 }
694 sort[zim] = cp;
695                zi_max    = max(zi_max, zim);
696 }
697 sort[zi] = a[i];
698            zi_max   = max(zi_max, zi);
699 }
700 }
701
702 c = 0;
703 if (!Backwards)
704 {
705 for (zi = 0; zi < nsort; zi++)
706 {
707 if (sort[zi] >= 0)
708 {
709 a[c++] = sort[zi];
710 sort[zi] = -1;
711 }
712 }
713 }
714 else
715 {
716 for (zi = zi_max; zi >= zi_min; zi--)
717 {
718 if (sort[zi] >= 0)
719 {
720 a[c++] = sort[zi];
721 sort[zi] = -1;
722 }
723 }
724 }
725 if (c < n)
726 {
727        gmx_incons("Lost particles while sorting");
728 }
729}
730
731#ifdef GMX_DOUBLE
732#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
733#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
734#else
735#define R2F_D(x) (x)
736#define R2F_U(x) (x)
737#endif
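The point of these macros (an illustrative check, not from the file): in double precision, bounds are nudged outward before the double-to-float conversion, so a bounding box can only get looser, never tighter:

    /* A lower corner must never round up and an upper corner never down,
     * otherwise an atom could end up outside its own bounding box. */
    double z = 1.23456789;
    assert((double)R2F_D(z) <= z);
    assert((double)R2F_U(z) >= z);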
738
739/* Coordinate order x,y,z, bb order xyz0 */
740static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
741{
742 int i, j;
743 real xl, xh, yl, yh, zl, zh;
744
745 i = 0;
746    xl = x[i+XX];
747    xh = x[i+XX];
748    yl = x[i+YY];
749    yh = x[i+YY];
750    zl = x[i+ZZ];
751    zh = x[i+ZZ];
752 i += stride;
753 for (j = 1; j < na; j++)
754 {
755        xl = min(xl, x[i+XX]);
756        xh = max(xh, x[i+XX]);
757        yl = min(yl, x[i+YY]);
758        yh = max(yh, x[i+YY]);
759        zl = min(zl, x[i+ZZ]);
760        zh = max(zh, x[i+ZZ]);
761 i += stride;
762 }
763 /* Note: possible double to float conversion here */
764    bb->lower[BB_X] = R2F_D(xl);
765    bb->lower[BB_Y] = R2F_D(yl);
766    bb->lower[BB_Z] = R2F_D(zl);
767    bb->upper[BB_X] = R2F_U(xh);
768    bb->upper[BB_Y] = R2F_U(yh);
769    bb->upper[BB_Z] = R2F_U(zh);
770}
771
772/* Packed coordinates, bb order xyz0 */
773static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
774{
775 int j;
776 real xl, xh, yl, yh, zl, zh;
777
778    xl = x[XX*PACK_X4];
779    xh = x[XX*PACK_X4];
780    yl = x[YY*PACK_X4];
781    yh = x[YY*PACK_X4];
782    zl = x[ZZ*PACK_X4];
783    zh = x[ZZ*PACK_X4];
784 for (j = 1; j < na; j++)
785 {
786        xl = min(xl, x[j+XX*PACK_X4]);
787        xh = max(xh, x[j+XX*PACK_X4]);
788        yl = min(yl, x[j+YY*PACK_X4]);
789        yh = max(yh, x[j+YY*PACK_X4]);
790        zl = min(zl, x[j+ZZ*PACK_X4]);
791        zh = max(zh, x[j+ZZ*PACK_X4]);
792 }
793 /* Note: possible double to float conversion here */
794    bb->lower[BB_X] = R2F_D(xl);
795    bb->lower[BB_Y] = R2F_D(yl);
796    bb->lower[BB_Z] = R2F_D(zl);
797    bb->upper[BB_X] = R2F_U(xh);
798    bb->upper[BB_Y] = R2F_U(yh);
799    bb->upper[BB_Z] = R2F_U(zh);
800}
801
802/* Packed coordinates, bb order xyz0 */
803static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
804{
805 int j;
806 real xl, xh, yl, yh, zl, zh;
807
808    xl = x[XX*PACK_X8];
809    xh = x[XX*PACK_X8];
810    yl = x[YY*PACK_X8];
811    yh = x[YY*PACK_X8];
812    zl = x[ZZ*PACK_X8];
813    zh = x[ZZ*PACK_X8];
814 for (j = 1; j < na; j++)
815 {
816        xl = min(xl, x[j+XX*PACK_X8]);
817        xh = max(xh, x[j+XX*PACK_X8]);
818        yl = min(yl, x[j+YY*PACK_X8]);
819        yh = max(yh, x[j+YY*PACK_X8]);
820        zl = min(zl, x[j+ZZ*PACK_X8]);
821        zh = max(zh, x[j+ZZ*PACK_X8]);
822 }
823 /* Note: possible double to float conversion here */
824    bb->lower[BB_X] = R2F_D(xl);
825    bb->lower[BB_Y] = R2F_D(yl);
826    bb->lower[BB_Z] = R2F_D(zl);
827    bb->upper[BB_X] = R2F_U(xh);
828    bb->upper[BB_Y] = R2F_U(yh);
829    bb->upper[BB_Z] = R2F_U(zh);
830}
831
832/* Packed coordinates, bb order xyz0 */
833static void calc_bounding_box_x_x4_halves(int na, const real *x,
834 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
835{
836    calc_bounding_box_x_x4(min(na, 2), x, bbj);
837
838 if (na > 2)
839 {
840        calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+1);
841 }
842 else
843 {
844 /* Set the "empty" bounding box to the same as the first one,
845 * so we don't need to treat special cases in the rest of the code.
846 */
847#ifdef NBNXN_SEARCH_BB_SIMD4
848        gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
849        gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));
850#else
851 bbj[1] = bbj[0];
852#endif
853 }
854
855#ifdef NBNXN_SEARCH_BB_SIMD4
856    gmx_simd4_store_f(&bb->lower[0],
857                      gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
858                                      gmx_simd4_load_f(&bbj[1].lower[0])));
859    gmx_simd4_store_f(&bb->upper[0],
860                      gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
861                                      gmx_simd4_load_f(&bbj[1].upper[0])));
862#else
863 {
864 int i;
865
866        for (i = 0; i < NNBSBB_C; i++)
867        {
868            bb->lower[i] = min(bbj[0].lower[i], bbj[1].lower[i]);
869            bb->upper[i] = max(bbj[0].upper[i], bbj[1].upper[i]);
870        }
871 }
872#endif
873}
874
875#ifdef NBNXN_SEARCH_BB_SIMD4
876
877/* Coordinate order xyz, bb order xxxxyyyyzzzz */
878static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
879{
880 int i, j;
881 real xl, xh, yl, yh, zl, zh;
882
883 i = 0;
884    xl = x[i+XX];
885    xh = x[i+XX];
886    yl = x[i+YY];
887    yh = x[i+YY];
888    zl = x[i+ZZ];
889    zh = x[i+ZZ];
890 i += stride;
891 for (j = 1; j < na; j++)
892 {
893        xl = min(xl, x[i+XX]);
894        xh = max(xh, x[i+XX]);
895        yl = min(yl, x[i+YY]);
896        yh = max(yh, x[i+YY]);
897        zl = min(zl, x[i+ZZ]);
898        zh = max(zh, x[i+ZZ]);
899 i += stride;
900 }
901 /* Note: possible double to float conversion here */
902    bb[0*STRIDE_PBB] = R2F_D(xl);
903    bb[1*STRIDE_PBB] = R2F_D(yl);
904    bb[2*STRIDE_PBB] = R2F_D(zl);
905    bb[3*STRIDE_PBB] = R2F_U(xh);
906    bb[4*STRIDE_PBB] = R2F_U(yh);
907    bb[5*STRIDE_PBB] = R2F_U(zh);
908}
909
910#endif /* NBNXN_SEARCH_BB_SIMD4 */
911
912#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
913
914/* Coordinate order xyz?, bb order xyz0 */
915static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
916{
917    gmx_simd4_float_t bb_0_S, bb_1_S;
918    gmx_simd4_float_t x_S;
919
920 int i;
921
922    bb_0_S = gmx_simd4_load_f(x);
923 bb_1_S = bb_0_S;
924
925 for (i = 1; i < na; i++)
926 {
927        x_S    = gmx_simd4_load_f(x+i*NNBSBB_C);
928        bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
929        bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);
930 }
931
932    gmx_simd4_store_f(&bb->lower[0], bb_0_S);
933    gmx_simd4_store_f(&bb->upper[0], bb_1_S);
934}
935
936/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
937static void calc_bounding_box_xxxx_simd4(int na, const float *x,
938 nbnxn_bb_t *bb_work_aligned,
939 real *bb)
940{
941 calc_bounding_box_simd4(na, x, bb_work_aligned);
942
943    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
944    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
945    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
946    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
947    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
948    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
949}
950
951#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
952
953
954/* Combines pairs of consecutive bounding boxes */
955static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
956{
957 int i, j, sc2, nc2, c2;
958
959 for (i = 0; i < grid->ncx*grid->ncy; i++)
960 {
961 /* Starting bb in a column is expected to be 2-aligned */
962 sc2 = grid->cxy_ind[i]>>1;
963 /* For odd numbers skip the last bb here */
964 nc2 = (grid->cxy_na[i]+3)>>(2+1);
965 for (c2 = sc2; c2 < sc2+nc2; c2++)
966 {
967#ifdef NBNXN_SEARCH_BB_SIMD4
968            gmx_simd4_float_t min_S, max_S;
969
970            min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
971                                    gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
972            max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
973                                    gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
974            gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
975            gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);
976#else
977            for (j = 0; j < NNBSBB_C; j++)
978            {
979                grid->bbj[c2].lower[j] = min(bb[c2*2+0].lower[j],
980                                             bb[c2*2+1].lower[j]);
981                grid->bbj[c2].upper[j] = max(bb[c2*2+0].upper[j],
982                                             bb[c2*2+1].upper[j]);
984#endif
985 }
986 if (((grid->cxy_na[i]+3)>>2) & 1)
987 {
988 /* The bb count in this column is odd: duplicate the last bb */
989            for (j = 0; j < NNBSBB_C; j++)
990 {
991 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
992 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
993 }
994 }
995 }
996}
997
998
999/* Prints the average bb size, used for debug output */
1000static void print_bbsizes_simple(FILE *fp,
1001 const nbnxn_search_t nbs,
1002 const nbnxn_grid_t *grid)
1003{
1004 int c, d;
1005 dvec ba;
1006
1007 clear_dvec(ba);
1008 for (c = 0; c < grid->nc; c++)
1009 {
1010        for (d = 0; d < DIM; d++)
1011 {
1012 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
1013 }
1014 }
1015 dsvmul(1.0/grid->nc, ba, ba);
1016
1017 fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1018            nbs->box[XX][XX]/grid->ncx,
1019            nbs->box[YY][YY]/grid->ncy,
1020            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
1021            ba[XX], ba[YY], ba[ZZ],
1022            ba[XX]*grid->ncx/nbs->box[XX][XX],
1023            ba[YY]*grid->ncy/nbs->box[YY][YY],
1024            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
1025}
1026
1027/* Prints the average bb size, used for debug output */
1028static void print_bbsizes_supersub(FILE *fp,
1029 const nbnxn_search_t nbs,
1030 const nbnxn_grid_t *grid)
1031{
1032 int ns, c, s;
1033 dvec ba;
1034
1035 clear_dvec(ba);
1036 ns = 0;
1037 for (c = 0; c < grid->nc; c++)
1038 {
1039#ifdef NBNXN_BBXXXX
1040        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
1041 {
1042 int cs_w, i, d;
1043
1044            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
1045            for (i = 0; i < STRIDE_PBB; i++)
1046 {
1047                for (d = 0; d < DIM; d++)
1048 {
1049 ba[d] +=
1050                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
1051                        grid->pbb[cs_w*NNBSBB_XXXX+ d   *STRIDE_PBB+i];
1052 }
1053 }
1054 }
1055#else
1056 for (s = 0; s < grid->nsubc[c]; s++)
1057 {
1058 int cs, d;
1059
1060            cs = c*GPU_NSUBCELL + s;
1061            for (d = 0; d < DIM; d++)
1062 {
1063 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
1064 }
1065 }
1066#endif
1067 ns += grid->nsubc[c];
1068 }
1069 dsvmul(1.0/ns, ba, ba);
1070
1071 fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1072            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
1073            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
1074            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
1075            ba[XX], ba[YY], ba[ZZ],
1076            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
1077            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
1078            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
1079}
1080
1081/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
1082 * Also sets interaction flags.
1083 */
1084void sort_on_lj(int na_c,
1085 int a0, int a1, const int *atinfo,
1086 int *order,
1087 int *flags)
1088{
1089 int subc, s, a, n1, n2, a_lj_max, i, j;
1090    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
1091    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
1092 gmx_bool haveQ, bFEP;
1093
1094 *flags = 0;
1095
1096 subc = 0;
1097 for (s = a0; s < a1; s += na_c)
1098 {
1099 /* Make lists for this (sub-)cell on atoms with and without LJ */
1100 n1 = 0;
1101 n2 = 0;
1102        haveQ    = FALSE;
1103        a_lj_max = -1;
1104        for (a = s; a < min(s+na_c, a1); a++)
1105        {
1106            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
1107
1108            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
1109 {
1110 sort1[n1++] = order[a];
1111 a_lj_max = a;
1112 }
1113 else
1114 {
1115 sort2[n2++] = order[a];
1116 }
1117 }
1118
1119 /* If we don't have atoms with LJ, there's nothing to sort */
1120 if (n1 > 0)
1121 {
1122            *flags |= NBNXN_CI_DO_LJ(subc);
1123
1124 if (2*n1 <= na_c)
1125 {
1126                /* Only sort when strictly necessary. Ordering particles
1127                 * can lead to less accurate summation due to rounding,
1128                 * both for LJ and Coulomb interactions.
1129                 */
1130 if (2*(a_lj_max - s) >= na_c)
1131 {
1132 for (i = 0; i < n1; i++)
1133 {
1134 order[a0+i] = sort1[i];
1135 }
1136 for (j = 0; j < n2; j++)
1137 {
1138 order[a0+n1+j] = sort2[j];
1139 }
1140 }
1141
1142                *flags |= NBNXN_CI_HALF_LJ(subc);
1143 }
1144 }
1145 if (haveQ)
1146 {
1147            *flags |= NBNXN_CI_DO_COUL(subc);
1148 }
1149 subc++;
1150 }
1151}
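For reference, the bit layout these flag macros encode (hypothetical helper functions, not part of the file): each (sub-)cell subc owns three consecutive bits starting at bit 7, as the expansions above show.

    static int cell_does_lj(int flags, int subc)   { return (flags >> (7 + 3*subc)) & 1; }
    static int cell_half_lj(int flags, int subc)   { return (flags >> (8 + 3*subc)) & 1; }
    static int cell_does_coul(int flags, int subc) { return (flags >> (9 + 3*subc)) & 1; }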
1152
1153/* Fill a pair search cell with atoms.
1154 * Potentially sorts atoms and sets the interaction flags.
1155 */
1156void fill_cell(const nbnxn_search_t nbs,
1157 nbnxn_grid_t *grid,
1158 nbnxn_atomdata_t *nbat,
1159 int a0, int a1,
1160 const int *atinfo,
1161 rvec *x,
1162 int sx, int sy, int sz,
1163               nbnxn_bb_t gmx_unused *bb_work_aligned)
1164{
1165 int na, a;
1166 size_t offset;
1167 nbnxn_bb_t *bb_ptr;
1168#ifdef NBNXN_BBXXXX
1169 float *pbb_ptr;
1170#endif
1171
1172 na = a1 - a0;
1173
1174 if (grid->bSimple)
1175 {
1176 sort_on_lj(grid->na_c, a0, a1, atinfo, nbs->a,
1177 grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
1178 }
1179
1180 if (nbs->bFEP)
1181 {
1182 /* Set the fep flag for perturbed atoms in this (sub-)cell */
1183 int c, at;
1184
1185 /* The grid-local cluster/(sub-)cell index */
1186        c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : GPU_NSUBCELL);
1187 grid->fep[c] = 0;
1188 for (at = a0; at < a1; at++)
1189 {
1190            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
1191 {
1192 grid->fep[c] |= (1 << (at - a0));
1193 }
1194 }
1195 }
1196
1197 /* Now we have sorted the atoms, set the cell indices */
1198 for (a = a0; a < a1; a++)
1199 {
1200 nbs->cell[nbs->a[a]] = a;
1201 }
1202
1203 copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
1204 nbat->XFormat, nbat->x, a0,
1205 sx, sy, sz);
1206
1207 if (nbat->XFormat == nbatX4)
1208 {
1209 /* Store the bounding boxes as xyz.xyz. */
1210 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
1211 bb_ptr = grid->bb + offset;
1212
1213#if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2
1214 if (2*grid->na_cj == grid->na_c)
1215 {
1216            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
1217                                          grid->bbj+offset*2);
1218 }
1219 else
1220#endif
1221 {
1222            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
1223 }
1224 }
1225 else if (nbat->XFormat == nbatX8)
1226 {
1227 /* Store the bounding boxes as xyz.xyz. */
1228 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
1229 bb_ptr = grid->bb + offset;
1230
1231        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
1232 }
1233#ifdef NBNXN_BBXXXX
1234 else if (!grid->bSimple)
1235 {
1236 /* Store the bounding boxes in a format convenient
1237 * for SIMD4 calculations: xxxxyyyyzzzz...
1238 */
1239 pbb_ptr =
1240 grid->pbb +
1241            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
1242            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
1243
1244#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
1245 if (nbat->XFormat == nbatXYZQ)
1246 {
1247 calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
1248 bb_work_aligned, pbb_ptr);
1249 }
1250 else
1251#endif
1252 {
1253 calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
1254 pbb_ptr);
1255 }
1256 if (gmx_debug_at)
1257 {
1258 fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1259 sx, sy, sz,
1260                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
1261                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
1262                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
1263 }
1264 }
1265#endif
1266 else
1267 {
1268 /* Store the bounding boxes as xyz.xyz. */
1269 bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
1270
1271 calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
1272 bb_ptr);
1273
1274 if (gmx_debug_at)
1275 {
1276 int bbo;
1277 bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
1278 fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1279 sx, sy, sz,
1280                    grid->bb[bbo].lower[BB_X],
1281                    grid->bb[bbo].lower[BB_Y],
1282                    grid->bb[bbo].lower[BB_Z],
1283                    grid->bb[bbo].upper[BB_X],
1284                    grid->bb[bbo].upper[BB_Y],
1285                    grid->bb[bbo].upper[BB_Z]);
1286 }
1287 }
1288}
1289
1290/* Spatially sort the atoms within one grid column */
1291static void sort_columns_simple(const nbnxn_search_t nbs,
1292 int dd_zone,
1293 nbnxn_grid_t *grid,
1294 int a0, int a1,
1295 const int *atinfo,
1296 rvec *x,
1297 nbnxn_atomdata_t *nbat,
1298 int cxy_start, int cxy_end,
1299 int *sort_work)
1300{
1301 int cxy;
1302 int cx, cy, cz, ncz, cfilled, c;
1303 int na, ash, ind, a;
1304 int na_c, ash_c;
1305
1306 if (debug)
1307 {
1308 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1309 grid->cell0, cxy_start, cxy_end, a0, a1);
1310 }
1311
1312 /* Sort the atoms within each x,y column in 3 dimensions */
1313 for (cxy = cxy_start; cxy < cxy_end; cxy++)
1314 {
1315 cx = cxy/grid->ncy;
1316 cy = cxy - cx*grid->ncy;
1317
1318 na = grid->cxy_na[cxy];
1319 ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1320 ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1321
1322 /* Sort the atoms within each x,y column on z coordinate */
1323        sort_atoms(ZZ, FALSE, dd_zone,
1324                   nbs->a+ash, na, x,
1325                   grid->c0[ZZ],
1326                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
1327 sort_work);
1328
1329 /* Fill the ncz cells in this column */
1330 cfilled = grid->cxy_ind[cxy];
1331 for (cz = 0; cz < ncz; cz++)
1332 {
1333 c = grid->cxy_ind[cxy] + cz;
1334
1335 ash_c = ash + cz*grid->na_sc;
1336            na_c  = min(grid->na_sc, na-(ash_c-ash));
1337
1338 fill_cell(nbs, grid, nbat,
1339 ash_c, ash_c+na_c, atinfo, x,
1340 grid->na_sc*cx + (dd_zone >> 2),
1341 grid->na_sc*cy + (dd_zone & 3),
1342 grid->na_sc*cz,
1343                      NULL);
1344
1345            /* This copy to bbcz is not really necessary.
1346             * But it allows us to use the same grid search code
1347             * for the simple and supersub cell setups.
1348             */
1349 if (na_c > 0)
1350 {
1351 cfilled = c;
1352 }
1353            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
1354            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
1355 }
1356
1357 /* Set the unused atom indices to -1 */
1358 for (ind = na; ind < ncz*grid->na_sc; ind++)
1359 {
1360 nbs->a[ash+ind] = -1;
1361 }
1362 }
1363}
1364
1365/* Spatially sort the atoms within one grid column */
1366static void sort_columns_supersub(const nbnxn_search_t nbs,
1367 int dd_zone,
1368 nbnxn_grid_t *grid,
1369 int a0, int a1,
1370 const int *atinfo,
1371 rvec *x,
1372 nbnxn_atomdata_t *nbat,
1373 int cxy_start, int cxy_end,
1374 int *sort_work)
1375{
1376 int cxy;
1377 int cx, cy, cz = -1, c = -1, ncz;
1378 int na, ash, na_c, ind, a;
1379 int subdiv_z, sub_z, na_z, ash_z;
1380 int subdiv_y, sub_y, na_y, ash_y;
1381 int subdiv_x, sub_x, na_x, ash_x;
1382
1383 /* cppcheck-suppress unassignedVariable */
1384 nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
1385
1386 bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));
1387
1388 if (debug)
1389 {
1390 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1391 grid->cell0, cxy_start, cxy_end, a0, a1);
1392 }
1393
1394 subdiv_x = grid->na_c;
1395    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
1396    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
1397
1398 /* Sort the atoms within each x,y column in 3 dimensions */
1399 for (cxy = cxy_start; cxy < cxy_end; cxy++)
1400 {
1401 cx = cxy/grid->ncy;
1402 cy = cxy - cx*grid->ncy;
1403
1404 na = grid->cxy_na[cxy];
1405 ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1406 ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1407
1408 /* Sort the atoms within each x,y column on z coordinate */
1409        sort_atoms(ZZ, FALSE, dd_zone,
1410                   nbs->a+ash, na, x,
1411                   grid->c0[ZZ],
1412                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
1413 sort_work);
1414
1415 /* This loop goes over the supercells and subcells along z at once */
1416        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
1417 {
1418 ash_z = ash + sub_z*subdiv_z;
1419            na_z  = min(subdiv_z, na-(ash_z-ash));
1420
1421 /* We have already sorted on z */
1422
1423            if (sub_z % GPU_NSUBCELL_Z == 0)
1424            {
1425                cz = sub_z/GPU_NSUBCELL_Z;
1426 c = grid->cxy_ind[cxy] + cz;
1427
1428 /* The number of atoms in this supercell */
1429                na_c = min(grid->na_sc, na-(ash_z-ash));
1430
1431                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
1432
1433                /* Store the z-boundaries of the super cell */
1434                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1435                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
1436 }
1437
1438#if GPU_NSUBCELL_Y > 1
1439            /* Sort the atoms along y */
1440            sort_atoms(YY, (sub_z & 1), dd_zone,
1441                       nbs->a+ash_z, na_z, x,
1442                       grid->c0[YY]+cy*grid->sy,
1443                       grid->inv_sy, subdiv_z,
1444                       sort_work);
1445#endif
1446
1447 for (sub_y = 0; sub_y < GPU_NSUBCELL_Y2; sub_y++)
1448 {
1449 ash_y = ash_z + sub_y*subdiv_y;
1450                na_y  = min(subdiv_y, na-(ash_y-ash));
1451
1452#if GPU_NSUBCELL_X > 1
1453                /* Sort the atoms along x */
1454                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1), dd_zone,
1455                           nbs->a+ash_y, na_y, x,
1456                           grid->c0[XX]+cx*grid->sx,
1457                           grid->inv_sx, subdiv_y,
1458                           sort_work);
1459#endif
1460
1461 for (sub_x = 0; sub_x < GPU_NSUBCELL_X2; sub_x++)
1462 {
1463 ash_x = ash_y + sub_x*subdiv_x;
1464                    na_x  = min(subdiv_x, na-(ash_x-ash));
1465
1466 fill_cell(nbs, grid, nbat,
1467 ash_x, ash_x+na_x, atinfo, x,
1468                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
1469                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
1470 grid->na_c*sub_z,
1471 bb_work_aligned);
1472 }
1473 }
1474 }
1475
1476 /* Set the unused atom indices to -1 */
1477 for (ind = na; ind < ncz*grid->na_sc; ind++)
1478 {
1479 nbs->a[ash+ind] = -1;
1480 }
1481 }
1482}
1483
1484/* Determine in which grid column atoms should go */
1485static void calc_column_indices(nbnxn_grid_t *grid,
1486 int a0, int a1,
1487 rvec *x,
1488 int dd_zone, const int *move,
1489 int thread, int nthread,
1490 int *cell,
1491 int *cxy_na)
1492{
1493 int n0, n1, i;
1494 int cx, cy;
1495
1496 /* We add one extra cell for particles which moved during DD */
1497 for (i = 0; i < grid->ncx*grid->ncy+1; i++)
1498 {
1499 cxy_na[i] = 0;
1500 }
1501
1502 n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
1503 n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
1504 if (dd_zone == 0)
1505 {
1506 /* Home zone */
1507 for (i = n0; i < n1; i++)
1508 {
1509            if (move == NULL || move[i] >= 0)
1510 {
1511 /* We need to be careful with rounding,
1512 * particles might be a few bits outside the local zone.
1513 * The int cast takes care of the lower bound,
1514 * we will explicitly take care of the upper bound.
1515 */
1516                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1517                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1518
1519#ifndef NDEBUG
1520                if (cx < 0 || cx > grid->ncx ||
1521                    cy < 0 || cy > grid->ncy)
1522                {
1523                    gmx_fatal(FARGS,
1524                              "grid cell cx %d cy %d out of range (max %d %d)\n"
1525                              "atom %f %f %f, grid->c0 %f %f",
1526                              cx, cy, grid->ncx, grid->ncy,
1527                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1528                }
1529#endif
1530                /* Take care of potential rounding issues */
1531                cx = min(cx, grid->ncx - 1);
1532                cy = min(cy, grid->ncy - 1);
1533
1534                /* For the moment cell will contain only the grid-local
1535                 * x and y indices, not z.
1536                 */
1537 cell[i] = cx*grid->ncy + cy;
1538 }
1539 else
1540 {
1541 /* Put this moved particle after the end of the grid,
1542 * so we can process it later without using conditionals.
1543 */
1544 cell[i] = grid->ncx*grid->ncy;
1545 }
1546
1547 cxy_na[cell[i]]++;
1548 }
1549 }
1550 else
1551 {
1552 /* Non-home zone */
1553 for (i = n0; i < n1; i++)
1554 {
1555            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1556            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1557
1558 /* For non-home zones there could be particles outside
1559 * the non-bonded cut-off range, which have been communicated
1560 * for bonded interactions only. For the result it doesn't
1561 * matter where these end up on the grid. For performance
1562 * we put them in an extra row at the border.
1563 */
1564            cx = max(cx, 0);
1565            cx = min(cx, grid->ncx - 1);
1566            cy = max(cy, 0);
1567            cy = min(cy, grid->ncy - 1);
1568
1569            /* For the moment cell will contain only the grid-local
1570             * x and y indices, not z.
1571             */
1572 cell[i] = cx*grid->ncy + cy;
1573
1574 cxy_na[cell[i]]++;
1575 }
1576 }
1577}
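To make the column indexing concrete, a hypothetical example (numbers invented, not from the file): with grid->c0[XX] = grid->c0[YY] = 0, inv_sx = inv_sy = 2 (0.5 nm columns) and grid->ncy = 8:

    int cx   = (int)((1.3 - 0.0)*2.0);  /* = 2                              */
    int cy   = (int)((0.4 - 0.0)*2.0);  /* = 0                              */
    int cell = cx*8 + cy;               /* column 16 of the ncx*ncy grid    */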
1578
1579/* Determine in which grid cells the atoms should go */
1580static void calc_cell_indices(const nbnxn_search_t nbs,
1581 int dd_zone,
1582 nbnxn_grid_t *grid,
1583 int a0, int a1,
1584 const int *atinfo,
1585 rvec *x,
1586 const int *move,
1587 nbnxn_atomdata_t *nbat)
1588{
1589 int n0, n1, i;
1590 int cx, cy, cxy, ncz_max, ncz;
1591 int nthread, thread;
1592 int *cxy_na, cxy_na_i;
1593
1594 nthread = gmx_omp_nthreads_get(emntPairsearch);
1595
1596#pragma omp parallel for num_threads(nthread) schedule(static)
1597 for (thread = 0; thread < nthread; thread++)
1598 {
1599 calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
1600 nbs->cell, nbs->work[thread].cxy_na);
1601 }
1602
1603 /* Make the cell index as a function of x and y */
1604 ncz_max = 0;
1605 ncz = 0;
1606 grid->cxy_ind[0] = 0;
1607 for (i = 0; i < grid->ncx*grid->ncy+1; i++)
1608 {
1609        /* We set ncz_max at the beginning of the loop instead of at the end
1610         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
1611         * that do not need to be ordered on the grid.
1612         */
1613 if (ncz > ncz_max)
1614 {
1615 ncz_max = ncz;
1616 }
1617 cxy_na_i = nbs->work[0].cxy_na[i];
1618 for (thread = 1; thread < nthread; thread++)
1619 {
1620 cxy_na_i += nbs->work[thread].cxy_na[i];
1621 }
1622 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1623 if (nbat->XFormat == nbatX8)
1624 {
1625            /* Make the number of cells a multiple of 2 */
1626 ncz = (ncz + 1) & ~1;
1627 }
1628 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1629 /* Clear cxy_na, so we can reuse the array below */
1630 grid->cxy_na[i] = 0;
1631 }
1632 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1633
1634 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1635
1636 if (debug)
1637 {
1638 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1639 grid->na_sc, grid->na_c, grid->nc,
1640 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1641 ncz_max);
1642 if (gmx_debug_at)
1643 {
1644 i = 0;
1645 for (cy = 0; cy < grid->ncy; cy++)
1646 {
1647 for (cx = 0; cx < grid->ncx; cx++)
1648 {
1649 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1650 i++;
1651 }
1652 fprintf(debug, "\n");
1653 }
1654 }
1655 }
1656
1657 /* Make sure the work array for sorting is large enough */
1658    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1659    {
1660        for (thread = 0; thread < nbs->nthread_max; thread++)
1661        {
1662            nbs->work[thread].sort_work_nalloc =
1663                over_alloc_large(ncz_max*grid->na_sc*SGSF);
1664            srenew(nbs->work[thread].sort_work,
1665                   nbs->work[thread].sort_work_nalloc);
1666 /* When not in use, all elements should be -1 */
1667 for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1668 {
1669 nbs->work[thread].sort_work[i] = -1;
1670 }
1671 }
1672 }
1673
1674 /* Now we know the dimensions we can fill the grid.
1675 * This is the first, unsorted fill. We sort the columns after this.
1676 */
1677 for (i = a0; i < a1; i++)
1678 {
1679 /* At this point nbs->cell contains the local grid x,y indices */
1680 cxy = nbs->cell[i];
1681 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1682 }
1683
1684 if (dd_zone == 0)
1685 {
1686 /* Set the cell indices for the moved particles */
1687 n0 = grid->nc*grid->na_sc;
1688 n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1689 if (dd_zone == 0)
1690 {
1691 for (i = n0; i < n1; i++)
1692 {
1693 nbs->cell[nbs->a[i]] = i;
1694 }
1695 }
1696 }
1697
1698 /* Sort the super-cell columns along z into the sub-cells. */
1699#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
1700 for (thread = 0; thread < nbs->nthread_max; thread++)
1701 {
1702 if (grid->bSimple)
1703 {
1704 sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1705 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1706 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1707 nbs->work[thread].sort_work);
1708 }
1709 else
1710 {
1711 sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1712 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1713 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1714 nbs->work[thread].sort_work);
1715 }
1716 }
1717
1718 if (grid->bSimple && nbat->XFormat == nbatX8)
1719 {
1720 combine_bounding_box_pairs(grid, grid->bb);
1721 }
1722
1723 if (!grid->bSimple)
1724 {
1725 grid->nsubc_tot = 0;
1726 for (i = 0; i < grid->nc; i++)
1727 {
1728 grid->nsubc_tot += grid->nsubc[i];
1729 }
1730 }
1731
1732 if (debug)
1733 {
1734 if (grid->bSimple)
1735 {
1736 print_bbsizes_simple(debug, nbs, grid);
1737 }
1738 else
1739 {
1740 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1741 grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
1742
1743 print_bbsizes_supersub(debug, nbs, grid);
1744 }
1745 }
1746}
1747
1748static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
1749 int natoms)
1750{
1751 int b;
1752
1753 flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE16 - 1)/NBNXN_BUFFERFLAG_SIZE16;
1754 if (flags->nflag > flags->flag_nalloc)
1755 {
1756 flags->flag_nalloc = over_alloc_large(flags->nflag)(int)(1.19*(flags->nflag) + 1000);
1757 srenew(flags->flag, flags->flag_nalloc);
1758 }
1759 for (b = 0; b < flags->nflag; b++)
1760 {
1761 flags->flag[b] = 0;
1762 }
1763}
1764
1765/* Sets up a grid and puts the atoms on the grid.
1766 * This function only operates on one domain of the domain decomposition.
1767 * Note that without domain decomposition there is only one domain.
1768 */
1769void nbnxn_put_on_grid(nbnxn_search_t nbs,
1770 int ePBC, matrix box,
1771 int dd_zone,
1772 rvec corner0, rvec corner1,
1773 int a0, int a1,
1774 real atom_density,
1775 const int *atinfo,
1776 rvec *x,
1777 int nmoved, int *move,
1778 int nb_kernel_type,
1779 nbnxn_atomdata_t *nbat)
1780{
1781 nbnxn_grid_t *grid;
1782 int n;
1783 int nc_max_grid, nc_max;
1784
1785 grid = &nbs->grid[dd_zone];
1786
1787 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1788
1789 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1790
1791 grid->na_c = nbnxn_kernel_to_ci_size(nb_kernel_type);
1792 grid->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
1793 grid->na_sc = (grid->bSimple ? 1 : GPU_NSUBCELL(2*2*2))*grid->na_c;
1794 grid->na_c_2log = get_2log(grid->na_c);
1795
1796 nbat->na_c = grid->na_c;
1797
1798 if (dd_zone == 0)
1799 {
1800 grid->cell0 = 0;
1801 }
1802 else
1803 {
1804 grid->cell0 =
1805 (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1806 nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1807 }
1808
1809 n = a1 - a0;
1810
1811 if (dd_zone == 0)
1812 {
1813 nbs->ePBC = ePBC;
1814 copy_mat(box, nbs->box);
1815
1816 if (atom_density >= 0)
1817 {
1818 grid->atom_density = atom_density;
1819 }
1820 else
1821 {
1822 grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
1823 }
1824
1825 grid->cell0 = 0;
1826
1827 nbs->natoms_local = a1 - nmoved;
1828 /* We assume that nbnxn_put_on_grid is called first
1829 * for the local atoms (dd_zone=0).
1830 */
1831 nbs->natoms_nonlocal = a1 - nmoved;
1832 }
1833 else
1834 {
1835 nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);
1836 }
1837
1838 nc_max_grid = set_grid_size_xy(nbs, grid,
1839 dd_zone, n-nmoved, corner0, corner1,
1840 nbs->grid[0].atom_density);
1841
1842 nc_max = grid->cell0 + nc_max_grid;
1843
1844 if (a1 > nbs->cell_nalloc)
1845 {
1846 nbs->cell_nalloc = over_alloc_large(a1)(int)(1.19*(a1) + 1000);
1847 srenew(nbs->cell, nbs->cell_nalloc);
1848 }
1849
1850 /* To avoid conditionals we store the moved particles at the end of a,
1851 * so make sure we have enough space.
1852 */
1853 if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1854 {
1855 nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved)(int)(1.19*(nc_max*grid->na_sc + nmoved) + 1000);
1856 srenew(nbs->a, nbs->a_nalloc);
1857 }
1858
1859 /* We need padding up to a multiple of the buffer flag size: simply add */
1860 if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE16 > nbat->nalloc)
1861 {
1862 nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE16);
1863 }
1864
1865 calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
1866
1867 if (dd_zone == 0)
1868 {
1869 nbat->natoms_local = nbat->natoms;
1870 }
1871
1872 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1873}
1874
1875/* Calls nbnxn_put_on_grid for all non-local domains */
1876void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1877 const gmx_domdec_zones_t *zones,
1878 const int *atinfo,
1879 rvec *x,
1880 int nb_kernel_type,
1881 nbnxn_atomdata_t *nbat)
1882{
1883 int zone, d;
1884 rvec c0, c1;
1885
1886 for (zone = 1; zone < zones->n; zone++)
1887 {
1888 for (d = 0; d < DIM3; d++)
1889 {
1890 c0[d] = zones->size[zone].bb_x0[d];
1891 c1[d] = zones->size[zone].bb_x1[d];
1892 }
1893
1894 nbnxn_put_on_grid(nbs, nbs->ePBC, NULL((void*)0),
1895 zone, c0, c1,
1896 zones->cg_range[zone],
1897 zones->cg_range[zone+1],
1898 -1,
1899 atinfo,
1900 x,
1901 0, NULL((void*)0),
1902 nb_kernel_type,
1903 nbat);
1904 }
1905}
1906
1907/* Add simple grid type information to the local super/sub grid */
1908void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1909 nbnxn_atomdata_t *nbat)
1910{
1911 nbnxn_grid_t *grid;
1912 float *bbcz;
1913 nbnxn_bb_t *bb;
1914 int ncd, sc;
1915
1916 grid = &nbs->grid[0];
1917
1918 if (grid->bSimple)
1919 {
1920 gmx_incons("nbnxn_grid_simple called with a simple grid");
1921 }
1922
1923 ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE4;
1924
1925 if (grid->nc*ncd > grid->nc_nalloc_simple)
1926 {
1927 grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd)(int)(1.19*(grid->nc*ncd) + 1000);
1928 srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
1929 srenew(grid->bb_simple, grid->nc_nalloc_simple);
1930 srenew(grid->flags_simple, grid->nc_nalloc_simple);
1931 if (nbat->XFormat)
1932 {
1933 sfree_aligned(grid->bbj);
1934 snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
1935 }
1936 }
1937
1938 bbcz = grid->bbcz_simple;
1939 bb = grid->bb_simple;
1940
1941#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
1942 for (sc = 0; sc < grid->nc; sc++)
1943 {
1944 int c, tx, na;
1945
1946 for (c = 0; c < ncd; c++)
1947 {
1948 tx = sc*ncd + c;
1949
1950 na = NBNXN_CPU_CLUSTER_I_SIZE4;
1951 while (na > 0 &&
1952 nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE4+na-1] == nbat->ntype-1)
1953 {
1954 na--;
1955 }
1956
1957 if (na > 0)
1958 {
1959 switch (nbat->XFormat)
1960 {
1961 case nbatX4:
1962 /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1963 calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4(3*4),
1964 bb+tx);
1965 break;
1966 case nbatX8:
1967 /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1968 calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE)((3*8)*((tx*4) >> 3) + ((tx*4) & (8 - 1))),
1969 bb+tx);
1970 break;
1971 default:
1972 calc_bounding_box(na, nbat->xstride,
1973 nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE4*nbat->xstride,
1974 bb+tx);
1975 break;
1976 }
1977 bbcz[tx*NNBSBB_D2+0] = bb[tx].lower[BB_Z2];
1978 bbcz[tx*NNBSBB_D2+1] = bb[tx].upper[BB_Z2];
1979
1980 /* No interaction optimization yet here */
1981 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0)(1<<(7+3*(0))) | NBNXN_CI_DO_COUL(0)(1<<(9+3*(0)));
1982 }
1983 else
1984 {
1985 grid->flags_simple[tx] = 0;
1986 }
1987 }
1988 }
1989
1990 if (grid->bSimple && nbat->XFormat == nbatX8)
1991 {
1992 combine_bounding_box_pairs(grid, grid->bb_simple);
1993 }
1994}
1995
1996void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1997{
1998 *ncx = nbs->grid[0].ncx;
1999 *ncy = nbs->grid[0].ncy;
2000}
2001
2002void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
2003{
2004 const nbnxn_grid_t *grid;
2005
2006 grid = &nbs->grid[0];
2007
2008 /* Return the atom order for the home cell (index 0) */
2009 *a = nbs->a;
2010
2011 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
2012}
2013
2014void nbnxn_set_atomorder(nbnxn_search_t nbs)
2015{
2016 nbnxn_grid_t *grid;
2017 int ao, cx, cy, cxy, cz, j;
2018
2019 /* Set the atom order for the home cell (index 0) */
2020 grid = &nbs->grid[0];
2021
2022 ao = 0;
2023 for (cx = 0; cx < grid->ncx; cx++)
2024 {
2025 for (cy = 0; cy < grid->ncy; cy++)
2026 {
2027 cxy = cx*grid->ncy + cy;
2028 j = grid->cxy_ind[cxy]*grid->na_sc;
2029 for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
2030 {
2031 nbs->a[j] = ao;
2032 nbs->cell[ao] = j;
2033 ao++;
2034 j++;
2035 }
2036 }
2037 }
2038}
2039
2040/* Determines the cell range along one dimension that
2041 * the bounding box b0 - b1 sees.
2042 */
2043static void get_cell_range(real b0, real b1,
2044 int nc, real c0, real s, real invs,
2045 real d2, real r2, int *cf, int *cl)
2046{
2047 *cf = max((int)((b0 - c0)*invs), 0);
2048
2049 while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
2050 {
2051 (*cf)--;
2052 }
2053
2054 *cl = min((int)((b1 - c0)*invs), nc-1);
2055 while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
2056 {
2057 (*cl)++;
2058 }
2059}
2060
2061/* Reference code calculating the distance^2 between two bounding boxes */
2062static float box_dist2(float bx0, float bx1, float by0,
2063 float by1, float bz0, float bz1,
2064 const nbnxn_bb_t *bb)
2065{
2066 float d2;
2067 float dl, dh, dm, dm0;
2068
2069 d2 = 0;
2070
2071 dl = bx0 - bb->upper[BB_X0];
2072 dh = bb->lower[BB_X0] - bx1;
2073 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2074 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2075 d2 += dm0*dm0;
2076
2077 dl = by0 - bb->upper[BB_Y1];
2078 dh = bb->lower[BB_Y1] - by1;
2079 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2080 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2081 d2 += dm0*dm0;
2082
2083 dl = bz0 - bb->upper[BB_Z2];
2084 dh = bb->lower[BB_Z2] - bz1;
2085 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2086 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2087 d2 += dm0*dm0;
2088
2089 return d2;
2090}
2091
2092/* Plain C code calculating the distance^2 between two bounding boxes */
2093static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
2094 int csj, const nbnxn_bb_t *bb_j_all)
2095{
2096 const nbnxn_bb_t *bb_i, *bb_j;
2097 float d2;
2098 float dl, dh, dm, dm0;
2099
2100 bb_i = bb_i_ci + si;
2101 bb_j = bb_j_all + csj;
2102
2103 d2 = 0;
2104
2105 dl = bb_i->lower[BB_X0] - bb_j->upper[BB_X0];
2106 dh = bb_j->lower[BB_X0] - bb_i->upper[BB_X0];
2107 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2108 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2109 d2 += dm0*dm0;
2110
2111 dl = bb_i->lower[BB_Y1] - bb_j->upper[BB_Y1];
2112 dh = bb_j->lower[BB_Y1] - bb_i->upper[BB_Y1];
2113 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2114 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2115 d2 += dm0*dm0;
2116
2117 dl = bb_i->lower[BB_Z2] - bb_j->upper[BB_Z2];
2118 dh = bb_j->lower[BB_Z2] - bb_i->upper[BB_Z2];
2119 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2120 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2121 d2 += dm0*dm0;
2122
2123 return d2;
2124}
2125
2126#ifdef NBNXN_SEARCH_BB_SIMD4
2127
2128/* 4-wide SIMD code for bb distance for bb format xyz0 */
2129static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
2130 int csj, const nbnxn_bb_t *bb_j_all)
2131{
2132 gmx_simd4_float_t__m128 bb_i_S0, bb_i_S1;
2133 gmx_simd4_float_t__m128 bb_j_S0, bb_j_S1;
2134 gmx_simd4_float_t__m128 dl_S;
2135 gmx_simd4_float_t__m128 dh_S;
2136 gmx_simd4_float_t__m128 dm_S;
2137 gmx_simd4_float_t__m128 dm0_S;
2138
2139 bb_i_S0 = gmx_simd4_load_f_mm_load_ps(&bb_i_ci[si].lower[0]);
2140 bb_i_S1 = gmx_simd4_load_f_mm_load_ps(&bb_i_ci[si].upper[0]);
2141 bb_j_S0 = gmx_simd4_load_f_mm_load_ps(&bb_j_all[csj].lower[0]);
2142 bb_j_S1 = gmx_simd4_load_f_mm_load_ps(&bb_j_all[csj].upper[0]);
2143
2144 dl_S = gmx_simd4_sub_f_mm_sub_ps(bb_i_S0, bb_j_S1);
2145 dh_S = gmx_simd4_sub_f_mm_sub_ps(bb_j_S0, bb_i_S1);
2146
2147 dm_S = gmx_simd4_max_f_mm_max_ps(dl_S, dh_S);
2148 dm0_S = gmx_simd4_max_f_mm_max_ps(dm_S, gmx_simd4_setzero_f_mm_setzero_ps());
2149
2150 return gmx_simd4_dotproduct3_fgmx_simd4_dotproduct3_f_sse4_1(dm0_S, dm0_S);
2151}
2152
2153 /* Calculate the bounding-box distances of bb_i[si,...,si+3] and store them in d2 */
2154 #define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
2155 { \
2156 int shi; \
2157 \
2158 gmx_simd4_float_t__m128 dx_0, dy_0, dz_0; \
2159 gmx_simd4_float_t__m128 dx_1, dy_1, dz_1; \
2160 \
2161 gmx_simd4_float_t__m128 mx, my, mz; \
2162 gmx_simd4_float_t__m128 m0x, m0y, m0z; \
2163 \
2164 gmx_simd4_float_t__m128 d2x, d2y, d2z; \
2165 gmx_simd4_float_t__m128 d2s, d2t; \
2166 \
2167 shi = si*NNBSBB_D2*DIM3; \
2168 \
2169 xi_l = gmx_simd4_load_f_mm_load_ps(bb_i+shi+0*STRIDE_PBB4); \
2170 yi_l = gmx_simd4_load_f_mm_load_ps(bb_i+shi+1*STRIDE_PBB4); \
2171 zi_l = gmx_simd4_load_f_mm_load_ps(bb_i+shi+2*STRIDE_PBB4); \
2172 xi_h = gmx_simd4_load_f_mm_load_ps(bb_i+shi+3*STRIDE_PBB4); \
2173 yi_h = gmx_simd4_load_f_mm_load_ps(bb_i+shi+4*STRIDE_PBB4); \
2174 zi_h = gmx_simd4_load_f_mm_load_ps(bb_i+shi+5*STRIDE_PBB4); \
2175 \
2176 dx_0 = gmx_simd4_sub_f_mm_sub_ps(xi_l, xj_h); \
2177 dy_0 = gmx_simd4_sub_f_mm_sub_ps(yi_l, yj_h); \
2178 dz_0 = gmx_simd4_sub_f_mm_sub_ps(zi_l, zj_h); \
2179 \
2180 dx_1 = gmx_simd4_sub_f_mm_sub_ps(xj_l, xi_h); \
2181 dy_1 = gmx_simd4_sub_f_mm_sub_ps(yj_l, yi_h); \
2182 dz_1 = gmx_simd4_sub_f_mm_sub_ps(zj_l, zi_h); \
2183 \
2184 mx = gmx_simd4_max_f_mm_max_ps(dx_0, dx_1); \
2185 my = gmx_simd4_max_f_mm_max_ps(dy_0, dy_1); \
2186 mz = gmx_simd4_max_f_mm_max_ps(dz_0, dz_1); \
2187 \
2188 m0x = gmx_simd4_max_f_mm_max_ps(mx, zero); \
2189 m0y = gmx_simd4_max_f_mm_max_ps(my, zero); \
2190 m0z = gmx_simd4_max_f_mm_max_ps(mz, zero); \
2191 \
2192 d2x = gmx_simd4_mul_f_mm_mul_ps(m0x, m0x); \
2193 d2y = gmx_simd4_mul_f_mm_mul_ps(m0y, m0y); \
2194 d2z = gmx_simd4_mul_f_mm_mul_ps(m0z, m0z); \
2195 \
2196 d2s = gmx_simd4_add_f_mm_add_ps(d2x, d2y); \
2197 d2t = gmx_simd4_add_f_mm_add_ps(d2s, d2z); \
2198 \
2199 gmx_simd4_store_f_mm_store_ps(d2+si, d2t); \
2200 }
2201
2202/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
2203static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
2204 int nsi, const float *bb_i,
2205 float *d2)
2206{
2207 gmx_simd4_float_t__m128 xj_l, yj_l, zj_l;
2208 gmx_simd4_float_t__m128 xj_h, yj_h, zj_h;
2209 gmx_simd4_float_t__m128 xi_l, yi_l, zi_l;
2210 gmx_simd4_float_t__m128 xi_h, yi_h, zi_h;
2211
2212 gmx_simd4_float_t__m128 zero;
2213
2214 zero = gmx_simd4_setzero_f_mm_setzero_ps();
2215
2216 xj_l = gmx_simd4_set1_f_mm_set1_ps(bb_j[0*STRIDE_PBB4]);
2217 yj_l = gmx_simd4_set1_f_mm_set1_ps(bb_j[1*STRIDE_PBB4]);
2218 zj_l = gmx_simd4_set1_f_mm_set1_ps(bb_j[2*STRIDE_PBB4]);
2219 xj_h = gmx_simd4_set1_f_mm_set1_ps(bb_j[3*STRIDE_PBB4]);
2220 yj_h = gmx_simd4_set1_f_mm_set1_ps(bb_j[4*STRIDE_PBB4]);
2221 zj_h = gmx_simd4_set1_f_mm_set1_ps(bb_j[5*STRIDE_PBB4]);
2222
2223 /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
2224 * But as we know the number of iterations is 1 or 2, we unroll manually.
2225 */
2226 SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2);
2227 if (STRIDE_PBB4 < nsi)
2228 {
2229 SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2);
2230 }
2231}
2232
2233#endif /* NBNXN_SEARCH_BB_SIMD4 */
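
[Editor's note] The xxxxyyyyzzzz format used here transposes the bounding boxes: the same bound component of STRIDE_PBB (4) consecutive sub-cells is stored contiguously, so a single 4-wide load in the macro above fetches, say, the lower x bound of four i-sub-cells at once. A sketch of the packing under that assumption (names are illustrative):

    #define STRIDE 4  /* mirrors STRIDE_PBB */

    /* Pack 4 boxes into xxxxyyyyzzzz order: pbb[c*STRIDE + b] holds
     * component c of box b, with c = 0..2 the lower x,y,z bounds and
     * c = 3..5 the upper x,y,z bounds.
     */
    static void pack_pbb4(const float lower[4][3], const float upper[4][3],
                          float pbb[6*STRIDE])
    {
        int b, d;

        for (b = 0; b < STRIDE; b++)
        {
            for (d = 0; d < 3; d++)
            {
                pbb[d*STRIDE + b]       = lower[b][d];
                pbb[(d + 3)*STRIDE + b] = upper[b][d];
            }
        }
    }
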
2234
2235/* Plain C function which determines if any atom pair between two cells
2236 * is within distance sqrt(rl2).
2237 */
2238static gmx_bool subc_in_range_x(int na_c,
2239 int si, const real *x_i,
2240 int csj, int stride, const real *x_j,
2241 real rl2)
2242{
2243 int i, j, i0, j0;
2244 real d2;
2245
2246 for (i = 0; i < na_c; i++)
2247 {
2248 i0 = (si*na_c + i)*DIM3;
2249 for (j = 0; j < na_c; j++)
2250 {
2251 j0 = (csj*na_c + j)*stride;
2252
2253 d2 = sqr(x_i[i0 ] - x_j[j0 ]) +
2254 sqr(x_i[i0+1] - x_j[j0+1]) +
2255 sqr(x_i[i0+2] - x_j[j0+2]);
2256
2257 if (d2 < rl2)
2258 {
2259 return TRUE1;
2260 }
2261 }
2262 }
2263
2264 return FALSE0;
2265}
2266
2267#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
2268
2269/* 4-wide SIMD function which determines if any atom pair between two cells,
2270 * both with 8 atoms, is within distance sqrt(rl2).
2271 * Using 8-wide AVX is not faster on Intel Sandy Bridge.
2272 */
2273static gmx_bool subc_in_range_simd4(int na_c,
2274 int si, const real *x_i,
2275 int csj, int stride, const real *x_j,
2276 real rl2)
2277{
2278 gmx_simd4_real_t__m128 ix_S0, iy_S0, iz_S0;
2279 gmx_simd4_real_t__m128 ix_S1, iy_S1, iz_S1;
2280
2281 gmx_simd4_real_t__m128 rc2_S;
2282
2283 int dim_stride;
2284 int j0, j1;
2285
2286 rc2_S = gmx_simd4_set1_r_mm_set1_ps(rl2);
2287
2288 dim_stride = NBNXN_GPU_CLUSTER_SIZE8/STRIDE_PBB4*DIM3;
2289 ix_S0 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+0)*STRIDE_PBB4);
2290 iy_S0 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+1)*STRIDE_PBB4);
2291 iz_S0 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+2)*STRIDE_PBB4);
2292 ix_S1 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+3)*STRIDE_PBB4);
2293 iy_S1 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+4)*STRIDE_PBB4);
2294 iz_S1 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+5)*STRIDE_PBB4);
2295
2296 /* We loop from the outer to the inner particles to maximize
2297 * the chance that we find a pair in range quickly and return.
2298 */
2299 j0 = csj*na_c;
2300 j1 = j0 + na_c - 1;
2301 while (j0 < j1)
2302 {
2303 gmx_simd4_real_t__m128 jx0_S, jy0_S, jz0_S;
2304 gmx_simd4_real_t__m128 jx1_S, jy1_S, jz1_S;
2305
2306 gmx_simd4_real_t__m128 dx_S0, dy_S0, dz_S0;
2307 gmx_simd4_real_t__m128 dx_S1, dy_S1, dz_S1;
2308 gmx_simd4_real_t__m128 dx_S2, dy_S2, dz_S2;
2309 gmx_simd4_real_t__m128 dx_S3, dy_S3, dz_S3;
2310
2311 gmx_simd4_real_t__m128 rsq_S0;
2312 gmx_simd4_real_t__m128 rsq_S1;
2313 gmx_simd4_real_t__m128 rsq_S2;
2314 gmx_simd4_real_t__m128 rsq_S3;
2315
2316 gmx_simd4_bool_t__m128 wco_S0;
2317 gmx_simd4_bool_t__m128 wco_S1;
2318 gmx_simd4_bool_t__m128 wco_S2;
2319 gmx_simd4_bool_t__m128 wco_S3;
2320 gmx_simd4_bool_t__m128 wco_any_S01, wco_any_S23, wco_any_S;
2321
2322 jx0_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j0*stride+0]);
2323 jy0_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j0*stride+1]);
2324 jz0_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j0*stride+2]);
2325
2326 jx1_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j1*stride+0]);
2327 jy1_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j1*stride+1]);
2328 jz1_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j1*stride+2]);
2329
2330 /* Calculate distance */
2331 dx_S0 = gmx_simd4_sub_r_mm_sub_ps(ix_S0, jx0_S);
2332 dy_S0 = gmx_simd4_sub_r_mm_sub_ps(iy_S0, jy0_S);
2333 dz_S0 = gmx_simd4_sub_r_mm_sub_ps(iz_S0, jz0_S);
2334 dx_S1 = gmx_simd4_sub_r_mm_sub_ps(ix_S1, jx0_S);
2335 dy_S1 = gmx_simd4_sub_r_mm_sub_ps(iy_S1, jy0_S);
2336 dz_S1 = gmx_simd4_sub_r_mm_sub_ps(iz_S1, jz0_S);
2337 dx_S2 = gmx_simd4_sub_r_mm_sub_ps(ix_S0, jx1_S);
2338 dy_S2 = gmx_simd4_sub_r_mm_sub_ps(iy_S0, jy1_S);
2339 dz_S2 = gmx_simd4_sub_r_mm_sub_ps(iz_S0, jz1_S);
2340 dx_S3 = gmx_simd4_sub_r_mm_sub_ps(ix_S1, jx1_S);
2341 dy_S3 = gmx_simd4_sub_r_mm_sub_ps(iy_S1, jy1_S);
2342 dz_S3 = gmx_simd4_sub_r_mm_sub_ps(iz_S1, jz1_S);
2343
2344 /* rsq = dx*dx+dy*dy+dz*dz */
2345 rsq_S0 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S0, dy_S0, dz_S0);
2346 rsq_S1 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S1, dy_S1, dz_S1);
2347 rsq_S2 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S2, dy_S2, dz_S2);
2348 rsq_S3 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S3, dy_S3, dz_S3);
2349
2350 wco_S0 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S0, rc2_S);
2351 wco_S1 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S1, rc2_S);
2352 wco_S2 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S2, rc2_S);
2353 wco_S3 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S3, rc2_S);
2354
2355 wco_any_S01 = gmx_simd4_or_b_mm_or_ps(wco_S0, wco_S1);
2356 wco_any_S23 = gmx_simd4_or_b_mm_or_ps(wco_S2, wco_S3);
2357 wco_any_S = gmx_simd4_or_b_mm_or_ps(wco_any_S01, wco_any_S23);
2358
2359 if (gmx_simd4_anytrue_b_mm_movemask_ps(wco_any_S))
2360 {
2361 return TRUE1;
2362 }
2363
2364 j0++;
2365 j1--;
2366 }
2367 return FALSE0;
2368
2369}
2370#endif
2371
2372
2373/* Returns the j sub-cell for index cj_ind */
2374static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
2375{
2376 return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG2].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE4 - 1)];
2377}
2378
2379/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2380static unsigned int nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
2381{
2382 return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG2].imei[0].imask;
2383}
2384
2385 /* Ensures there is enough space for extra exclusion masks */
2386static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
2387{
2388 if (nbl->nexcl+extra > nbl->excl_nalloc)
2389 {
2390 nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra)(int)(1.19*(nbl->nexcl+extra) + 8000);
2391 nbnxn_realloc_void((void **)&nbl->excl,
2392 nbl->nexcl*sizeof(*nbl->excl),
2393 nbl->excl_nalloc*sizeof(*nbl->excl),
2394 nbl->alloc, nbl->free);
2395 }
2396}
2397
2398/* Ensures there is enough space for ncell extra j-cells in the list */
2399static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
2400 int ncell)
2401{
2402 int cj_max;
2403
2404 cj_max = nbl->ncj + ncell;
2405
2406 if (cj_max > nbl->cj_nalloc)
2407 {
2408 nbl->cj_nalloc = over_alloc_small(cj_max)(int)(1.19*(cj_max) + 8000);
2409 nbnxn_realloc_void((void **)&nbl->cj,
2410 nbl->ncj*sizeof(*nbl->cj),
2411 nbl->cj_nalloc*sizeof(*nbl->cj),
2412 nbl->alloc, nbl->free);
2413 }
2414}
2415
2416/* Ensures there is enough space for ncell extra j-subcells in the list */
2417static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
2418 int nsupercell)
2419{
2420 int ncj4_max, j4, j, w, t;
2421
2422#define NWARP2 2
2423#define WARP_SIZE32 32
2424
2425 /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
2426 /* We can store 4 j-subcell - i-supercell pairs in one struct.
2427 * Since we round down, we need one extra entry.
2428 */
2429 ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL(2*2*2) + NBNXN_GPU_JGROUP_SIZE4 - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG2);
2430
2431 if (ncj4_max > nbl->cj4_nalloc)
2432 {
2433 nbl->cj4_nalloc = over_alloc_small(ncj4_max)(int)(1.19*(ncj4_max) + 8000);
2434 nbnxn_realloc_void((void **)&nbl->cj4,
2435 nbl->work->cj4_init*sizeof(*nbl->cj4),
2436 nbl->cj4_nalloc*sizeof(*nbl->cj4),
2437 nbl->alloc, nbl->free);
2438 }
2439
2440 if (ncj4_max > nbl->work->cj4_init)
2441 {
2442 for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
2443 {
2444 /* No i-subcells and no excl's in the list initially */
2445 for (w = 0; w < NWARP2; w++)
2446 {
2447 nbl->cj4[j4].imei[w].imask = 0U;
2448 nbl->cj4[j4].imei[w].excl_ind = 0;
2449
2450 }
2451 }
2452 nbl->work->cj4_init = ncj4_max;
2453 }
2454}
2455
2456 /* Set all excl masks for one GPU warp to no exclusions */
2457static void set_no_excls(nbnxn_excl_t *excl)
2458{
2459 int t;
2460
2461 for (t = 0; t < WARP_SIZE32; t++)
2462 {
2463 /* Turn all interaction bits on */
2464 excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
2465 }
2466}
2467
2468/* Initializes a single nbnxn_pairlist_t data structure */
2469static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
2470 gmx_bool bSimple,
2471 nbnxn_alloc_t *alloc,
2472 nbnxn_free_t *free)
2473{
2474 if (alloc == NULL((void*)0))
2475 {
2476 nbl->alloc = nbnxn_alloc_aligned;
2477 }
2478 else
2479 {
2480 nbl->alloc = alloc;
2481 }
2482 if (free == NULL((void*)0))
2483 {
2484 nbl->free = nbnxn_free_aligned;
2485 }
2486 else
2487 {
2488 nbl->free = free;
2489 }
2490
2491 nbl->bSimple = bSimple;
2492 nbl->na_sc = 0;
2493 nbl->na_ci = 0;
2494 nbl->na_cj = 0;
2495 nbl->nci = 0;
2496 nbl->ci = NULL((void*)0);
2497 nbl->ci_nalloc = 0;
2498 nbl->ncj = 0;
2499 nbl->cj = NULL((void*)0);
2500 nbl->cj_nalloc = 0;
2501 nbl->ncj4 = 0;
2502 /* We need one element extra in sj, so alloc initially with 1 */
2503 nbl->cj4_nalloc = 0;
2504 nbl->cj4 = NULL((void*)0);
2505 nbl->nci_tot = 0;
2506
2507 if (!nbl->bSimple)
2508 {
2509 nbl->excl = NULL((void*)0);
2510 nbl->excl_nalloc = 0;
2511 nbl->nexcl = 0;
2512 check_excl_space(nbl, 1);
2513 nbl->nexcl = 1;
2514 set_no_excls(&nbl->excl[0]);
2515 }
2516
2517 snew(nbl->work, 1);
2518 if (nbl->bSimple)
2519 {
2520 snew_aligned(nbl->work->bb_ci, 1, NBNXN_SEARCH_BB_MEM_ALIGN);
2521 }
2522 else
2523 {
2524#ifdef NBNXN_BBXXXX
2525 snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_SEARCH_BB_MEM_ALIGN);
2526#else
2527 snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
2528#endif
2529 }
2530 snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_SEARCH_BB_MEM_ALIGN);
2531#ifdef GMX_NBNXN_SIMD
2532 snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
2533 snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
2534#endif
2535 snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
2536
2537 nbl->work->sort = NULL((void*)0);
2538 nbl->work->sort_nalloc = 0;
2539 nbl->work->sci_sort = NULL((void*)0);
2540 nbl->work->sci_sort_nalloc = 0;
2541}
2542
2543void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
2544 gmx_bool bSimple, gmx_bool bCombined,
2545 nbnxn_alloc_t *alloc,
2546 nbnxn_free_t *free)
2547{
2548 int i;
2549
2550 nbl_list->bSimple = bSimple;
2551 nbl_list->bCombined = bCombined;
2552
2553 nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
2554
2555 if (!nbl_list->bCombined &&
2556 nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS32)
2557 {
2558 gmx_fatal(FARGS0, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
2559 nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS32, NBNXN_BUFFERFLAG_MAX_THREADS32);
2560 }
2561
2562 snew(nbl_list->nbl, nbl_list->nnbl);
2563 snew(nbl_list->nbl_fep, nbl_list->nnbl);
2564 /* Execute in order to avoid memory interleaving between threads */
2565#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2566 for (i = 0; i < nbl_list->nnbl; i++)
2567 {
2568 /* Allocate the nblist data structure locally on each thread
2569 * to optimize memory access for NUMA architectures.
2570 */
2571 snew(nbl_list->nbl[i], 1);
2572
2573 /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2574 if (i == 0)
2575 {
2576 nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
2577 }
2578 else
2579 {
2580 nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL((void*)0), NULL((void*)0));
2581 }
2582
2583 snew(nbl_list->nbl_fep[i], 1);
2584 nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
2585 }
2586}
2587
2588/* Print statistics of a pair list, used for debug output */
2589static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
2590 const nbnxn_search_t nbs, real rl)
2591{
2592 const nbnxn_grid_t *grid;
2593 int cs[SHIFTS((2*1 +1)*(2*1 +1)*(2*2 +1))];
2594 int s, i, j;
2595 int npexcl;
2596
2597 /* This code only produces correct statistics with domain decomposition */
2598 grid = &nbs->grid[0];
2599
2600 fprintf(fp, "nbl nci %d ncj %d\n",
2601 nbl->nci, nbl->ncj);
2602 fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2603 nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
2604 nbl->ncj/(double)grid->nc*grid->na_sc,
2605 nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI3.14159265358979323846*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
2606
2607 fprintf(fp, "nbl average j cell list length %.1f\n",
2608 0.25*nbl->ncj/(double)nbl->nci);
2609
2610 for (s = 0; s < SHIFTS((2*1 +1)*(2*1 +1)*(2*2 +1)); s++)
2611 {
2612 cs[s] = 0;
2613 }
2614 npexcl = 0;
2615 for (i = 0; i < nbl->nci; i++)
2616 {
2617 cs[nbl->ci[i].shift & NBNXN_CI_SHIFT127] +=
2618 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
2619
2620 j = nbl->ci[i].cj_ind_start;
2621 while (j < nbl->ci[i].cj_ind_end &&
2622 nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
2623 {
2624 npexcl++;
2625 j++;
2626 }
2627 }
2628 fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
2629 nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
2630 for (s = 0; s < SHIFTS((2*1 +1)*(2*1 +1)*(2*2 +1)); s++)
2631 {
2632 if (cs[s] > 0)
2633 {
2634 fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
2635 }
2636 }
2637}
2638
2639 /* Print statistics of a pair list, used for debug output */
2640static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
2641 const nbnxn_search_t nbs, real rl)
2642{
2643 const nbnxn_grid_t *grid;
2644 int i, j4, j, si, b;
2645 int c[GPU_NSUBCELL(2*2*2)+1];
2646
2647 /* This code only produces correct statistics with domain decomposition */
2648 grid = &nbs->grid[0];
2649
2650 fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2651 nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
2652 fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2653 nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
2654 nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
2655 nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI3.14159265358979323846*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
2656
2657 fprintf(fp, "nbl average j super cell list length %.1f\n",
2658 0.25*nbl->ncj4/(double)nbl->nsci);
2659 fprintf(fp, "nbl average i sub cell list length %.1f\n",
2660 nbl->nci_tot/((double)nbl->ncj4));
2661
2662 for (si = 0; si <= GPU_NSUBCELL(2*2*2); si++)
2663 {
2664 c[si] = 0;
2665 }
2666 for (i = 0; i < nbl->nsci; i++)
2667 {
2668 for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
2669 {
2670 for (j = 0; j < NBNXN_GPU_JGROUP_SIZE4; j++)
2671 {
2672 b = 0;
2673 for (si = 0; si < GPU_NSUBCELL(2*2*2); si++)
2674 {
2675 if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL(2*2*2) + si)))
2676 {
2677 b++;
2678 }
2679 }
2680 c[b]++;
2681 }
2682 }
2683 }
2684 for (b = 0; b <= GPU_NSUBCELL(2*2*2); b++)
2685 {
2686 fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
2687 b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE4));
2688 }
2689}
2690
2691/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
2692static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
2693 int warp, nbnxn_excl_t **excl)
2694{
2695 if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2696 {
2697 /* No exclusions set, make a new list entry */
2698 nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
2699 nbl->nexcl++;
2700 *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2701 set_no_excls(*excl);
2702 }
2703 else
2704 {
2705 /* We already have some exclusions, new ones can be added to the list */
2706 *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2707 }
2708}
2709
2710/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
2711 * generates a new element and allocates extra memory, if necessary.
2712 */
2713static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
2714 int warp, nbnxn_excl_t **excl)
2715{
2716 if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2717 {
2718 /* We need to make a new list entry, check if we have space */
2719 check_excl_space(nbl, 1);
2720 }
2721 low_get_nbl_exclusions(nbl, cj4, warp, excl);
2722}
2723
2724/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
2725 * generates a new element and allocates extra memory, if necessary.
2726 */
2727static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
2728 nbnxn_excl_t **excl_w0,
2729 nbnxn_excl_t **excl_w1)
2730{
2731 /* Check for space we might need */
2732 check_excl_space(nbl, 2);
2733
2734 low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
2735 low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
2736}
2737
2738/* Sets the self exclusions i=j and pair exclusions i>j */
2739static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
2740 int cj4_ind, int sj_offset,
2741 int si)
2742{
2743 nbnxn_excl_t *excl[2];
2744 int ei, ej, w;
2745
2746 /* Here we only set the self and double pair exclusions */
2747
2748 get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
2749
2750 /* Only minor < major bits set */
2751 for (ej = 0; ej < nbl->na_ci; ej++)
2752 {
2753 w = (ej>>2);
2754 for (ei = ej; ei < nbl->na_ci; ei++)
2755 {
2756 excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE4-1))*nbl->na_ci + ei] &=
2757 ~(1U << (sj_offset*GPU_NSUBCELL(2*2*2) + si));
2758 }
2759 }
2760}
2761
2762/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
2763static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
2764{
2765 return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
2766}
2767
2768/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
2769static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
2770{
2771 return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
2772 (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
2773 NBNXN_INTERACTION_MASK_ALL));
2774}
2775
2776/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
2777static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
2778{
2779 return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
2780}
2781
2782/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
2783static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
2784{
2785 return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
2786 (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
2787 NBNXN_INTERACTION_MASK_ALL));
2788}
2789
2790#ifdef GMX_NBNXN_SIMD
2791#if GMX_SIMD_REAL_WIDTH4 == 2
2792#define get_imask_simd_4xnget_imask_simd_j4 get_imask_simd_j2
2793#endif
2794#if GMX_SIMD_REAL_WIDTH4 == 4
2795#define get_imask_simd_4xnget_imask_simd_j4 get_imask_simd_j4
2796#endif
2797#if GMX_SIMD_REAL_WIDTH4 == 8
2798#define get_imask_simd_4xnget_imask_simd_j4 get_imask_simd_j8
2799#define get_imask_simd_2xnn get_imask_simd_j4
2800#endif
2801#if GMX_SIMD_REAL_WIDTH4 == 16
2802#define get_imask_simd_2xnn get_imask_simd_j8
2803#endif
2804#endif
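
[Editor's note] All of these masks serve the same purpose: on the diagonal cluster pair, where the i-cluster and j-cluster cover the same atoms, only one triangle of the i x j pair matrix is kept, so each atom pair is computed once; off the diagonal the full mask is used. Assuming bit i*4+j flags pair (i,j) in a 4x4 cluster pair, a strict upper-triangle mask could be built as below (an illustrative sketch, not the source's constant):

    /* Build a 4x4 mask keeping only j > i pairs (strict upper triangle) */
    static unsigned int diag_mask_4x4(void)
    {
        unsigned int mask = 0;
        int          i, j;

        for (i = 0; i < 4; i++)
        {
            for (j = i + 1; j < 4; j++)
            {
                mask |= 1U << (i*4 + j);
            }
        }
        return mask;
    }
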
2805
2806/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2807 * Checks bounding box distances and possibly atom pair distances.
2808 */
2809static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2810 nbnxn_pairlist_t *nbl,
2811 int ci, int cjf, int cjl,
2812 gmx_bool remove_sub_diag,
2813 const real *x_j,
2814 real rl2, float rbb2,
2815 int *ndistc)
2816{
2817 const nbnxn_list_work_t *work;
2818
2819 const nbnxn_bb_t *bb_ci;
2820 const real *x_ci;
2821
2822 gmx_bool InRange;
2823 real d2;
2824 int cjf_gl, cjl_gl, cj;
2825
2826 work = nbl->work;
2827
2828 bb_ci = nbl->work->bb_ci;
2829 x_ci = nbl->work->x_ci;
2830
2831 InRange = FALSE0;
2832 while (!InRange && cjf <= cjl)
2833 {
2834 d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
2835 *ndistc += 2;
2836
2837 /* Check if the distance is within the distance where
2838 * we use only the bounding box distance rbb,
2839 * or within the cut-off and there is at least one atom pair
2840 * within the cut-off.
2841 */
2842 if (d2 < rbb2)
2843 {
2844 InRange = TRUE1;
2845 }
2846 else if (d2 < rl2)
2847 {
2848 int i, j;
2849
2850 cjf_gl = gridj->cell0 + cjf;
2851 for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE4 && !InRange; i++)
2852 {
2853 for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE4; j++)
2854 {
2855 InRange = InRange ||
2856 (sqr(x_ci[i*STRIDE_XYZ3+XX0] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+XX0]) +
2857 sqr(x_ci[i*STRIDE_XYZ3+YY1] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+YY1]) +
2858 sqr(x_ci[i*STRIDE_XYZ3+ZZ2] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+ZZ2]) < rl2);
2859 }
2860 }
2861 *ndistc += NBNXN_CPU_CLUSTER_I_SIZE4*NBNXN_CPU_CLUSTER_I_SIZE4;
2862 }
2863 if (!InRange)
2864 {
2865 cjf++;
2866 }
2867 }
2868 if (!InRange)
2869 {
2870 return;
2871 }
2872
2873 InRange = FALSE0;
2874 while (!InRange && cjl > cjf)
2875 {
2876 d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
2877 *ndistc += 2;
2878
2879 /* Check if the distance is within the distance where
2880 * we use only the bounding box distance rbb,
2881 * or within the cut-off and there is at least one atom pair
2882 * within the cut-off.
2883 */
2884 if (d2 < rbb2)
2885 {
2886 InRange = TRUE1;
2887 }
2888 else if (d2 < rl2)
2889 {
2890 int i, j;
2891
2892 cjl_gl = gridj->cell0 + cjl;
2893 for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE4 && !InRange; i++)
2894 {
2895 for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE4; j++)
2896 {
2897 InRange = InRange ||
2898 (sqr(x_ci[i*STRIDE_XYZ3+XX0] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+XX0]) +
2899 sqr(x_ci[i*STRIDE_XYZ3+YY1] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+YY1]) +
2900 sqr(x_ci[i*STRIDE_XYZ3+ZZ2] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+ZZ2]) < rl2);
2901 }
2902 }
2903 *ndistc += NBNXN_CPU_CLUSTER_I_SIZE4*NBNXN_CPU_CLUSTER_I_SIZE4;
2904 }
2905 if (!InRange)
2906 {
2907 cjl--;
2908 }
2909 }
2910
2911 if (cjf <= cjl)
2912 {
2913 for (cj = cjf; cj <= cjl; cj++)
2914 {
2915 /* Store cj and the interaction mask */
2916 nbl->cj[nbl->ncj].cj = gridj->cell0 + cj;
2917 nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
2918 nbl->ncj++;
2919 }
2920 /* Increase the closing index in i super-cell list */
2921 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
2922 }
2923}
2924
2925#ifdef GMX_NBNXN_SIMD_4XN
2926#include "nbnxn_search_simd_4xn.h"
2927#endif
2928#ifdef GMX_NBNXN_SIMD_2XNN
2929#include "nbnxn_search_simd_2xnn.h"
2930#endif
2931
2932/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
2933 * Checks bounding box distances and possibly atom pair distances.
2934 */
2935static void make_cluster_list_supersub(const nbnxn_grid_t *gridi,
2936 const nbnxn_grid_t *gridj,
2937 nbnxn_pairlist_t *nbl,
2938 int sci, int scj,
2939 gmx_bool sci_equals_scj,
2940 int stride, const real *x,
2941 real rl2, float rbb2,
2942 int *ndistc)
2943{
2944 int na_c;
2945 int npair;
2946 int cjo, ci1, ci, cj, cj_gl;
2947 int cj4_ind, cj_offset;
2948 unsigned int imask;
2949 nbnxn_cj4_t *cj4;
2950#ifdef NBNXN_BBXXXX
2951 const float *pbb_ci;
2952#else
2953 const nbnxn_bb_t *bb_ci;
2954#endif
2955 const real *x_ci;
2956 float *d2l, d2;
2957 int w;
2958#define PRUNE_LIST_CPU_ONE
2959#ifdef PRUNE_LIST_CPU_ONE
2960 int ci_last = -1;
2961#endif
2962
2963 d2l = nbl->work->d2;
2964
2965#ifdef NBNXN_BBXXXX
2966 pbb_ci = nbl->work->pbb_ci;
2967#else
2968 bb_ci = nbl->work->bb_ci;
2969#endif
2970 x_ci = nbl->work->x_ci;
2971
2972 na_c = gridj->na_c;
2973
2974 for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
2975 {
2976 cj4_ind = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG2);
2977 cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE4;
2978 cj4 = &nbl->cj4[cj4_ind];
2979
2980 cj = scj*GPU_NSUBCELL(2*2*2) + cjo;
2981
2982 cj_gl = gridj->cell0*GPU_NSUBCELL(2*2*2) + cj;
2983
2984 /* Initialize this j-subcell i-subcell list */
2985 cj4->cj[cj_offset] = cj_gl;
2986 imask = 0;
2987
2988 if (sci_equals_scj)
2989 {
2990 ci1 = cjo + 1;
2991 }
2992 else
2993 {
2994 ci1 = gridi->nsubc[sci];
2995 }
2996
2997#ifdef NBNXN_BBXXXX
2998 /* Determine all ci1 bb distances in one call with SIMD4 */
2999 subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG2)*NNBSBB_XXXX(2*3*4)+(cj & (STRIDE_PBB4-1)),
3000 ci1, pbb_ci, d2l);
3001 *ndistc += na_c*2;
3002#endif
3003
3004 npair = 0;
3005 /* We use a fixed upper-bound instead of ci1 to help optimization */
3006 for (ci = 0; ci < GPU_NSUBCELL(2*2*2); ci++)
3007 {
3008 if (ci == ci1)
3009 {
3010 break;
3011 }
3012
3013#ifndef NBNXN_BBXXXX
3014 /* Determine the bb distance between ci and cj */
3015 d2l[ci] = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
3016 *ndistc += 2;
3017#endif
3018 d2 = d2l[ci];
3019
3020#ifdef PRUNE_LIST_CPU_ALL
3021 /* Check if the distance is within the distance where
3022 * we use only the bounding box distance rbb,
3023 * or within the cut-off and there is at least one atom pair
3024 * within the cut-off. This check is very costly.
3025 */
3026 *ndistc += na_c*na_c;
3027 if (d2 < rbb2 ||
3028 (d2 < rl2 &&
3029#ifdef NBNXN_PBB_SIMD4
3030 subc_in_range_simd4
3031#else
3032 subc_in_range_x
3033#endif
3034 (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
3035#else
3036 /* Check if the distance between the two bounding boxes
3037 * is within the pair-list cut-off.
3038 */
3039 if (d2 < rl2)
3040#endif
3041 {
3042 /* Flag this i-subcell to be taken into account */
3043 imask |= (1U << (cj_offset*GPU_NSUBCELL(2*2*2)+ci));
3044
3045#ifdef PRUNE_LIST_CPU_ONE
3046 ci_last = ci;
3047#endif
3048
3049 npair++;
3050 }
3051 }
3052
3053#ifdef PRUNE_LIST_CPU_ONE
3054 /* If we only found 1 pair, check if any atoms are actually
3055 * within the cut-off, so we could get rid of it.
3056 */
3057 if (npair == 1 && d2l[ci_last] >= rbb2)
3058 {
3059 /* Avoid using function pointers here, as it's slower */
3060 if (
3061#ifdef NBNXN_PBB_SIMD4
3062 !subc_in_range_simd4
3063#else
3064 !subc_in_range_x
3065#endif
3066 (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
3067 {
3068 imask &= ~(1U << (cj_offset*GPU_NSUBCELL(2*2*2)+ci_last));
3069 npair--;
3070 }
3071 }
3072#endif
3073
3074 if (npair > 0)
3075 {
3076 /* We have a useful sj entry, close it now */
3077
3078 /* Set the exclusions for the ci == sj entry.
3079 * Here we don't bother to check if this entry is actually flagged,
3080 * as it will nearly always be in the list.
3081 */
3082 if (sci_equals_scj)
3083 {
3084 set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
3085 }
3086
3087 /* Copy the cluster interaction mask to the list */
3088 for (w = 0; w < NWARP2; w++)
3089 {
3090 cj4->imei[w].imask |= imask;
3091 }
3092
3093 nbl->work->cj_ind++;
3094
3095 /* Keep the count */
3096 nbl->nci_tot += npair;
3097
3098 /* Increase the closing index in i super-cell list */
3099 nbl->sci[nbl->nsci].cj4_ind_end =
3100 ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE4-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG2);
3101 }
3102 }
3103}
3104
3105/* Set all atom-pair exclusions from the topology stored in excl
3106 * as masks in the pair-list for simple list i-entry nbl_ci
3107 */
3108static void set_ci_top_excls(const nbnxn_search_t nbs,
3109 nbnxn_pairlist_t *nbl,
3110 gmx_bool diagRemoved,
3111 int na_ci_2log,
3112 int na_cj_2log,
3113 const nbnxn_ci_t *nbl_ci,
3114 const t_blocka *excl)
3115{
3116 const int *cell;
3117 int ci;
3118 int cj_ind_first, cj_ind_last;
3119 int cj_first, cj_last;
3120 int ndirect;
3121 int i, ai, aj, si, eind, ge, se;
3122 int found, cj_ind_0, cj_ind_1, cj_ind_m;
3123 int cj_m;
3124 gmx_bool Found_si;
3125 int si_ind;
3126 nbnxn_excl_t *nbl_excl;
3127 int inner_i, inner_e;
3128
3129 cell = nbs->cell;
3130
3131 if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
3132 {
3133 /* Empty list */
3134 return;
3135 }
3136
3137 ci = nbl_ci->ci;
3138
3139 cj_ind_first = nbl_ci->cj_ind_start;
3140 cj_ind_last = nbl->ncj - 1;
3141
3142 cj_first = nbl->cj[cj_ind_first].cj;
3143 cj_last = nbl->cj[cj_ind_last].cj;
3144
3145 /* Determine how many contiguous j-cells we have starting
3146 * from the first i-cell. This number can be used to directly
3147 * calculate j-cell indices for excluded atoms.
3148 */
3149 ndirect = 0;
3150 if (na_ci_2log == na_cj_2log)
3151 {
3152 while (cj_ind_first + ndirect <= cj_ind_last &&
3153 nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
3154 {
3155 ndirect++;
3156 }
3157 }
3158#ifdef NBNXN_SEARCH_BB_SIMD4
3159 else
3160 {
3161 while (cj_ind_first + ndirect <= cj_ind_last &&
3162 nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
3163 {
3164 ndirect++;
3165 }
3166 }
3167#endif
3168
3169 /* Loop over the atoms in the i super-cell */
3170 for (i = 0; i < nbl->na_sc; i++)
3171 {
3172 ai = nbs->a[ci*nbl->na_sc+i];
3173 if (ai >= 0)
3174 {
3175 si = (i>>na_ci_2log);
3176
3177 /* Loop over the topology-based exclusions for this i-atom */
3178 for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
3179 {
3180 aj = excl->a[eind];
3181
3182 if (aj == ai)
3183 {
3184 /* The self exclusion are already set, save some time */
3185 continue;
3186 }
3187
3188 ge = cell[aj];
3189
3190 /* Without shifts we only calculate interactions j>i
3191 * for one-way pair-lists.
3192 */
3193 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3194 {
3195 continue;
3196 }
3197
3198 se = (ge >> na_cj_2log);
3199
3200 /* Could the cluster se be in our list? */
3201 if (se >= cj_first && se <= cj_last)
3202 {
3203 if (se < cj_first + ndirect)
3204 {
3205 /* We can calculate cj_ind directly from se */
3206 found = cj_ind_first + se - cj_first;
3207 }
3208 else
3209 {
3210 /* Search for se using bisection */
3211 found = -1;
3212 cj_ind_0 = cj_ind_first + ndirect;
3213 cj_ind_1 = cj_ind_last + 1;
3214 while (found == -1 && cj_ind_0 < cj_ind_1)
3215 {
3216 cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3217
3218 cj_m = nbl->cj[cj_ind_m].cj;
3219
3220 if (se == cj_m)
3221 {
3222 found = cj_ind_m;
3223 }
3224 else if (se < cj_m)
3225 {
3226 cj_ind_1 = cj_ind_m;
3227 }
3228 else
3229 {
3230 cj_ind_0 = cj_ind_m + 1;
3231 }
3232 }
3233 }
3234
3235 if (found >= 0)
3236 {
3237 inner_i = i - (si << na_ci_2log);
3238 inner_e = ge - (se << na_cj_2log);
3239
3240 nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
3241/* The next code line is usually not needed. We do not want to version
3242 * away the above line, because there is logic that relies on being
3243 * able to detect easily whether any exclusions exist. */
3244#if (defined GMX_SIMD_IBM_QPX)
3245 nbl->cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e);
3246#endif
3247 }
3248 }
3249 }
3250 }
3251 }
3252}
3253
3254/* Add a new i-entry to the FEP list and copy the i-properties */
3255static gmx_inlineinline void fep_list_new_nri_copy(t_nblist *nlist)
3256{
3257 /* Add a new i-entry */
3258 nlist->nri++;
3259
3260 assert(nlist->nri < nlist->maxnri)((void) (0));
3261
3262 /* Duplicate the last i-entry, except for jindex, which continues */
3263 nlist->iinr[nlist->nri] = nlist->iinr[nlist->nri-1];
3264 nlist->shift[nlist->nri] = nlist->shift[nlist->nri-1];
3265 nlist->gid[nlist->nri] = nlist->gid[nlist->nri-1];
3266 nlist->jindex[nlist->nri] = nlist->nrj;
3267}
3268
3269/* For load balancing of the free-energy lists over threads, we set
3270 * the maximum nrj size of an i-entry to 40. This leads to good
3271 * load balancing in the worst case scenario of a single perturbed
3272 * particle on 16 threads, while not introducing significant overhead.
3273 * Note that half of the perturbed pairs will anyhow end up in very small lists,
3274 * since non-perturbed i-particles will see few perturbed j-particles.
3275 */
3276const int max_nrj_fep = 40;
3277
3278/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
3279 * singularities for overlapping particles (0/0), since the charges and
3280 * LJ parameters have been zeroed in the nbnxn data structure.
3281 * Simultaneously make a group pair list for the perturbed pairs.
3282 */
3283static void make_fep_list(const nbnxn_search_t nbs,
3284 const nbnxn_atomdata_t *nbat,
3285 nbnxn_pairlist_t *nbl,
3286 gmx_bool bDiagRemoved,
3287 nbnxn_ci_t *nbl_ci,
3288 const nbnxn_grid_t *gridi,
3289 const nbnxn_grid_t *gridj,
3290 t_nblist *nlist)
3291{
3292 int ci, cj_ind_start, cj_ind_end, cj_ind, cja, cjr;
3293 int nri_max;
3294 int ngid, gid_i = 0, gid_j, gid;
3295 int egp_shift, egp_mask;
3296 int gid_cj = 0;
3297 int i, j, ind_i, ind_j, ai, aj;
3298 int nri;
3299 gmx_bool bFEP_i, bFEP_i_all;
3300
3301 if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
3302 {
3303 /* Empty list */
3304 return;
3305 }
3306
3307 ci = nbl_ci->ci;
3308
3309 cj_ind_start = nbl_ci->cj_ind_start;
3310 cj_ind_end = nbl_ci->cj_ind_end;
3311
3312 /* In worst case we have alternating energy groups and create npair lists */
3313 nri_max = nbl->na_ci*(cj_ind_end - cj_ind_start);
3314 if (nlist->nri + nri_max > nlist->maxnri)
3315 {
3316 nlist->maxnri = over_alloc_large(nlist->nri + nri_max)(int)(1.19*(nlist->nri + nri_max) + 1000);
3317 reallocate_nblist(nlist);
3318 }
3319
3320 ngid = nbat->nenergrp;
3321
3322 if (ngid*gridj->na_cj > sizeof(gid_cj)*8)
3323 {
3324 gmx_fatal(FARGS0, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %d energy groups",
3325 gridi->na_c, gridj->na_cj, (sizeof(gid_cj)*8)/gridj->na_cj);
3326 }
3327
3328 egp_shift = nbat->neg_2log;
3329 egp_mask = (1<<nbat->neg_2log) - 1;
3330
3331 /* Loop over the atoms in the i sub-cell */
3332 bFEP_i_all = TRUE1;
3333 for (i = 0; i < nbl->na_ci; i++)
3334 {
3335 ind_i = ci*nbl->na_ci + i;
3336 ai = nbs->a[ind_i];
3337 if (ai >= 0)
3338 {
3339 nri = nlist->nri;
3340 nlist->jindex[nri+1] = nlist->jindex[nri];
3341 nlist->iinr[nri] = ai;
3342 /* The actual energy group pair index is set later */
3343 nlist->gid[nri] = 0;
3344 nlist->shift[nri] = nbl_ci->shift & NBNXN_CI_SHIFT127;
3345
3346 bFEP_i = gridi->fep[ci - gridi->cell0] & (1 << i);
3347
3348 bFEP_i_all = bFEP_i_all && bFEP_i;
3349
3350 if ((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
3351 {
3352 nlist->maxnrj = over_alloc_small((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj);
3353 srenew(nlist->jjnr, nlist->maxnrj);
3354 srenew(nlist->excl_fep, nlist->maxnrj);
3355 }
3356
3357 if (ngid > 1)
3358 {
3359 gid_i = (nbat->energrp[ci] >> (egp_shift*i)) & egp_mask;
3360 }
3361
3362 for (cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
3363 {
3364 unsigned int fep_cj;
3365
3366 cja = nbl->cj[cj_ind].cj;
3367
3368 if (gridj->na_cj == gridj->na_c)
3369 {
3370 cjr = cja - gridj->cell0;
3371 fep_cj = gridj->fep[cjr];
3372 if (ngid > 1)
3373 {
3374 gid_cj = nbat->energrp[cja];
3375 }
3376 }
3377 else if (2*gridj->na_cj == gridj->na_c)
3378 {
3379 cjr = cja - gridj->cell0*2;
3380 /* Extract half of the ci fep/energrp mask */
3381 fep_cj = (gridj->fep[cjr>>1] >> ((cjr&1)*gridj->na_cj)) & ((1<<gridj->na_cj) - 1);
3382 if (ngid > 1)
3383 {
3384 gid_cj = nbat->energrp[cja>>1] >> ((cja&1)*gridj->na_cj*egp_shift) & ((1<<(gridj->na_cj*egp_shift)) - 1);
3385 }
3386 }
3387 else
3388 {
3389 cjr = cja - (gridj->cell0>>1);
3390 /* Combine two ci fep masks/energrp */
3391 fep_cj = gridj->fep[cjr*2] + (gridj->fep[cjr*2+1] << gridj->na_c);
3392 if (ngid > 1)
3393 {
3394 gid_cj = nbat->energrp[cja*2] + (nbat->energrp[cja*2+1] << (gridj->na_c*egp_shift));
3395 }
3396 }
3397
3398 if (bFEP_i || fep_cj != 0)
3399 {
3400 for (j = 0; j < nbl->na_cj; j++)
3401 {
3402 /* Is this interaction perturbed and not excluded? */
3403 ind_j = cja*nbl->na_cj + j;
3404 aj = nbs->a[ind_j];
3405 if (aj >= 0 &&
3406 (bFEP_i || (fep_cj & (1 << j))) &&
3407 (!bDiagRemoved || ind_j >= ind_i))
3408 {
3409 if (ngid > 1)
3410 {
3411 gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
3412 gid = GID(gid_i, gid_j, ngid);
3413
3414 if (nlist->nrj > nlist->jindex[nri] &&
3415 nlist->gid[nri] != gid)
3416 {
3417 /* Energy group pair changed: new list */
3418 fep_list_new_nri_copy(nlist);
3419 nri = nlist->nri;
3420 }
3421 nlist->gid[nri] = gid;
3422 }
3423
3424 if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
3425 {
3426 fep_list_new_nri_copy(nlist);
3427 nri = nlist->nri;
3428 }
3429
3430 /* Add it to the FEP list */
3431 nlist->jjnr[nlist->nrj] = aj;
3432 nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
3433 nlist->nrj++;
3434
3435 /* Exclude it from the normal list.
3436 * Note that the charge has been set to zero,
3437 * but we need to avoid 0/0, as perturbed atoms
3438 * can be on top of each other.
3439 * (and the LJ parameters have not been zeroed)
3440 */
3441 nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
3442 }
3443 }
3444 }
3445 }
3446
3447 if (nlist->nrj > nlist->jindex[nri])
3448 {
3449 nlist->nri++;
3450 nlist->jindex[nlist->nri] = nlist->nrj;
3451 }
3452 }
3453 }
3454
3455 if (bFEP_i_all)
3456 {
3457 /* All interactions are perturbed, we can skip this entry */
3458 nbl_ci->cj_ind_end = cj_ind_start;
3459 }
3460}
3461
3462 /* Return the index of a j-cluster within its cj4 group */
3463 static gmx_inline int cj_mod_cj4(int cj)
3464 {
3465 return cj & (NBNXN_GPU_JGROUP_SIZE - 1);
3466}
3467
3468/* Convert a j-cluster to a cj4 group */
3469 static gmx_inline int cj_to_cj4(int cj)
3470 {
3471 return cj >> NBNXN_GPU_JGROUP_SIZE_2LOG;
3472}
3473
3474 /* Return the index of a j-atom within a warp */
3475 static gmx_inline int a_mod_wj(int a)
3476 {
3477 return a & (NBNXN_GPU_CLUSTER_SIZE/2 - 1);
3478}
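
Since NBNXN_GPU_JGROUP_SIZE expands to 4 (2log 2) and NBNXN_GPU_CLUSTER_SIZE to 8 in this build, the three helpers above are simple shift/mask operations. A standalone sketch of what they compute; the macro stand-ins below are illustrative copies, not the GROMACS headers:

#include <stdio.h>

#define JGROUP_SIZE      4   /* j-clusters per cj4 group, as expanded above */
#define JGROUP_SIZE_2LOG 2
#define CLUSTER_SIZE     8   /* atoms per GPU cluster */

int main(void)
{
    int cj;
    for (cj = 0; cj < 8; cj++)
    {
        /* cj_to_cj4 and cj_mod_cj4: the cj4 group index and the slot within it */
        printf("cj %d -> cj4 %d, slot %d\n",
               cj, cj >> JGROUP_SIZE_2LOG, cj & (JGROUP_SIZE - 1));
    }
    /* a_mod_wj: j-atom index within a half-cluster (warp) */
    printf("atom 5 -> within-warp index %d\n", 5 & (CLUSTER_SIZE/2 - 1));
    return 0;
}
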
3479
3480/* As make_fep_list above, but for super/sub lists. */
3481static void make_fep_list_supersub(const nbnxn_search_t nbs,
3482 const nbnxn_atomdata_t *nbat,
3483 nbnxn_pairlist_t *nbl,
3484 gmx_bool bDiagRemoved,
3485 const nbnxn_sci_t *nbl_sci,
3486 real shx,
3487 real shy,
3488 real shz,
3489 real rlist_fep2,
3490 const nbnxn_grid_t *gridi,
3491 const nbnxn_grid_t *gridj,
3492 t_nblist *nlist)
3493{
3494 int sci, cj4_ind_start, cj4_ind_end, cj4_ind, gcj, cjr;
3495 int nri_max;
3496 int c, c_abs;
3497 int i, j, ind_i, ind_j, ai, aj;
3498 int nri;
3499 gmx_bool bFEP_i;
3500 real xi, yi, zi;
3501 const nbnxn_cj4_t *cj4;
3502
3503 if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3504 {
3505 /* Empty list */
3506 return;
3507 }
3508
3509 sci = nbl_sci->sci;
3510
3511 cj4_ind_start = nbl_sci->cj4_ind_start;
3512 cj4_ind_end = nbl_sci->cj4_ind_end;
3513
3514 /* No energy groups (yet), so we split lists in max_nrj_fep pairs */
3515 nri_max = nbl->na_sc*(1 + ((cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE)/max_nrj_fep);
3516 if (nlist->nri + nri_max > nlist->maxnri)
3517 {
3518 nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
3519 reallocate_nblist(nlist);
3520 }
3521
3522 /* Loop over the atoms in the i super-cluster */
3523 for (c = 0; c < GPU_NSUBCELL; c++)
3524 {
3525 c_abs = sci*GPU_NSUBCELL + c;
3526
3527 for (i = 0; i < nbl->na_ci; i++)
3528 {
3529 ind_i = c_abs*nbl->na_ci + i;
3530 ai = nbs->a[ind_i];
3531 if (ai >= 0)
3532 {
3533 nri = nlist->nri;
3534 nlist->jindex[nri+1] = nlist->jindex[nri];
3535 nlist->iinr[nri] = ai;
3536 /* With GPUs, energy groups are not supported */
3537 nlist->gid[nri] = 0;
3538 nlist->shift[nri] = nbl_sci->shift & NBNXN_CI_SHIFT;
3539
3540 bFEP_i = (gridi->fep[c_abs - gridi->cell0] & (1 << i));
3541
3542 xi = nbat->x[ind_i*nbat->xstride+XX] + shx;
3543 yi = nbat->x[ind_i*nbat->xstride+YY] + shy;
3544 zi = nbat->x[ind_i*nbat->xstride+ZZ] + shz;
3545
3546 if ((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj > nlist->maxnrj)
3547 {
3548 nlist->maxnrj = over_alloc_small((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj);
3549 srenew(nlist->jjnr, nlist->maxnrj);
3550 srenew(nlist->excl_fep, nlist->maxnrj);
3551 }
3552
3553 for (cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
3554 {
3555 cj4 = &nbl->cj4[cj4_ind];
3556
3557 for (gcj = 0; gcj < NBNXN_GPU_JGROUP_SIZE; gcj++)
3558 {
3559 unsigned int fep_cj;
3560
3561 if ((cj4->imei[0].imask & (1U << (gcj*GPU_NSUBCELL + c))) == 0)
3562 {
3563 /* Skip this ci for this cj */
3564 continue;
3565 }
3566
3567 cjr = cj4->cj[gcj] - gridj->cell0*GPU_NSUBCELL;
3568
3569 fep_cj = gridj->fep[cjr];
3570
3571 if (bFEP_i || fep_cj != 0)
3572 {
3573 for (j = 0; j < nbl->na_cj; j++)
3574 {
3575 /* Is this interaction perturbed and not excluded? */
3576 ind_j = (gridj->cell0*GPU_NSUBCELL + cjr)*nbl->na_cj + j;
3577 aj = nbs->a[ind_j];
3578 if (aj >= 0 &&
3579 (bFEP_i || (fep_cj & (1 << j))) &&
3580 (!bDiagRemoved || ind_j >= ind_i))
3581 {
3582 nbnxn_excl_t *excl;
3583 int excl_pair;
3584 unsigned int excl_bit;
3585 real dx, dy, dz;
3586
3587 get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl);
3588
3589 excl_pair = a_mod_wj(j)*nbl->na_ci + i;
3590 excl_bit = (1U << (gcj*GPU_NSUBCELL + c));
3591
3592 dx = nbat->x[ind_j*nbat->xstride+XX] - xi;
3593 dy = nbat->x[ind_j*nbat->xstride+YY] - yi;
3594 dz = nbat->x[ind_j*nbat->xstride+ZZ] - zi;
3595
3596 /* The unpruned GPU list has more than 2/3
3597 * of the atom pairs beyond rlist. Using
3598 * this list will cause a lot of overhead
3599 * in the CPU FEP kernels, especially
3600 * relative to the fast GPU kernels.
3601 * So we prune the FEP list here.
3602 */
3603 if (dx*dx + dy*dy + dz*dz < rlist_fep2)
3604 {
3605 if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
3606 {
3607 fep_list_new_nri_copy(nlist);
3608 nri = nlist->nri;
3609 }
3610
3611 /* Add it to the FEP list */
3612 nlist->jjnr[nlist->nrj] = aj;
3613 nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0;
3614 nlist->nrj++;
3615 }
3616
3617 /* Exclude it from the normal list.
3618 * Note that the charge and LJ parameters have
3619 * been set to zero, but we need to avoid 0/0,
3620 * as perturbed atoms can be on top of each other.
3621 */
3622 excl->pair[excl_pair] &= ~excl_bit;
3623 }
3624 }
3625
3626 /* Note that we could mask out this pair in imask
3627 * if all i- and/or all j-particles are perturbed.
3628 * But since the perturbed pairs on the CPU will
3629 * take an order of magnitude more time, the GPU
3630 * will finish before the CPU and there is no gain.
3631 */
3632 }
3633 }
3634 }
3635
3636 if (nlist->nrj > nlist->jindex[nri])
3637 {
3638 nlist->nri++;
3639 nlist->jindex[nlist->nri] = nlist->nrj;
3640 }
3641 }
3642 }
3643 }
3644}
3645
3646/* Set all atom-pair exclusions from the topology stored in excl
3647 * as masks in the pair-list for i-super-cell entry nbl_sci
3648 */
3649static void set_sci_top_excls(const nbnxn_search_t nbs,
3650 nbnxn_pairlist_t *nbl,
3651 gmx_bool diagRemoved,
3652 int na_c_2log,
3653 const nbnxn_sci_t *nbl_sci,
3654 const t_blocka *excl)
3655{
3656 const int *cell;
3657 int na_c;
3658 int sci;
3659 int cj_ind_first, cj_ind_last;
3660 int cj_first, cj_last;
3661 int ndirect;
3662 int i, ai, aj, si, eind, ge, se;
3663 int found, cj_ind_0, cj_ind_1, cj_ind_m;
3664 int cj_m;
3665 gmx_bool Found_si;
3666 int si_ind;
3667 nbnxn_excl_t *nbl_excl;
3668 int inner_i, inner_e, w;
3669
3670 cell = nbs->cell;
3671
3672 na_c = nbl->na_ci;
3673
3674 if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3675 {
3676 /* Empty list */
3677 return;
3678 }
3679
3680 sci = nbl_sci->sci;
3681
3682 cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3683 cj_ind_last = nbl->work->cj_ind - 1;
3684
3685 cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3686 cj_last = nbl_cj(nbl, cj_ind_last);
3687
3688 /* Determine how many contiguous j-clusters we have starting
3689 * from the first i-cluster. This number can be used to directly
3690 * calculate j-cluster indices for excluded atoms.
3691 */
3692 ndirect = 0;
3693 while (cj_ind_first + ndirect <= cj_ind_last &&
3694 nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3695 {
3696 ndirect++;
3697 }
3698
3699 /* Loop over the atoms in the i super-cell */
3700 for (i = 0; i < nbl->na_sc; i++)
3701 {
3702 ai = nbs->a[sci*nbl->na_sc+i];
3703 if (ai >= 0)
3704 {
3705 si = (i>>na_c_2log);
3706
3707 /* Loop over the topology-based exclusions for this i-atom */
3708 for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
3709 {
3710 aj = excl->a[eind];
3711
3712 if (aj == ai)
3713 {
3714 /* The self exclusions are already set, so we can save some time */
3715 continue;
3716 }
3717
3718 ge = cell[aj];
3719
3720 /* Without shifts we only calculate interactions j>i
3721 * for one-way pair-lists.
3722 */
3723 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3724 {
3725 continue;
3726 }
3727
3728 se = ge>>na_c_2log;
3729 /* Could the cluster se be in our list? */
3730 if (se >= cj_first && se <= cj_last)
3731 {
3732 if (se < cj_first + ndirect)
3733 {
3734 /* We can calculate cj_ind directly from se */
3735 found = cj_ind_first + se - cj_first;
3736 }
3737 else
3738 {
3739 /* Search for se using bisection */
3740 found = -1;
3741 cj_ind_0 = cj_ind_first + ndirect;
3742 cj_ind_1 = cj_ind_last + 1;
3743 while (found == -1 && cj_ind_0 < cj_ind_1)
3744 {
3745 cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3746
3747 cj_m = nbl_cj(nbl, cj_ind_m);
3748
3749 if (se == cj_m)
3750 {
3751 found = cj_ind_m;
3752 }
3753 else if (se < cj_m)
3754 {
3755 cj_ind_1 = cj_ind_m;
3756 }
3757 else
3758 {
3759 cj_ind_0 = cj_ind_m + 1;
3760 }
3761 }
3762 }
3763
3764 if (found >= 0)
3765 {
3766 inner_i = i - si*na_c;
3767 inner_e = ge - se*na_c;
3768
3769 if (nbl_imask0(nbl, found) & (1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si)))
3770 {
3771 w = (inner_e >> 2);
3772
3773 get_nbl_exclusions_1(nbl, cj_to_cj4(found), w, &nbl_excl);
3774
3775 nbl_excl->pair[a_mod_wj(inner_e)*nbl->na_ci+inner_i] &=
3776 ~(1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si));
3777 }
3778 }
3779 }
3780 }
3781 }
3782 }
3783}
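
The exclusion lookup above has two paths: the first ndirect j-clusters are contiguous, so their list index follows directly from the cluster index, and the rest are found by bisection over the sorted tail of the list. A minimal sketch of the same pattern on a plain int array (function and variable names here are hypothetical):

#include <stdio.h>

/* Find the position of value se in cj[], where the first ndirect
 * entries are contiguous starting at cj[0] and the rest are sorted.
 * Returns -1 when se is not present. */
static int find_cluster(const int *cj, int n, int ndirect, int se)
{
    int lo, hi;

    if (ndirect > 0 && se >= cj[0] && se < cj[0] + ndirect)
    {
        return se - cj[0];          /* direct index, no search needed */
    }
    lo = ndirect;
    hi = n;
    while (lo < hi)                 /* bisection on the sorted tail */
    {
        int mid = (lo + hi) >> 1;
        if (cj[mid] == se)
        {
            return mid;
        }
        else if (se < cj[mid])
        {
            hi = mid;
        }
        else
        {
            lo = mid + 1;
        }
    }
    return -1;
}

int main(void)
{
    int cj[] = { 10, 11, 12, 20, 25, 31 };   /* 3 contiguous, then sorted */
    printf("%d %d %d\n",
           find_cluster(cj, 6, 3, 11),       /* 1: direct path */
           find_cluster(cj, 6, 3, 25),       /* 4: bisection */
           find_cluster(cj, 6, 3, 13));      /* -1: absent */
    return 0;
}
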
3784
3785/* Reallocate the simple ci list for at least n entries */
3786static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
3787{
3788 nbl->ci_nalloc = over_alloc_small(n);
3789 nbnxn_realloc_void((void **)&nbl->ci,
3790 nbl->nci*sizeof(*nbl->ci),
3791 nbl->ci_nalloc*sizeof(*nbl->ci),
3792 nbl->alloc, nbl->free);
3793}
3794
3795/* Reallocate the super-cell sci list for at least n entries */
3796static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
3797{
3798 nbl->sci_nalloc = over_alloc_small(n);
3799 nbnxn_realloc_void((void **)&nbl->sci,
3800 nbl->nsci*sizeof(*nbl->sci),
3801 nbl->sci_nalloc*sizeof(*nbl->sci),
3802 nbl->alloc, nbl->free);
3803}
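
Both reallocators use over_alloc_small, which the expansions in this report show as reserving (int)(1.19*n + 8000) elements (over_alloc_large uses 1.19*n + 1000), so repeated growth stays amortized. A small sketch of that rule with a plain realloc'ed buffer, not the GROMACS allocator:

#include <stdio.h>
#include <stdlib.h>

/* Grow buf to hold at least n ints, over-allocating by the same
 * 1.19*n + 8000 rule shown for over_alloc_small above. */
static int *grow(int *buf, int n, int *nalloc)
{
    if (n > *nalloc)
    {
        *nalloc = (int)(1.19*n + 8000);
        buf     = realloc(buf, *nalloc*sizeof(*buf));
        if (buf == NULL)
        {
            perror("realloc");
            exit(1);
        }
    }
    return buf;
}

int main(void)
{
    int  nalloc = 0;
    int *buf    = grow(NULL, 10000, &nalloc);
    printf("requested 10000, allocated %d\n", nalloc);   /* prints 19900 */
    free(buf);
    return 0;
}
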
3804
3805/* Make a new ci entry at index nbl->nci */
3806static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags)
3807{
3808 if (nbl->nci + 1 > nbl->ci_nalloc)
3809 {
3810 nb_realloc_ci(nbl, nbl->nci+1);
3811 }
3812 nbl->ci[nbl->nci].ci = ci;
3813 nbl->ci[nbl->nci].shift = shift;
3814 /* Store the interaction flags along with the shift */
3815 nbl->ci[nbl->nci].shift |= flags;
3816 nbl->ci[nbl->nci].cj_ind_start = nbl->ncj;
3817 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
3818}
3819
3820/* Make a new sci entry at index nbl->nsci */
3821static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift)
3822{
3823 if (nbl->nsci + 1 > nbl->sci_nalloc)
3824 {
3825 nb_realloc_sci(nbl, nbl->nsci+1);
3826 }
3827 nbl->sci[nbl->nsci].sci = sci;
3828 nbl->sci[nbl->nsci].shift = shift;
3829 nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3830 nbl->sci[nbl->nsci].cj4_ind_end = nbl->ncj4;
3831}
3832
3833/* Sort the simple j-list cj on exclusions.
3834 * Entries with exclusions will all be sorted to the beginning of the list.
3835 */
3836static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
3837 nbnxn_list_work_t *work)
3838{
3839 int jnew, j;
3840
3841 if (ncj > work->cj_nalloc)
3842 {
3843 work->cj_nalloc = over_alloc_large(ncj);
3844 srenew(work->cj, work->cj_nalloc);
3845 }
3846
3847 /* Make a list of the j-cells involving exclusions */
3848 jnew = 0;
3849 for (j = 0; j < ncj; j++)
3850 {
3851 if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
3852 {
3853 work->cj[jnew++] = cj[j];
3854 }
3855 }
3856 /* Skip the reordering if there are no exclusions, or if only the first entry has them */
3857 if (!((jnew == 0) ||
3858 (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
3859 {
3860 for (j = 0; j < ncj; j++)
3861 {
3862 if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
3863 {
3864 work->cj[jnew++] = cj[j];
3865 }
3866 }
3867 for (j = 0; j < ncj; j++)
3868 {
3869 cj[j] = work->cj[j];
3870 }
3871 }
3872}
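
The reordering above is a stable two-pass partition: entries with exclusions are copied to a scratch buffer first, the fully-interacting ones after, preserving order within each class. A generic sketch of the pattern on an int array (predicate and names illustrative):

#include <stdio.h>
#include <string.h>

/* Stable-partition a[0..n) so entries satisfying pred come first,
 * using a scratch buffer, as sort_cj_excl does with work->cj. */
static void stable_partition(int *a, int n, int *scratch, int (*pred)(int))
{
    int i, k = 0;

    for (i = 0; i < n; i++)        /* first pass: matching entries */
    {
        if (pred(a[i]))
        {
            scratch[k++] = a[i];
        }
    }
    for (i = 0; i < n; i++)        /* second pass: the remainder */
    {
        if (!pred(a[i]))
        {
            scratch[k++] = a[i];
        }
    }
    memcpy(a, scratch, n*sizeof(*a));
}

static int is_odd(int x) { return x & 1; }

int main(void)
{
    int a[] = { 4, 1, 8, 3, 6, 5 }, s[6], i;

    stable_partition(a, 6, s, is_odd);
    for (i = 0; i < 6; i++)
    {
        printf("%d ", a[i]);       /* prints: 1 3 5 4 8 6 */
    }
    printf("\n");
    return 0;
}
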
3873
3874/* Close this simple list i entry */
3875static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3876{
3877 int jlen;
3878
3879 /* All contents of the new ci entry have already been filled correctly;
3880 * we only need to increase the count here (for non-empty lists).
3881 */
3882 jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3883 if (jlen > 0)
3884 {
3885 sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);
3886
3887 /* The counts below are used for non-bonded pair/flop counts
3888 * and should therefore match the available kernel setups.
3889 */
3890 if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3891 {
3892 nbl->work->ncj_noq += jlen;
3893 }
3894 else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
3895 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
3896 {
3897 nbl->work->ncj_hlj += jlen;
3898 }
3899
3900 nbl->nci++;
3901 }
3902}
3903
3904/* Split sci entry for load balancing on the GPU.
3905 * Splitting ensures we have enough lists to fully utilize the whole GPU.
3906 * With progBal we generate progressively smaller lists, which improves
3907 * load balancing. As we only know the current count on our own thread,
3908 * we will need to estimate the current total amount of i-entries.
3909 * As the lists get concatenated later, this estimate depends
3910 * both on nthread and our own thread index.
3911 */
3912static void split_sci_entry(nbnxn_pairlist_t *nbl,
3913 int nsp_max_av, gmx_bool progBal, int nc_bal,
3914 int thread, int nthread)
3915{
3916 int nsci_est;
3917 int nsp_max;
3918 int cj4_start, cj4_end, j4len, cj4;
3919 int sci;
3920 int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
3921 int p;
3922
3923 if (progBal)
3924 {
3925 /* Estimate the total numbers of ci's of the nblist combined
3926 * over all threads using the target number of ci's.
3927 */
3928 nsci_est = nc_bal*thread/nthread + nbl->nsci;
3929
3930 /* The first ci blocks should be larger, to avoid overhead.
3931 * The last ci blocks should be smaller, to improve load balancing.
3932 */
3933 nsp_max = max(1,
3934 nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3935 }
3936 else
3937 {
3938 nsp_max = nsp_max_av;
3939 }
3940
3941 cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3942 cj4_end = nbl->sci[nbl->nsci-1].cj4_ind_end;
3943 j4len = cj4_end - cj4_start;
3944
3945 if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3946 {
3947 /* Remove the last ci entry and process the cj4's again */
3948 nbl->nsci -= 1;
3949
3950 sci = nbl->nsci;
3951 nsp = 0;
3952 nsp_sci = 0;
3953 nsp_cj4_e = 0;
3954 nsp_cj4 = 0;
3955 for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
3956 {
3957 nsp_cj4_p = nsp_cj4;
3958 /* Count the number of cluster pairs in this cj4 group */
3959 nsp_cj4 = 0;
3960 for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3961 {
3962 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3963 }
3964
3965 if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
3966 {
3967 /* Split the list at cj4 */
3968 nbl->sci[sci].cj4_ind_end = cj4;
3969 /* Create a new sci entry */
3970 sci++;
3971 nbl->nsci++;
3972 if (nbl->nsci+1 > nbl->sci_nalloc)
3973 {
3974 nb_realloc_sci(nbl, nbl->nsci+1);
3975 }
3976 nbl->sci[sci].sci = nbl->sci[nbl->nsci-1].sci;
3977 nbl->sci[sci].shift = nbl->sci[nbl->nsci-1].shift;
3978 nbl->sci[sci].cj4_ind_start = cj4;
3979 nsp_sci = nsp;
3980 nsp_cj4_e = nsp_cj4_p;
3981 nsp = 0;
3982 }
3983 nsp += nsp_cj4;
3984 }
3985
3986 /* Put the remaining cj4's in the last sci entry */
3987 nbl->sci[sci].cj4_ind_end = cj4_end;
3988
3989 /* Possibly balance out the last two sci's
3990 * by moving the last cj4 of the second last sci.
3991 */
3992 if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3993 {
3994 nbl->sci[sci-1].cj4_ind_end--;
3995 nbl->sci[sci].cj4_ind_start--;
3996 }
3997
3998 nbl->nsci++;
3999 }
4000}
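
With progBal the threshold nsp_max shrinks as the estimated i-entry count grows, so early entries are split coarsely and late ones finely. A standalone sketch evaluating the formula above for sample inputs (the numbers are made up):

#include <stdio.h>

/* nsp_max from the formula above: large blocks early, small blocks late */
static int nsp_max_progressive(int nsp_max_av, int nc_bal, int nsci_est)
{
    int nsp_max = nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal));

    return nsp_max > 1 ? nsp_max : 1;
}

int main(void)
{
    int nsci;

    /* Assume an average target of 64 pairs and 256 balanced entries */
    for (nsci = 1; nsci <= 1024; nsci *= 4)
    {
        printf("nsci_est %4d -> nsp_max %d\n",
               nsci, nsp_max_progressive(64, 256, nsci));
    }
    return 0;
}
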
4001
4002 /* Close this super/sub list i entry */
4003static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
4004 int nsp_max_av,
4005 gmx_bool progBal, int nc_bal,
4006 int thread, int nthread)
4007{
4008 int j4len, tlen;
4009 int nb, b;
4010
4011 /* All contents of the new ci entry have already been filled correctly;
4012 * we only need to increase the count here (for non-empty lists).
4013 */
4014 j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
4015 if (j4len > 0)
4016 {
4017 /* We can only have complete blocks of 4 j-entries in a list,
4018 * so round the count up before closing.
4019 */
4020 nbl->ncj4 = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
4021 nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
4022
4023 nbl->nsci++;
4024
4025 if (nsp_max_av > 0)
4026 {
4027 /* Measure the size of the new entry and potentially split it */
4028 split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
4029 }
4030 }
4031}
4032
4033/* Syncs the working array before adding another grid pair to the list */
4034static void sync_work(nbnxn_pairlist_t *nbl)
4035{
4036 if (!nbl->bSimple)
4037 {
4038 nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
4039 nbl->work->cj4_init = nbl->ncj4;
4040 }
4041}
4042
4043/* Clears an nbnxn_pairlist_t data structure */
4044static void clear_pairlist(nbnxn_pairlist_t *nbl)
4045{
4046 nbl->nci = 0;
4047 nbl->nsci = 0;
4048 nbl->ncj = 0;
4049 nbl->ncj4 = 0;
4050 nbl->nci_tot = 0;
4051 nbl->nexcl = 1;
4052
4053 nbl->work->ncj_noq = 0;
4054 nbl->work->ncj_hlj = 0;
4055}
4056
4057/* Clears a group scheme pair list */
4058static void clear_pairlist_fep(t_nblist *nl)
4059{
4060 nl->nri = 0;
4061 nl->nrj = 0;
4062 if (nl->jindex == NULL)
4063 {
4064 snew(nl->jindex, 1);
4065 }
4066 nl->jindex[0] = 0;
4067}
4068
4069/* Sets a simple list i-cell bounding box, including PBC shift */
4070static gmx_inlineinline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
4071 real shx, real shy, real shz,
4072 nbnxn_bb_t *bb_ci)
4073{
4074 bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
4075 bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
4076 bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
4077 bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
4078 bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
4079 bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
4080}
4081
4082#ifdef NBNXN_BBXXXX
4083/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
4084static void set_icell_bbxxxx_supersub(const float *bb, int ci,
4085 real shx, real shy, real shz,
4086 float *bb_ci)
4087{
4088 int ia, m, i;
4089
4090 ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
4091 for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
77
Loop condition is true. Entering loop body
4092 {
4093 for (i = 0; i < STRIDE_PBB; i++)
78
Loop condition is true. Entering loop body
4094 {
4095 bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
79
Array access (from variable 'bb') results in a null pointer dereference
4096 bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
4097 bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
4098 bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
4099 bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
4100 bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
4101 }
4102 }
4103}
4104#endif
4105
4106/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
4107static void set_icell_bb_supersub(const nbnxn_bb_t *bb, int ci,
4108 real shx, real shy, real shz,
4109 nbnxn_bb_t *bb_ci)
4110{
4111 int i;
4112
4113 for (i = 0; i < GPU_NSUBCELL; i++)
4114 {
4115 set_icell_bb_simple(bb, ci*GPU_NSUBCELL+i,
4116 shx, shy, shz,
4117 &bb_ci[i]);
4118 }
4119}
4120
4121/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
4122static void icell_set_x_simple(int ci,
4123 real shx, real shy, real shz,
4124 int gmx_unused na_c,
4125 int stride, const real *x,
4126 nbnxn_list_work_t *work)
4127{
4128 int ia, i;
4129
4130 ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
4131
4132 for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
4133 {
4134 work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
4135 work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
4136 work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
4137 }
4138}
4139
4140/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
4141static void icell_set_x_supersub(int ci,
4142 real shx, real shy, real shz,
4143 int na_c,
4144 int stride, const real *x,
4145 nbnxn_list_work_t *work)
4146{
4147 int ia, i;
4148 real *x_ci;
4149
4150 x_ci = work->x_ci;
4151
4152 ia = ci*GPU_NSUBCELL*na_c;
4153 for (i = 0; i < GPU_NSUBCELL*na_c; i++)
4154 {
4155 x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
4156 x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
4157 x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
4158 }
4159}
4160
4161#ifdef NBNXN_SEARCH_BB_SIMD4
4162/* Copies PBC shifted super-cell packed atom coordinates to working array */
4163static void icell_set_x_supersub_simd4(int ci,
4164 real shx, real shy, real shz,
4165 int na_c,
4166 int stride, const real *x,
4167 nbnxn_list_work_t *work)
4168{
4169 int si, io, ia, i, j;
4170 real *x_ci;
4171
4172 x_ci = work->x_ci;
4173
4174 for (si = 0; si < GPU_NSUBCELL; si++)
4175 {
4176 for (i = 0; i < na_c; i += STRIDE_PBB)
4177 {
4178 io = si*na_c + i;
4179 ia = ci*GPU_NSUBCELL*na_c + io;
4180 for (j = 0; j < STRIDE_PBB; j++)
4181 {
4182 x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
4183 x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
4184 x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
4185 }
4186 }
4187 }
4188}
4189#endif
4190
4191static real minimum_subgrid_size_xy(const nbnxn_grid_t *grid)
4192{
4193 if (grid->bSimple)
4194 {
4195 return min(grid->sx, grid->sy);
4196 }
4197 else
4198 {
4199 return min(grid->sx/GPU_NSUBCELL_X, grid->sy/GPU_NSUBCELL_Y);
4200 }
4201}
4202
4203static real effective_buffer_1x1_vs_MxN(const nbnxn_grid_t *gridi,
4204 const nbnxn_grid_t *gridj)
4205{
4206 const real eff_1x1_buffer_fac_overest = 0.1;
4207
4208 /* Determine an atom-pair list cut-off buffer size for atom pairs,
4209 * to be added to rlist (including buffer) used for MxN.
4210 * This is for converting an MxN list to a 1x1 list. This means we can't
4211 * use the normal buffer estimate, as we have an MxN list in which
4212 * some atom pairs beyond rlist are missing. We want to capture
4213 * the beneficial effect of buffering by extra pairs just outside rlist,
4214 * while removing the useless pairs that are further away from rlist.
4215 * (Also the buffer could have been set manually not using the estimate.)
4216 * This buffer size is an overestimate.
4217 * We add 10% of the smallest grid sub-cell dimensions.
4218 * Note that the z-size differs per cell and we don't use this,
4219 * so we overestimate.
4220 * With PME, the 10% value gives a buffer that is somewhat larger
4221 * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
4222 * Smaller tolerances or using RF lead to a smaller effective buffer,
4223 * so 10% gives a safe overestimate.
4224 */
4225 return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(gridi) +
4226 minimum_subgrid_size_xy(gridj));
4227}
4228
4229/* Clusters at the cut-off only increase rlist by 60% of their size */
4230static real nbnxn_rlist_inc_outside_fac = 0.6;
4231
4232/* Due to the cluster size the effective pair-list is longer than
4233 * that of a simple atom pair-list. This function gives the extra distance.
4234 */
4235real nbnxn_get_rlist_effective_inc(int cluster_size_j, real atom_density)
4236{
4237 int cluster_size_i;
4238 real vol_inc_i, vol_inc_j;
4239
4240 /* We should get this from the setup, but currently it's the same for
4241 * all setups, including GPUs.
4242 */
4243 cluster_size_i = NBNXN_CPU_CLUSTER_I_SIZE;
4244
4245 vol_inc_i = (cluster_size_i - 1)/atom_density;
4246 vol_inc_j = (cluster_size_j - 1)/atom_density;
4247
4248 return nbnxn_rlist_inc_outside_fac*pow(vol_inc_i + vol_inc_j, 1.0/3.0);
4249}
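
As a numeric illustration of this increment, a sketch for 4-atom i- and j-clusters at roughly the atom density of liquid water, about 100 atoms/nm^3 (an assumed value, not from this file):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double atom_density   = 100.0;   /* atoms/nm^3, an assumed value */
    int    cluster_size_i = 4, cluster_size_j = 4;
    double vol_inc_i      = (cluster_size_i - 1)/atom_density;
    double vol_inc_j      = (cluster_size_j - 1)/atom_density;

    /* 0.6*cbrt(vol_inc_i + vol_inc_j), as in the function above */
    printf("effective rlist increase: %.3f nm\n",
           0.6*pow(vol_inc_i + vol_inc_j, 1.0/3.0));
    return 0;
}
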
4250
4251/* Estimates the interaction volume^2 for non-local interactions */
4252static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
4253{
4254 int z, d;
4255 real cl, ca, za;
4256 real vold_est;
4257 real vol2_est_tot;
4258
4259 vol2_est_tot = 0;
4260
4261 /* Here we simply add up the interaction volumes^2 of the 1, 2 or 3
4262 * non-home 1D decomposition zones. As these volumes are not additive,
4263 * this is an overestimate, but it would only be significant in the limit
4264 * of small cells, where we anyhow need to split the lists into
4265 * as small parts as possible.
4266 */
4267
4268 for (z = 0; z < zones->n; z++)
4269 {
4270 if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
4271 {
4272 cl = 0;
4273 ca = 1;
4274 za = 1;
4275 for (d = 0; d < DIM; d++)
4276 {
4277 if (zones->shift[z][d] == 0)
4278 {
4279 cl += 0.5*ls[d];
4280 ca *= ls[d];
4281 za *= zones->size[z].x1[d] - zones->size[z].x0[d];
4282 }
4283 }
4284
4285 /* 4 octants of a sphere */
4286 vold_est = 0.25*M_PI*r*r*r*r;
4287 /* 4 quarter pie slices on the edges */
4288 vold_est += 4*cl*M_PI/6.0*r*r*r;
4289 /* One rectangular volume on a face */
4290 vold_est += ca*0.5*r*r;
4291
4292 vol2_est_tot += vold_est*za;
4293 }
4294 }
4295
4296 return vol2_est_tot;
4297}
4298
4299/* Estimates the average size of a full j-list for super/sub setup */
4300static int get_nsubpair_max(const nbnxn_search_t nbs,
4301 int iloc,
4302 real rlist,
4303 int min_ci_balanced)
4304{
4305 const nbnxn_grid_t *grid;
4306 rvec ls;
4307 real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
4308 int nsubpair_max;
4309
4310 grid = &nbs->grid[0];
4311
4312 ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
4313 ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
4314 ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
4315
4316 /* The average squared length of the diagonal of a sub cell */
4317 xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
4318
4319 /* The formulas below are a heuristic estimate of the average nsj per si */
4320 r_eff_sup = rlist + nbnxn_rlist_inc_outside_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
4321
4322 if (!nbs->DomDec || nbs->zones->n == 1)
4323 {
4324 nsp_est_nl = 0;
4325 }
4326 else
4327 {
4328 nsp_est_nl =
4329 sqr(grid->atom_density/grid->na_c)*
4330 nonlocal_vol2(nbs->zones, ls, r_eff_sup);
4331 }
4332
4333 if (LOCAL_I(iloc))
4334 {
4335 /* Sub-cell interacts with itself */
4336 vol_est = ls[XX]*ls[YY]*ls[ZZ];
4337 /* 6/2 rectangular volume on the faces */
4338 vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
4339 /* 12/2 quarter pie slices on the edges */
4340 vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
4341 /* 4 octants of a sphere */
4342 vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
4343
4344 nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
4345
4346 /* Subtract the non-local pair count */
4347 nsp_est -= nsp_est_nl;
4348
4349 if (debug)
4350 {
4351 fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
4352 nsp_est, nsp_est_nl);
4353 }
4354 }
4355 else
4356 {
4357 nsp_est = nsp_est_nl;
4358 }
4359
4360 if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
4361 {
4362 /* We don't need to worry */
4363 nsubpair_max = -1;
4364 }
4365 else
4366 {
4367 /* Thus the (average) maximum j-list size should be as follows */
4368 nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));
4369
4370 /* Since the target value is a maximum (this avoids high outliers,
4371 * which lead to load imbalance), not average, we add half the
4372 * number of pairs in a cj4 block to get the average about right.
4373 */
4374 nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
4375 }
4376
4377 if (debug)
4378 {
4379 fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
4380 nsp_est, nsubpair_max);
4381 }
4382
4383 return nsubpair_max;
4384}
4385
4386/* Debug list print function */
4387static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
4388{
4389 int i, j;
4390
4391 for (i = 0; i < nbl->nci; i++)
4392 {
4393 fprintf(fp, "ci %4d shift %2d ncj %3d\n",
4394 nbl->ci[i].ci, nbl->ci[i].shift,
4395 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
4396
4397 for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
4398 {
4399 fprintf(fp, " cj %5d imask %x\n",
4400 nbl->cj[j].cj,
4401 nbl->cj[j].excl);
4402 }
4403 }
4404}
4405
4406/* Debug list print function */
4407static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
4408{
4409 int i, j4, j, ncp, si;
4410
4411 for (i = 0; i < nbl->nsci; i++)
4412 {
4413 fprintf(fp, "ci %4d shift %2d ncj4 %2d\n",
4414 nbl->sci[i].sci, nbl->sci[i].shift,
4415 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
4416
4417 ncp = 0;
4418 for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
4419 {
4420 for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
4421 {
4422 fprintf(fp, " sj %5d imask %x\n",
4423 nbl->cj4[j4].cj[j],
4424 nbl->cj4[j4].imei[0].imask);
4425 for (si = 0; si < GPU_NSUBCELL; si++)
4426 {
4427 if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
4428 {
4429 ncp++;
4430 }
4431 }
4432 }
4433 }
4434 fprintf(fp, "ci %4d shift %2d ncj4 %2d ncp %3d\n",
4435 nbl->sci[i].sci, nbl->sci[i].shift,
4436 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
4437 ncp);
4438 }
4439}
4440
4441 /* Combine the pair lists *nbl generated on multiple threads into nblc */
4442static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
4443 nbnxn_pairlist_t *nblc)
4444{
4445 int nsci, ncj4, nexcl;
4446 int n, i;
4447
4448 if (nblc->bSimple)
4449 {
4450 gmx_incons("combine_nblists does not support simple lists");
4451 }
4452
4453 nsci = nblc->nsci;
4454 ncj4 = nblc->ncj4;
4455 nexcl = nblc->nexcl;
4456 for (i = 0; i < nnbl; i++)
4457 {
4458 nsci += nbl[i]->nsci;
4459 ncj4 += nbl[i]->ncj4;
4460 nexcl += nbl[i]->nexcl;
4461 }
4462
4463 if (nsci > nblc->sci_nalloc)
4464 {
4465 nb_realloc_sci(nblc, nsci);
4466 }
4467 if (ncj4 > nblc->cj4_nalloc)
4468 {
4469 nblc->cj4_nalloc = over_alloc_small(ncj4);
4470 nbnxn_realloc_void((void **)&nblc->cj4,
4471 nblc->ncj4*sizeof(*nblc->cj4),
4472 nblc->cj4_nalloc*sizeof(*nblc->cj4),
4473 nblc->alloc, nblc->free);
4474 }
4475 if (nexcl > nblc->excl_nalloc)
4476 {
4477 nblc->excl_nalloc = over_alloc_small(nexcl);
4478 nbnxn_realloc_void((void **)&nblc->excl,
4479 nblc->nexcl*sizeof(*nblc->excl),
4480 nblc->excl_nalloc*sizeof(*nblc->excl),
4481 nblc->alloc, nblc->free);
4482 }
4483
4484 /* Each thread should copy its own data to the combined arrays,
4485 * as otherwise data will go back and forth between different caches.
4486 */
4487#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
4488 for (n = 0; n < nnbl; n++)
4489 {
4490 int sci_offset;
4491 int cj4_offset;
4492 int ci_offset;
4493 int excl_offset;
4494 int i, j4;
4495 const nbnxn_pairlist_t *nbli;
4496
4497 /* Determine the offset in the combined data for our thread */
4498 sci_offset = nblc->nsci;
4499 cj4_offset = nblc->ncj4;
4500 ci_offset = nblc->nci_tot;
4501 excl_offset = nblc->nexcl;
4502
4503 for (i = 0; i < n; i++)
4504 {
4505 sci_offset += nbl[i]->nsci;
4506 cj4_offset += nbl[i]->ncj4;
4507 ci_offset += nbl[i]->nci_tot;
4508 excl_offset += nbl[i]->nexcl;
4509 }
4510
4511 nbli = nbl[n];
4512
4513 for (i = 0; i < nbli->nsci; i++)
4514 {
4515 nblc->sci[sci_offset+i] = nbli->sci[i];
4516 nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
4517 nblc->sci[sci_offset+i].cj4_ind_end += cj4_offset;
4518 }
4519
4520 for (j4 = 0; j4 < nbli->ncj4; j4++)
4521 {
4522 nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
4523 nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
4524 nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
4525 }
4526
4527 for (j4 = 0; j4 < nbli->nexcl; j4++)
4528 {
4529 nblc->excl[excl_offset+j4] = nbli->excl[j4];
4530 }
4531 }
4532
4533 for (n = 0; n < nnbl; n++)
4534 {
4535 nblc->nsci += nbl[n]->nsci;
4536 nblc->ncj4 += nbl[n]->ncj4;
4537 nblc->nci_tot += nbl[n]->nci_tot;
4538 nblc->nexcl += nbl[n]->nexcl;
4539 }
4540}
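
Each thread above derives its write offsets from an exclusive prefix sum over the sizes of the preceding threads' lists, which is what lets all threads copy into disjoint ranges concurrently. A serial sketch of that offset computation with plain arrays in place of the pair-list structs:

#include <stdio.h>

int main(void)
{
    /* Per-thread element counts, as nbl[i]->nsci would provide */
    int counts[4] = { 3, 5, 2, 4 };
    int offsets[4];
    int n, acc = 0;

    for (n = 0; n < 4; n++)
    {
        offsets[n] = acc;    /* exclusive prefix sum: start of block n */
        acc       += counts[n];
    }
    for (n = 0; n < 4; n++)
    {
        printf("thread %d writes [%d, %d)\n",
               n, offsets[n], offsets[n] + counts[n]);
    }
    return 0;
}
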
4541
4542static void balance_fep_lists(const nbnxn_search_t nbs,
4543 nbnxn_pairlist_set_t *nbl_lists)
4544{
4545 int nnbl, th;
4546 int nri_tot, nrj_tot, nrj_target;
4547 int th_dest;
4548 t_nblist *nbld;
4549
4550 nnbl = nbl_lists->nnbl;
4551
4552 if (nnbl == 1)
4553 {
4554 /* Nothing to balance */
4555 return;
4556 }
4557
4558 /* Count the total i-lists and pairs */
4559 nri_tot = 0;
4560 nrj_tot = 0;
4561 for (th = 0; th < nnbl; th++)
4562 {
4563 nri_tot += nbl_lists->nbl_fep[th]->nri;
4564 nrj_tot += nbl_lists->nbl_fep[th]->nrj;
4565 }
4566
4567 nrj_target = (nrj_tot + nnbl - 1)/nnbl;
4568
4569 assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
4570
4571#pragma omp parallel for schedule(static) num_threads(nnbl)
4572 for (th = 0; th < nnbl; th++)
4573 {
4574 t_nblist *nbl;
4575
4576 nbl = nbs->work[th].nbl_fep;
4577
4578 /* Note that here we allocate for the total size, instead of
4579 * a per-thread estimate (which is hard to obtain).
4580 */
4581 if (nri_tot > nbl->maxnri)
4582 {
4583 nbl->maxnri = over_alloc_large(nri_tot);
4584 reallocate_nblist(nbl);
4585 }
4586 if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
4587 {
4588 nbl->maxnrj = over_alloc_small(nrj_tot);
4589 srenew(nbl->jjnr, nbl->maxnrj);
4590 srenew(nbl->excl_fep, nbl->maxnrj);
4591 }
4592
4593 clear_pairlist_fep(nbl);
4594 }
4595
4596 /* Loop over the source lists and assign and copy i-entries */
4597 th_dest = 0;
4598 nbld = nbs->work[th_dest].nbl_fep;
4599 for (th = 0; th < nnbl; th++)
4600 {
4601 t_nblist *nbls;
4602 int i, j;
4603
4604 nbls = nbl_lists->nbl_fep[th];
4605
4606 for (i = 0; i < nbls->nri; i++)
4607 {
4608 int nrj;
4609
4610 /* The number of pairs in this i-entry */
4611 nrj = nbls->jindex[i+1] - nbls->jindex[i];
4612
4613 /* Decide if list th_dest is too large and we should proceed
4614 * to the next destination list.
4615 */
4616 if (th_dest+1 < nnbl && nbld->nrj > 0 &&
4617 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
4618 {
4619 th_dest++;
4620 nbld = nbs->work[th_dest].nbl_fep;
4621 }
4622
4623 nbld->iinr[nbld->nri] = nbls->iinr[i];
4624 nbld->gid[nbld->nri] = nbls->gid[i];
4625 nbld->shift[nbld->nri] = nbls->shift[i];
4626
4627 for (j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++)
4628 {
4629 nbld->jjnr[nbld->nrj] = nbls->jjnr[j];
4630 nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
4631 nbld->nrj++;
4632 }
4633 nbld->nri++;
4634 nbld->jindex[nbld->nri] = nbld->nrj;
4635 }
4636 }
4637
4638 /* Swap the list pointers */
4639 for (th = 0; th < nnbl; th++)
4640 {
4641 t_nblist *nbl_tmp;
4642
4643 nbl_tmp = nbl_lists->nbl_fep[th];
4644 nbl_lists->nbl_fep[th] = nbs->work[th].nbl_fep;
4645 nbs->work[th].nbl_fep = nbl_tmp;
4646
4647 if (debug)
4648 {
4649 fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
4650 th,
4651 nbl_lists->nbl_fep[th]->nri,
4652 nbl_lists->nbl_fep[th]->nrj);
4653 }
4654 }
4655}
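
The destination list is advanced when appending the next i-entry would leave it further above nrj_target than it currently sits below it. A minimal sketch of that decision rule with made-up sizes:

#include <stdio.h>

int main(void)
{
    int nrj_target = 100;   /* per-list pair target */
    int nbld_nrj   = 90;    /* pairs already in the destination list */
    int entry[]    = { 15, 30 };
    int e;

    for (e = 0; e < 2; e++)
    {
        /* Same comparison as in balance_fep_lists above */
        int advance = nbld_nrj > 0 &&
                      nbld_nrj + entry[e] - nrj_target >
                      nrj_target - nbld_nrj;
        printf("entry of %2d pairs: %s\n", entry[e],
               advance ? "start next list" : "append here");
    }
    return 0;
}
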
4656
4657 /* Returns the next ci to be processed by our thread */
4658static gmx_bool next_ci(const nbnxn_grid_t *grid,
4659 int conv,
4660 int nth, int ci_block,
4661 int *ci_x, int *ci_y,
4662 int *ci_b, int *ci)
4663{
4664 (*ci_b)++;
4665 (*ci)++;
4666
4667 if (*ci_b == ci_block)
4668 {
4669 /* Jump to the next block assigned to this task */
4670 *ci += (nth - 1)*ci_block;
4671 *ci_b = 0;
4672 }
4673
4674 if (*ci >= grid->nc*conv)
4675 {
4676 return FALSE;
4677 }
4678
4679 while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
4680 {
4681 *ci_y += 1;
4682 if (*ci_y == grid->ncy)
4683 {
4684 *ci_x += 1;
4685 *ci_y = 0;
4686 }
4687 }
4688
4689 return TRUE;
4690}
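
next_ci walks the grid cells in block-cyclic order: each thread takes ci_block consecutive cells and then hops over the blocks owned by the other threads. A sketch of just the index sequence, without the column bookkeeping (sample sizes assumed):

#include <stdio.h>

int main(void)
{
    int nc = 24, nth = 3, ci_block = 2;
    int th;

    for (th = 0; th < nth; th++)
    {
        int ci   = th*ci_block - 1;
        int ci_b = -1;
        printf("thread %d:", th);
        for (;;)
        {
            /* Same stepping as next_ci: advance, then hop between blocks */
            ci_b++;
            ci++;
            if (ci_b == ci_block)
            {
                ci  += (nth - 1)*ci_block;
                ci_b = 0;
            }
            if (ci >= nc)
            {
                break;
            }
            printf(" %d", ci);     /* thread 0: 0 1 6 7 12 13 18 19 */
        }
        printf("\n");
    }
    return 0;
}
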
4691
4692/* Returns the distance^2 for which we put cell pairs in the list
4693 * without checking atom pair distances. This is usually < rlist^2.
4694 */
4695static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
4696 const nbnxn_grid_t *gridj,
4697 real rlist,
4698 gmx_bool simple)
4699{
4700 /* If the distance between two sub-cell bounding boxes is less
4701 * than this distance, do not check the distance between
4702 * all particle pairs in the sub-cell, since then it is likely
4703 * that the box pair has atom pairs within the cut-off.
4704 * We use the nblist cut-off minus 0.5 times the average x/y diagonal
4705 * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
4706 * Using more than 0.5 gains at most 0.5%.
4707 * If forces are calculated more than twice per pair-list update,
4708 * the performance gain in the force calculation outweighs the cost of checking.
4709 * Note that with subcell lists, the atom-pair distance check
4710 * is only performed when only 1 out of 8 sub-cells is within range;
4711 * this is because the GPU is much faster than the CPU.
4712 */
4713 real bbx, bby;
4714 real rbb2;
4715
4716 bbx = 0.5*(gridi->sx + gridj->sx);
4717 bby = 0.5*(gridi->sy + gridj->sy);
4718 if (!simple)
4719 {
4720 bbx /= GPU_NSUBCELL_X;
4721 bby /= GPU_NSUBCELL_Y;
4722 }
4723
4724 rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
4725
4726#ifndef GMX_DOUBLE
4727 return rbb2;
4728#else
4729 return (float)((1+GMX_FLOAT_EPS)*rbb2);
4730#endif
4731}
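
As a numeric illustration of this cut-off, a sketch with assumed cell sizes of 0.3 nm and rlist of 1.0 nm for the simple-list case:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double sx = 0.3, sy = 0.3;   /* assumed x/y cell sizes in nm */
    double rlist = 1.0;          /* assumed list cut-off in nm   */
    double bbx = 0.5*(sx + sx);  /* averaged over the two grids  */
    double bby = 0.5*(sy + sy);
    double d   = rlist - 0.5*sqrt(bbx*bbx + bby*bby);
    double rbb2;

    d    = (d > 0 ? d : 0);
    rbb2 = d*d;
    printf("box pairs closer than %.3f nm are accepted unchecked (rbb2 %.3f)\n",
           d, rbb2);
    return 0;
}
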
4732
4733static int get_ci_block_size(const nbnxn_grid_t *gridi,
4734 gmx_bool bDomDec, int nth)
4735{
4736 const int ci_block_enum = 5;
4737 const int ci_block_denom = 11;
4738 const int ci_block_min_atoms = 16;
4739 int ci_block;
4740
4741 /* Here we decide how to distribute the blocks over the threads.
4742 * We use prime numbers to try to avoid that the grid size becomes
4743 * a multiple of the number of threads, which would lead to some
4744 * threads getting "inner" pairs and others getting boundary pairs,
4745 * which in turn will lead to load imbalance between threads.
4746 * Set the block size as 5/11/ntask times the average number of cells
4747 * in a y,z slab. This should ensure a quite uniform distribution
4748 * of the grid parts of the different threads along all three grid
4749 * zone boundaries with 3D domain decomposition. At the same time
4750 * the blocks will not become too small.
4751 */
4752 ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
4753
4754 /* Ensure the blocks are not too small: avoids cache invalidation */
4755 if (ci_block*gridi->na_sc < ci_block_min_atoms)
4756 {
4757 ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
4758 }
4759
4760 /* Without domain decomposition
4761 * or with less than 3 blocks per task, divide in nth blocks.
4762 */
4763 if (!bDomDec || ci_block*3*nth > gridi->nc)
4764 {
4765 ci_block = (gridi->nc + nth - 1)/nth;
4766 }
4767
4768 return ci_block;
4769}
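
A sketch evaluating this block-size rule for a made-up grid, showing the 5/11 heuristic and both fallbacks (the minimum atom count per block, and plain division into nth blocks without domain decomposition); the scalar wrapper below is hypothetical:

#include <stdio.h>

/* Same decisions as get_ci_block_size, with scalar inputs */
static int ci_block_size(int nc, int ncx, int na_sc, int bDomDec, int nth)
{
    int ci_block = (nc*5)/(11*ncx*nth);

    if (ci_block*na_sc < 16)                  /* too few atoms per block */
    {
        ci_block = (16 + na_sc - 1)/na_sc;
    }
    if (!bDomDec || ci_block*3*nth > nc)      /* fall back to nth blocks */
    {
        ci_block = (nc + nth - 1)/nth;
    }
    return ci_block;
}

int main(void)
{
    /* Hypothetical grid: 4096 cells, 16 columns in x, 8 atoms per cell */
    printf("with DD:    %d\n", ci_block_size(4096, 16, 8, 1, 8));   /* 14  */
    printf("without DD: %d\n", ci_block_size(4096, 16, 8, 0, 8));   /* 512 */
    return 0;
}
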
4770
4771/* Generates the part of pair-list nbl assigned to our thread */
4772static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
4773 const nbnxn_grid_t *gridi,
4774 const nbnxn_grid_t *gridj,
4775 nbnxn_search_work_t *work,
4776 const nbnxn_atomdata_t *nbat,
4777 const t_blocka *excl,
4778 real rlist,
4779 int nb_kernel_type,
4780 int ci_block,
4781 gmx_bool bFBufferFlag,
4782 int nsubpair_max,
4783 gmx_bool progBal,
4784 int min_ci_balanced,
4785 int th, int nth,
4786 nbnxn_pairlist_t *nbl,
4787 t_nblist *nbl_fep)
4788{
4789 int na_cj_2log;
4790 matrix box;
4791 real rl2, rl_fep2 = 0;
4792 float rbb2;
4793 int d;
4794 int ci_b, ci, ci_x, ci_y, ci_xy, cj;
4795 ivec shp;
4796 int tx, ty, tz;
4797 int shift;
4798 gmx_bool bMakeList;
4799 real shx, shy, shz;
4800 int conv_i, cell0_i;
4801 const nbnxn_bb_t *bb_i = NULL;
4802#ifdef NBNXN_BBXXXX
4803 const float *pbb_i = NULL;
1
'pbb_i' initialized to a null pointer value
4804#endif
4805 const float *bbcz_i, *bbcz_j;
4806 const int *flags_i;
4807 real bx0, bx1, by0, by1, bz0, bz1;
4808 real bz1_frac;
4809 real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
4810 int cxf, cxl, cyf, cyf_x, cyl;
4811 int cx, cy;
4812 int c0, c1, cs, cf, cl;
4813 int ndistc;
4814 int ncpcheck;
4815 int gridi_flag_shift = 0, gridj_flag_shift = 0;
4816 unsigned int *gridj_flag = NULL;
4817 int ncj_old_i, ncj_old_j;
4818
4819 nbs_cycle_start(&work->cc[enbsCCsearch]);
4820
4821 if (gridj->bSimple != nbl->bSimple)
2
Taking false branch
4822 {
4823 gmx_incons("Grid incompatible with pair-list")_gmx_error("incons", "Grid incompatible with pair-list", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/nbnxn_search.c"
, 4823)
;
4824 }
4825
4826 sync_work(nbl);
4827 nbl->na_sc = gridj->na_sc;
4828 nbl->na_ci = gridj->na_c;
4829 nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4830 na_cj_2log = get_2log(nbl->na_cj);
4831
4832 nbl->rlist = rlist;
4833
4834 if (bFBufferFlag)
3
Assuming 'bFBufferFlag' is 0
4
Taking false branch
4835 {
4836 /* Determine conversion of clusters to flag blocks */
4837 gridi_flag_shift = 0;
4838 while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4839 {
4840 gridi_flag_shift++;
4841 }
4842 gridj_flag_shift = 0;
4843 while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4844 {
4845 gridj_flag_shift++;
4846 }
4847
4848 gridj_flag = work->buffer_flags.flag;
4849 }
4850
4851 copy_mat(nbs->box, box);
4852
4853 rl2 = nbl->rlist*nbl->rlist;
4854
4855 if (nbs->bFEP && !nbl->bSimple)
4856 {
4857 /* Determine an atom-pair list cut-off distance for FEP atom pairs.
4858 * We should not simply use rlist, since then we would not have
4859 * the small, effective buffering of the NxN lists.
4860 * The buffer is an overestimate, but the resulting cost for pairs
4861 * beyond rlist is negligible compared to the FEP pairs within rlist.
4862 */
4863 rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(gridi, gridj);
4864
4865 if (debug)
4866 {
4867 fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
4868 }
4869 rl_fep2 = rl_fep2*rl_fep2;
4870 }
4871
4872 rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);
4873
4874 if (debug)
5
Assuming 'debug' is null
6
Taking false branch
4875 {
4876 fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
4877 }
4878
4879 /* Set the shift range */
4880 for (d = 0; d < DIM; d++)
7
Loop condition is true. Entering loop body
10
Loop condition is true. Entering loop body
12
Loop condition is true. Entering loop body
14
Loop condition is false. Execution continues on line 4903
4881 {
4882 /* Check if we need periodicity shifts.
4883 * Without PBC or with domain decomposition we don't need them.
4884 */
4885 if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
8
Taking false branch
11
Taking false branch
13
Taking false branch
4886 {
4887 shp[d] = 0;
4888 }
4889 else
4890 {
4891 if (d == XX &&
9
Taking false branch
4892 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4893 {
4894 shp[d] = 2;
4895 }
4896 else
4897 {
4898 shp[d] = 1;
4899 }
4900 }
4901 }
4902
4903 if (nbl->bSimple && !gridi->bSimple)
15
Taking false branch
4904 {
4905 conv_i = gridi->na_sc/gridj->na_sc;
4906 bb_i = gridi->bb_simple;
4907 bbcz_i = gridi->bbcz_simple;
4908 flags_i = gridi->flags_simple;
4909 }
4910 else
4911 {
4912 conv_i = 1;
4913#ifdef NBNXN_BBXXXX
4914 if (gridi->bSimple)
16
Taking true branch
4915 {
4916 bb_i = gridi->bb;
4917 }
4918 else
4919 {
4920 pbb_i = gridi->pbb;
4921 }
4922#else
4923 /* We use the normal bounding box format for both grid types */
4924 bb_i = gridi->bb;
4925#endif
4926 bbcz_i = gridi->bbcz;
4927 flags_i = gridi->flags;
4928 }
4929 cell0_i = gridi->cell0*conv_i;
4930
4931 bbcz_j = gridj->bbcz;
4932
4933 if (conv_i != 1)
17
Taking false branch
4934 {
4935 /* Blocks of the conversion factor - 1 give a large repeat count
4936 * combined with a small block size. This should result in good
4937 * load balancing for both small and large domains.
4938 */
4939 ci_block = conv_i - 1;
4940 }
4941 if (debug)
18
Assuming 'debug' is null
19
Taking false branch
4942 {
4943 fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
4944 gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
4945 }
4946
4947 ndistc = 0;
4948 ncpcheck = 0;
4949
4950 /* Initialize ci_b and ci to one before where we want them to start,
4951 * as they will both be incremented in next_ci.
4952 */
4953 ci_b = -1;
4954 ci = th*ci_block - 1;
4955 ci_x = 0;
4956 ci_y = 0;
4957 while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
20
Loop condition is true. Entering loop body
4958 {
4959 if (nbl->bSimple && flags_i[ci] == 0)
21
Taking false branch
4960 {
4961 continue;
4962 }
4963
4964 ncj_old_i = nbl->ncj;
4965
4966 d2cx = 0;
4967 if (gridj != gridi && shp[XX] == 0)
22
Assuming 'gridj' is equal to 'gridi'
4968 {
4969 if (nbl->bSimple)
4970 {
4971 bx1 = bb_i[ci].upper[BB_X];
4972 }
4973 else
4974 {
4975 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4976 }
4977 if (bx1 < gridj->c0[XX])
4978 {
4979 d2cx = sqr(gridj->c0[XX] - bx1);
4980
4981 if (d2cx >= rl2)
4982 {
4983 continue;
4984 }
4985 }
4986 }
4987
4988 ci_xy = ci_x*gridi->ncy + ci_y;
4989
4990 /* Loop over shift vectors in three dimensions */
4991 for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
23
Loop condition is true. Entering loop body
4992 {
4993 shz = tz*box[ZZ][ZZ];
4994
4995 bz0 = bbcz_i[ci*NNBSBB_D ] + shz;
4996 bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4997
4998 if (tz == 0)
24
Taking false branch
4999 {
5000 d2z = 0;
5001 }
5002 else if (tz < 0)
25
Taking true branch
5003 {
5004 d2z = sqr(bz1);
5005 }
5006 else
5007 {
5008 d2z = sqr(bz0 - box[ZZ][ZZ]);
5009 }
5010
5011 d2z_cx = d2z + d2cx;
5012
5013 if (d2z_cx >= rl2)
26
Taking false branch
5014 {
5015 continue;
5016 }
5017
5018 bz1_frac =
5019 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
5020 if (bz1_frac < 0)
27
Taking false branch
5021 {
5022 bz1_frac = 0;
5023 }
5024 /* The check with bz1_frac close to or larger than 1 comes later */
5025
5026 for (ty = -shp[YY]; ty <= shp[YY]; ty++)
28
Loop condition is true. Entering loop body
5027 {
5028 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
5029
5030 if (nbl->bSimple)
29
Taking true branch
5031 {
5032 by0 = bb_i[ci].lower[BB_Y] + shy;
5033 by1 = bb_i[ci].upper[BB_Y] + shy;
5034 }
5035 else
5036 {
5037 by0 = gridi->c0[YY] + (ci_y )*gridi->sy + shy;
5038 by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
5039 }
5040
5041 get_cell_range(by0, by1,
5042 gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
5043 d2z_cx, rl2,
5044 &cyf, &cyl);
5045
5046 if (cyf > cyl)
30
Assuming 'cyf' is <= 'cyl'
31
Taking false branch
5047 {
5048 continue;
5049 }
5050
5051 d2z_cy = d2z;
5052 if (by1 < gridj->c0[YY])
32
Taking false branch
5053 {
5054 d2z_cy += sqr(gridj->c0[YY] - by1);
5055 }
5056 else if (by0 > gridj->c1[YY])
33
Taking false branch
5057 {
5058 d2z_cy += sqr(by0 - gridj->c1[YY]);
5059 }
5060
5061 for (tx = -shp[XX]; tx <= shp[XX]; tx++)
34
Loop condition is true. Entering loop body
54
Loop condition is true. Entering loop body
70
Loop condition is true. Entering loop body
5062 {
5063 shift = XYZ2IS(tx, ty, tz);
5064
5065#ifdef NBNXN_SHIFT_BACKWARD
5066 if (gridi == gridj && shift > CENTRAL)
35
Assuming 'gridi' is not equal to 'gridj'
5067 {
5068 continue;
5069 }
5070#endif
5071
5072 shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
5073
5074 if (nbl->bSimple)
36
Taking true branch
55
Taking true branch
71
Taking true branch
5075 {
5076 bx0 = bb_i[ci].lower[BB_X] + shx;
5077 bx1 = bb_i[ci].upper[BB_X] + shx;
5078 }
5079 else
5080 {
5081 bx0 = gridi->c0[XX] + (ci_x )*gridi->sx + shx;
5082 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
5083 }
5084
5085 get_cell_range(bx0, bx1,
5086 gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
5087 d2z_cy, rl2,
5088 &cxf, &cxl);
5089
5090 if (cxf > cxl)
37
Assuming 'cxf' is <= 'cxl'
38
Taking false branch
56
Taking false branch
72
Taking false branch
5091 {
5092 continue;
5093 }
5094
5095 if (nbl->bSimple)
39
Taking true branch
57
Taking true branch
73
Taking true branch
5096 {
5097 new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci]);
5098 }
5099 else
5100 {
5101 new_sci_entry(nbl, cell0_i+ci, shift);
5102 }
5103
5104#ifndef NBNXN_SHIFT_BACKWARD
5105 if (cxf < ci_x)
5106#else
5107 if (shift == CENTRAL && gridi == gridj &&
5108 cxf < ci_x)
5109#endif
5110 {
5111 /* Leave the pairs with i > j.
5112 * x is the major index, so skip half of it.
5113 */
5114 cxf = ci_x;
5115 }
5116
5117 if (nbl->bSimple)
40
Taking true branch
58
Taking true branch
74
Taking false branch
5118 {
5119 set_icell_bb_simple(bb_i, ci, shx, shy, shz,
5120 nbl->work->bb_ci);
5121 }
5122 else
5123 {
5124#ifdef NBNXN_BBXXXX
5125 set_icell_bbxxxx_supersub(pbb_i, ci, shx, shy, shz,
75
Passing null pointer value via 1st parameter 'bb'
76
Calling 'set_icell_bbxxxx_supersub'
5126 nbl->work->pbb_ci);
5127#else
5128 set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
5129 nbl->work->bb_ci);
5130#endif
5131 }
5132
5133 nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
5134 gridi->na_c, nbat->xstride, nbat->x,
5135 nbl->work);
5136
5137 for (cx = cxf; cx <= cxl; cx++)
41
Loop condition is true. Entering loop body
49
Assuming 'cx' is > 'cxl'
50
Loop condition is false. Execution continues on line 5331
59
Loop condition is true. Entering loop body
66
Loop condition is false. Execution continues on line 5331
5138 {
5139 d2zx = d2z;
5140 if (gridj->c0[XX] + cx*gridj->sx > bx1)
42
Taking false branch
60
Taking false branch
5141 {
5142 d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
5143 }
5144 else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
43
Taking false branch
61
Taking false branch
5145 {
5146 d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
5147 }
5148
5149#ifndef NBNXN_SHIFT_BACKWARD
5150 if (gridi == gridj &&
5151 cx == 0 && cyf < ci_y)
5152#else
5153 if (gridi == gridj &&
5154 cx == 0 && shift == CENTRAL && cyf < ci_y)
5155#endif
5156 {
5157 /* Leave the pairs with i > j.
5158 * Skip half of y when i and j have the same x.
5159 */
5160 cyf_x = ci_y;
5161 }
5162 else
5163 {
5164 cyf_x = cyf;
5165 }
5166
5167 for (cy = cyf_x; cy <= cyl; cy++)
44
Loop condition is true. Entering loop body
47
Assuming 'cy' is > 'cyl'
48
Loop condition is false. Execution continues on line 5137
62
Loop condition is true. Entering loop body
65
Loop condition is false. Execution continues on line 5137
5168 {
5169 c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
5170 c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
5171#ifdef NBNXN_SHIFT_BACKWARD
5172 if (gridi == gridj &&
5173 shift == CENTRAL && c0 < ci)
5174 {
5175 c0 = ci;
5176 }
5177#endif
5178
5179 d2zxy = d2zx;
5180 if (gridj->c0[YY] + cy*gridj->sy > by1)
45
Taking false branch
63
Taking false branch
5181 {
5182 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
5183 }
5184 else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
46
Taking false branch
64
Taking false branch
5185 {
5186 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
5187 }
5188 if (c1 > c0 && d2zxy < rl2)
5189 {
5190 cs = c0 + (int)(bz1_frac*(c1 - c0));
5191 if (cs >= c1)
5192 {
5193 cs = c1 - 1;
5194 }
5195
5196 d2xy = d2zxy - d2z;
5197
5198 /* Find the lowest cell that can possibly
5199 * be within range.
5200 */
5201 cf = cs;
5202 while (cf > c0 &&
5203 (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
5204 d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
5205 {
5206 cf--;
5207 }
5208
5209 /* Find the highest cell that can possibly
5210 * be within range.
5211 */
5212 cl = cs;
5213 while (cl < c1-1 &&
5214 (bbcz_j[cl*NNBSBB_D] <= bz1 ||
5215 d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
5216 {
5217 cl++;
5218 }
5219
5220#ifdef NBNXN_REFCODE
5221 {
5222 /* Simple reference code, for debugging,
5223 * overrides the more complex code above.
5224 */
5225 int k;
5226 cf = c1;
5227 cl = -1;
5228 for (k = c0; k < c1; k++)
5229 {
5230 if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
5231 k < cf)
5232 {
5233 cf = k;
5234 }
5235 if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
5236 k > cl)
5237 {
5238 cl = k;
5239 }
5240 }
5241 }
5242#endif
5243
5244 if (gridi == gridj)
5245 {
5246 /* We want each atom/cell pair only once,
5247 * only use cj >= ci.
5248 */
5249#ifndef NBNXN_SHIFT_BACKWARD
5250 cf = max(cf, ci);
5251 #else
5252 if (shift == CENTRAL)
5253 {
5254 cf = max(cf, ci);
5255 }
5256#endif
5257 }
5258
5259 if (cf <= cl)
5260 {
5261 /* For f buffer flags with simple lists */
5262 ncj_old_j = nbl->ncj;
5263
5264 switch (nb_kernel_type)
5265 {
5266 case nbnxnk4x4_PlainC:
5267 check_subcell_list_space_simple(nbl, cl-cf+1);
5268
5269 make_cluster_list_simple(gridj,
5270 nbl, ci, cf, cl,
5271 (gridi == gridj && shift == CENTRAL),
5272 nbat->x,
5273 rl2, rbb2,
5274 &ndistc);
5275 break;
5276#ifdef GMX_NBNXN_SIMD_4XN
5277 case nbnxnk4xN_SIMD_4xN:
5278 check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
5279 make_cluster_list_simd_4xn(gridj,
5280 nbl, ci, cf, cl,
5281 (gridi == gridj && shift == CENTRAL),
5282 nbat->x,
5283 rl2, rbb2,
5284 &ndistc);
5285 break;
5286#endif
5287#ifdef GMX_NBNXN_SIMD_2XNN
5288 case nbnxnk4xN_SIMD_2xNN:
5289 check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
5290 make_cluster_list_simd_2xnn(gridj,
5291 nbl, ci, cf, cl,
5292 (gridi == gridj && shift == CENTRAL),
5293 nbat->x,
5294 rl2, rbb2,
5295 &ndistc);
5296 break;
5297#endif
5298 case nbnxnk8x8x8_PlainC:
5299 case nbnxnk8x8x8_CUDA:
5300 check_subcell_list_space_supersub(nbl, cl-cf+1);
5301 for (cj = cf; cj <= cl; cj++)
5302 {
5303 make_cluster_list_supersub(gridi, gridj,
5304 nbl, ci, cj,
5305 (gridi == gridj && shift == CENTRAL && ci == cj),
5306 nbat->xstride, nbat->x,
5307 rl2, rbb2,
5308 &ndistc);
5309 }
5310 break;
5311 }
5312 ncpcheck += cl - cf + 1;
5313
5314 if (bFBufferFlag && nbl->ncj > ncj_old_j)
5315 {
5316 int cbf, cbl, cb;
5317
5318 cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
5319 cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
5320 for (cb = cbf; cb <= cbl; cb++)
5321 {
5322 gridj_flag[cb] = 1U<<th;
5323 }
5324 }
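                        /* Editor's illustration, made-up shift value: with
                         * gridj_flag_shift = 2 each flag block covers 4
                         * j-clusters, so cj = 9 falls in block 9>>2 = 2;
                         * gridj_flag[cb] = 1U<<th then marks every block in
                         * [cbf, cbl] as written by this thread, for
                         * reduce_buffer_flags() to OR together later.
                         */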
5325 }
5326 }
5327 }
5328 }
5329
5330 /* Set the exclusions for this ci list */
5331 if (nbl->bSimple)
Events 51, 67: Taking true branch
5332 {
5333 set_ci_top_excls(nbs,
5334 nbl,
5335 shift == CENTRAL && gridi == gridj,
5336 gridj->na_c_2log,
5337 na_cj_2log,
5338 &(nbl->ci[nbl->nci]),
5339 excl);
5340
5341 if (nbs->bFEP)
Events 52, 68: Taking false branch
5342 {
5343 make_fep_list(nbs, nbat, nbl,
5344 shift == CENTRAL && gridi == gridj,
5345 &(nbl->ci[nbl->nci]),
5346 gridi, gridj, nbl_fep);
5347 }
5348 }
5349 else
5350 {
5351 set_sci_top_excls(nbs,
5352 nbl,
5353 shift == CENTRAL && gridi == gridj,
5354 gridj->na_c_2log,
5355 &(nbl->sci[nbl->nsci]),
5356 excl);
5357
5358 if (nbs->bFEP)
5359 {
5360 make_fep_list_supersub(nbs, nbat, nbl,
5361 shift == CENTRAL && gridi == gridj,
5362 &(nbl->sci[nbl->nsci]),
5363 shx, shy, shz,
5364 rl_fep2,
5365 gridi, gridj, nbl_fep);
5366 }
5367 }
5368
5369 /* Close this ci list */
5370 if (nbl->bSimple)
Events 53, 69: Taking true branch
5371 {
5372 close_ci_entry_simple(nbl);
5373 }
5374 else
5375 {
5376 close_ci_entry_supersub(nbl,
5377 nsubpair_max,
5378 progBal, min_ci_balanced,
5379 th, nth);
5380 }
5381 }
5382 }
5383 }
5384
5385 if (bFBufferFlag && nbl->ncj > ncj_old_i)
5386 {
5387 work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
5388 }
5389 }
5390
5391 work->ndistc = ndistc;
5392
5393 nbs_cycle_stop(&work->cc[enbsCCsearch]);
5394
5395 if (debug)
5396 {
5397 fprintf(debug, "number of distance checks %d\n", ndistc);
5398 fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
5399 ncpcheck);
5400
5401 if (nbl->bSimple)
5402 {
5403 print_nblist_statistics_simple(debug, nbl, nbs, rlist);
5404 }
5405 else
5406 {
5407 print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
5408 }
5409
5410 if (nbs->bFEP)
5411 {
5412 fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
5413 }
5414 }
5415}
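
The cf/cl loops above (source lines 5201 to 5218) trim a grid column to the cells whose bounding boxes can still lie within the cut-off. A hedged, self-contained sketch of the same trim over a simplified per-cell z-bounds layout; bb_lo, bb_hi, sqr_f and trim_column are illustrative names, not GROMACS API:

/* bb_lo[k]/bb_hi[k] are the z-bounds of cell k (the two NNBSBB_D
 * entries of bbcz_j), bz0..bz1 the i-cluster z-range, d2xy the x/y
 * distance already accumulated for this column, rl2 the cut-off
 * squared, cs the starting estimate between c0 and c1.
 */
static float sqr_f(float x)
{
    return x*x;
}

static void trim_column(const float *bb_lo, const float *bb_hi,
                        int c0, int c1, int cs,
                        float bz0, float bz1, float d2xy, float rl2,
                        int *cf, int *cl)
{
    /* Walk down from cs to the lowest cell that can be in range */
    *cf = cs;
    while (*cf > c0 &&
           (bb_hi[*cf] >= bz0 ||
            d2xy + sqr_f(bb_hi[*cf] - bz0) < rl2))
    {
        (*cf)--;
    }

    /* Walk up from cs to the highest cell that can be in range */
    *cl = cs;
    while (*cl < c1 - 1 &&
           (bb_lo[*cl] <= bz1 ||
            d2xy + sqr_f(bb_lo[*cl] - bz1) < rl2))
    {
        (*cl)++;
    }
}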
5416
5417static void reduce_buffer_flags(const nbnxn_search_t nbs,
5418 int nsrc,
5419 const nbnxn_buffer_flags_t *dest)
5420{
5421 int s, b;
5422 const unsigned int *flag;
5423
5424 for (s = 0; s < nsrc; s++)
5425 {
5426 flag = nbs->work[s].buffer_flags.flag;
5427
5428 for (b = 0; b < dest->nflag; b++)
5429 {
5430 dest->flag[b] |= flag[b];
5431 }
5432 }
5433}
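
Each thread's flag word is 1U<<th (set during the search above), so reduce_buffer_flags() builds, per block, a bitmask of every thread that touched that block. A hedged toy illustration with made-up sizes, independent of the GROMACS types:

#include <stdio.h>

/* Three source threads each mark the blocks they touched; the
 * OR-reduction below mirrors reduce_buffer_flags().
 */
#define NBLOCK 4
#define NSRC   3

int main(void)
{
    unsigned int src[NSRC][NBLOCK] = {
        {1, 0, 0, 1},   /* thread 0 (mask 1U<<0) touched blocks 0 and 3 */
        {0, 2, 0, 2},   /* thread 1 (mask 1U<<1) touched blocks 1 and 3 */
        {0, 0, 4, 0},   /* thread 2 (mask 1U<<2) touched block 2 */
    };
    unsigned int dest[NBLOCK] = {0};
    int          s, b;

    for (s = 0; s < NSRC; s++)
    {
        for (b = 0; b < NBLOCK; b++)
        {
            dest[b] |= src[s][b];
        }
    }
    for (b = 0; b < NBLOCK; b++)
    {
        printf("block %d: mask 0x%x\n", b, dest[b]); /* block 3 -> 0x3 */
    }
    return 0;
}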
5434
5435static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
5436{
5437 int nelem, nkeep, ncopy, nred, b, c, out;
5438
5439 nelem = 0;
5440 nkeep = 0;
5441 ncopy = 0;
5442 nred = 0;
5443 for (b = 0; b < flags->nflag; b++)
5444 {
5445 if (flags->flag[b] == 1)
5446 {
5447 /* Only flag 0 is set, no copy or reduction required */
5448 nelem++;
5449 nkeep++;
5450 }
5451 else if (flags->flag[b] > 0)
5452 {
5453 c = 0;
5454 for (out = 0; out < nout; out++)
5455 {
5456 if (flags->flag[b] & (1U<<out))
5457 {
5458 c++;
5459 }
5460 }
5461 nelem += c;
5462 if (c == 1)
5463 {
5464 ncopy++;
5465 }
5466 else
5467 {
5468 nred += c;
5469 }
5470 }
5471 }
5472
5473 fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
5474 flags->nflag, nout,
5475 nelem/(double)(flags->nflag),
5476 nkeep/(double)(flags->nflag),
5477 ncopy/(double)(flags->nflag),
5478 nred/(double)(flags->nflag));
5479}
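
The inner c loop above is a plain population count of flags->flag[b] over the nout outputs. A hedged equivalent (not taken from the source) that counts the same set bits with the Kernighan trick:

/* Kernighan bit count: mask &= mask - 1 clears the lowest set bit,
 * so the loop runs once per set bit instead of once per output.
 */
static int count_set_outputs(unsigned int mask)
{
    int c = 0;

    while (mask)
    {
        mask &= mask - 1;
        c++;
    }
    return c;
}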
5480
5481/* Perform a counting (linear-time) sort that moves the smaller lists
5482 * to the end. This avoids load imbalance on the GPU, as the large
5483 * lists are scheduled and executed first and the smaller lists later.
5484 * Load balancing between multi-processors only happens at the end,
5485 * where smaller lists lead to more effective load balancing.
5486 * The sort key is the cj4 count, not the actual pair count.
5487 * Not only does this make the sort faster, it also results in
5488 * better load balancing than sorting on the exact load.
5489 * This function swaps the sci pointers in the pair list to avoid a copy.
5490 */
5491static void sort_sci(nbnxn_pairlist_t *nbl)
5492{
5493 nbnxn_list_work_t *work;
5494 int m, i, s, s0, s1;
5495 nbnxn_sci_t *sci_sort;
5496
5497 if (nbl->ncj4 <= nbl->nsci)
5498 {
5499 /* nsci = 0 or all sci have size 1, sorting won't change the order */
5500 return;
5501 }
5502
5503 work = nbl->work;
5504
5505 /* We will distinguish differences up to double the average */
5506 m = (2*nbl->ncj4)/nbl->nsci;
5507
5508 if (m + 1 > work->sort_nalloc)
5509 {
5510 work->sort_nalloc = over_alloc_large(m + 1);
5511 srenew(work->sort, work->sort_nalloc);
5512 }
5513
5514 if (work->sci_sort_nalloc != nbl->sci_nalloc)
5515 {
5516 work->sci_sort_nalloc = nbl->sci_nalloc;
5517 nbnxn_realloc_void((void **)&work->sci_sort,
5518 0,
5519 work->sci_sort_nalloc*sizeof(*work->sci_sort),
5520 nbl->alloc, nbl->free);
5521 }
5522
5523 /* Count the entries of each size */
5524 for (i = 0; i <= m; i++)
5525 {
5526 work->sort[i] = 0;
5527 }
5528 for (s = 0; s < nbl->nsci; s++)
5529 {
5530 i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
5531 work->sort[i]++;
5532 }
5533 /* Calculate the offset for each count */
5534 s0 = work->sort[m];
5535 work->sort[m] = 0;
5536 for (i = m - 1; i >= 0; i--)
5537 {
5538 s1 = work->sort[i];
5539 work->sort[i] = work->sort[i + 1] + s0;
5540 s0 = s1;
5541 }
5542
5543 /* Sort entries directly into place */
5544 sci_sort = work->sci_sort;
5545 for (s = 0; s < nbl->nsci; s++)
5546 {
5547 i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
5548 sci_sort[work->sort[i]++] = nbl->sci[s];
5549 }
5550
5551 /* Swap the sci pointers so we use the new, sorted list */
5552 work->sci_sort = nbl->sci;
5553 nbl->sci = sci_sort;
5554}
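
Taken in isolation, the sort above is a counting sort on the clamped cj4 count, with the largest keys placed first. A hedged, self-contained sketch of the same scheme; key, n, m, count and order are illustrative names, not from the source:

/* key[s] plays the role of the cj4 count of entry s, clamped to m;
 * order receives the entry indices, largest keys first.
 */
static void count_sort_desc(const int *key, int n, int m,
                            int *count,  /* scratch, size m+1 */
                            int *order)  /* output, size n */
{
    int i, s, s0, s1, k;

    /* Count the entries of each clamped size */
    for (i = 0; i <= m; i++)
    {
        count[i] = 0;
    }
    for (s = 0; s < n; s++)
    {
        k = (key[s] < m ? key[s] : m);
        count[k]++;
    }
    /* Turn the counts into offsets, largest key first
     * (mirrors source lines 5534-5541)
     */
    s0       = count[m];
    count[m] = 0;
    for (i = m - 1; i >= 0; i--)
    {
        s1       = count[i];
        count[i] = count[i + 1] + s0;
        s0       = s1;
    }
    /* Scatter the entries directly into sorted order */
    for (s = 0; s < n; s++)
    {
        k = (key[s] < m ? key[s] : m);
        order[count[k]++] = s;
    }
}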
5555
5556/* Make a local or non-local pair-list, depending on iloc */
5557void nbnxn_make_pairlist(const nbnxn_search_t nbs,
5558 nbnxn_atomdata_t *nbat,
5559 const t_blocka *excl,
5560 real rlist,
5561 int min_ci_balanced,
5562 nbnxn_pairlist_set_t *nbl_list,
5563 int iloc,
5564 int nb_kernel_type,
5565 t_nrnb *nrnb)
5566{
5567 nbnxn_grid_t *gridi, *gridj;
5568 gmx_bool bGPUCPU;
5569 int nzi, zi, zj0, zj1, zj;
5570 int nsubpair_max;
5571 int th;
5572 int nnbl;
5573 nbnxn_pairlist_t **nbl;
5574 int ci_block;
5575 gmx_bool CombineNBLists;
5576 gmx_bool progBal;
5577 int np_tot, np_noq, np_hlj, nap;
5578
5579 /* Check if we are running hybrid GPU + CPU nbnxn mode */
5580 bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
5581
5582 nnbl = nbl_list->nnbl;
5583 nbl = nbl_list->nbl;
5584 CombineNBLists = nbl_list->bCombined;
5585
5586 if (debug)
5587 {
5588 fprintf(debug, "ns making %d nblists\n", nnbl);
5589 }
5590
5591 nbat->bUseBufferFlags = (nbat->nout > 1);
5592 /* We should re-init the flags before making the first list */
5593 if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
5594 {
5595 init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
5596 }
5597
5598 if (nbl_list->bSimple)
5599 {
5600 switch (nb_kernel_type)
5601 {
5602#ifdef GMX_NBNXN_SIMD_4XN
5603 case nbnxnk4xN_SIMD_4xN:
5604 nbs->icell_set_x = icell_set_x_simd_4xn;
5605 break;
5606#endif
5607#ifdef GMX_NBNXN_SIMD_2XNN
5608 case nbnxnk4xN_SIMD_2xNN:
5609 nbs->icell_set_x = icell_set_x_simd_2xnn;
5610 break;
5611#endif
5612 default:
5613 nbs->icell_set_x = icell_set_x_simple;
5614 break;
5615 }
5616 }
5617 else
5618 {
5619#ifdef NBNXN_SEARCH_BB_SIMD4
5620 nbs->icell_set_x = icell_set_x_supersub_simd4;
5621#else
5622 nbs->icell_set_x = icell_set_x_supersub;
5623#endif
5624 }
5625
5626 if (LOCAL_I(iloc))
5627 {
5628 /* Only zone (grid) 0 vs 0 */
5629 nzi = 1;
5630 zj0 = 0;
5631 zj1 = 1;
5632 }
5633 else
5634 {
5635 nzi = nbs->zones->nizone;
5636 }
5637
5638 if (!nbl_list->bSimple && min_ci_balanced > 0)
5639 {
5640 nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
5641 }
5642 else
5643 {
5644 nsubpair_max = 0;
5645 }
5646
5647 /* Clear all pair-lists */
5648 for (th = 0; th < nnbl; th++)
5649 {
5650 clear_pairlist(nbl[th]);
5651
5652 if (nbs->bFEP)
5653 {
5654 clear_pairlist_fep(nbl_list->nbl_fep[th]);
5655 }
5656 }
5657
5658 for (zi = 0; zi < nzi; zi++)
5659 {
5660 gridi = &nbs->grid[zi];
5661
5662 if (NONLOCAL_I(iloc))
5663 {
5664 zj0 = nbs->zones->izone[zi].j0;
5665 zj1 = nbs->zones->izone[zi].j1;
5666 if (zi == 0)
5667 {
5668 zj0++;
5669 }
5670 }
5671 for (zj = zj0; zj < zj1; zj++)
5672 {
5673 gridj = &nbs->grid[zj];
5674
5675 if (debug)
5676 {
5677 fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
5678 }
5679
5680 nbs_cycle_start(&nbs->cc[enbsCCsearch]);
5681
5682 if (nbl[0]->bSimple && !gridi->bSimple)
5683 {
5684 /* Hybrid list, determine blocking later */
5685 ci_block = 0;
5686 }
5687 else
5688 {
5689 ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
5690 }
5691
5692 /* With GPU: generate progressively smaller lists for
5693 * load balancing, for the local list or for non-local with <= 2 zones.
5694 */
5695 progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
5696
5697#pragma omp parallel for num_threads(nnbl) schedule(static)
5698 for (th = 0; th < nnbl; th++)
5699 {
5700 /* Re-init the thread-local work flag data before making
5701 * the first list (not an elegant conditional).
5702 */
5703 if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
5704 (bGPUCPU && zi == 0 && zj == 1)))
5705 {
5706 init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
5707 }
5708
5709 if (CombineNBLists && th > 0)
5710 {
5711 clear_pairlist(nbl[th]);
5712 }
5713
5714 /* Divide the i super cells equally over the nblists */
5715 nbnxn_make_pairlist_part(nbs, gridi, gridj,
5716 &nbs->work[th], nbat, excl,
5717 rlist,
5718 nb_kernel_type,
5719 ci_block,
5720 nbat->bUseBufferFlags,
5721 nsubpair_max,
5722 progBal, min_ci_balanced,
5723 th, nnbl,
5724 nbl[th],
5725 nbl_list->nbl_fep[th]);
5726 }
5727 nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
5728
5729 np_tot = 0;
5730 np_noq = 0;
5731 np_hlj = 0;
5732 for (th = 0; th < nnbl; th++)
5733 {
5734 inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
5735
5736 if (nbl_list->bSimple)
5737 {
5738 np_tot += nbl[th]->ncj;
5739 np_noq += nbl[th]->work->ncj_noq;
5740 np_hlj += nbl[th]->work->ncj_hlj;
5741 }
5742 else
5743 {
5744 /* This count ignores potential subsequent pair pruning */
5745 np_tot += nbl[th]->nci_tot;
5746 }
5747 }
5748 nap = nbl[0]->na_ci*nbl[0]->na_cj;
5749 nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
5750 nbl_list->natpair_lj = np_noq*nap;
5751 nbl_list->natpair_q = np_hlj*nap/2;
5752
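            /* Editor's worked example, made-up counts: with 4x4 clusters
             * nap = 16; for np_tot = 1000 cluster pairs, of which
             * np_noq = 200 are charge-free and np_hlj = 100 half-LJ,
             * the formulas above give
             *   natpair_ljq = (1000 - 200)*16 - 100*16/2 = 12000,
             *   natpair_lj  = 200*16                     = 3200,
             *   natpair_q   = 100*16/2                   = 800
             * atom pairs. Only the formulas come from the code.
             */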
5753 if (CombineNBLists && nnbl > 1)
5754 {
5755 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
5756
5757 combine_nblists(nnbl-1, nbl+1, nbl[0]);
5758
5759 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
5760 }
5761 }
5762 }
5763
5764 if (!nbl_list->bSimple)
5765 {
5766 /* Sort the entries on size, large ones first */
5767 if (CombineNBLists || nnbl == 1)
5768 {
5769 sort_sci(nbl[0]);
5770 }
5771 else
5772 {
5773#pragma omp parallel for num_threads(nnbl) schedule(static)
5774 for (th = 0; th < nnbl; th++)
5775 {
5776 sort_sci(nbl[th]);
5777 }
5778 }
5779 }
5780
5781 if (nbat->bUseBufferFlags)
5782 {
5783 reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
5784 }
5785
5786 if (nbs->bFEP)
5787 {
5788 /* Balance the free-energy lists over all the threads */
5789 balance_fep_lists(nbs, nbl_list);
5790 }
5791
5792 /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
5793 if (LOCAL_I(iloc))
5794 {
5795 nbs->search_count++;
5796 }
5797 if (nbs->print_cycles &&
5798 (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
5799 nbs->search_count % 100 == 0)
5800 {
5801 nbs_cycle_print(stderr, nbs);
5802 }
5803
5804 if (debug && (CombineNBLists && nnbl > 1))
5805 {
5806 if (nbl[0]->bSimple)
5807 {
5808 print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
5809 }
5810 else
5811 {
5812 print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
5813 }
5814 }
5815
5816 if (debug)
5817 {
5818 if (gmx_debug_at)
5819 {
5820 if (nbl[0]->bSimple)
5821 {
5822 print_nblist_ci_cj(debug, nbl[0]);
5823 }
5824 else
5825 {
5826 print_nblist_sci_cj(debug, nbl[0]);
5827 }
5828 }
5829
5830 if (nbat->bUseBufferFlags)
5831 {
5832 print_reduction_cost(&nbat->buffer_flags, nnbl);
5833 }
5834 }
5835}
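
As a compact illustration of the zi/zj loops above: the local search does grid 0 vs 0 only, while the non-local search pairs each i-zone with its izone[zi].j0..j1-1 j-zones, skipping 0 vs 0. A hedged toy enumeration with made-up zone ranges (izone_j0/izone_j1 are not GROMACS names):

#include <stdio.h>

/* Toy enumeration of the grid pairs searched, for 4 hypothetical
 * domain-decomposition zones with invented j-ranges.
 */
int main(void)
{
    int izone_j0[4] = {0, 1, 2, 3};
    int izone_j1[4] = {4, 2, 4, 4};
    int zi, zj, zj0;

    /* Local list: only grid 0 vs 0 */
    printf("local: grid 0 vs 0\n");

    /* Non-local list: each i-zone vs its j-zone range, skipping 0 vs 0 */
    for (zi = 0; zi < 4; zi++)
    {
        zj0 = izone_j0[zi];
        if (zi == 0)
        {
            zj0++; /* 0 vs 0 was handled by the local search */
        }
        for (zj = zj0; zj < izone_j1[zi]; zj++)
        {
            printf("non-local: grid %d vs %d\n", zi, zj);
        }
    }
    return 0;
}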