/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of
 * GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 */
#include "nbnxn_consts.h"
#include "nbnxn_internal.h"
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "gmx_cyclecounter.h"
#include "gmx_omp_nthreads.h"
/* Pair search box lower and upper corner in x,y,z.
 * Store this in 4 instead of 3 reals, which is useful with SSE.
 * To avoid complicating the code we also use 4 without SSE.
 */
#define NNBSBB_B (2*NNBSBB_C)
/* Pair search box lower and upper bound in z only. */
/* Pair search box lower and upper corner x,y,z indices */
#ifdef NBNXN_SEARCH_SSE

#define NBNXN_SEARCH_SSE_SINGLE

/* Include basic SSE2 stuff */
#include <emmintrin.h>

#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
 * Here AVX-256 turns out to be slightly slower than AVX-128.
 */
#define STRIDE_8BB_2LOG 2
#endif

#endif /* NBNXN_SEARCH_SSE */
/* The functions below are macros as they are performance sensitive */

/* 4x4 list, pack=4: no complex conversion required */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J4(ci) (ci)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J4(ci) ((ci)*STRIDE_P4)
#define X_IND_CJ_J4(cj) ((cj)*STRIDE_P4)
/* 4x2 list, pack=4: j-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J2(ci) ((ci)<<1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J2(ci) ((ci)*STRIDE_P4)
#define X_IND_CJ_J2(cj) (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))

/* 4x8 list, pack=8: i-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J8(ci) ((ci)>>1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J8(ci) (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
#define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
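
/* Worked example (an illustrative sketch, assuming PACK_X4 == 4 and
 * STRIDE_P4 == DIM*PACK_X4 == 12 as in nbnxn_atomdata.h): in the 4x2 list,
 * j-cluster cj=5 holds atoms 10 and 11, the second half of x-pack 2.
 * X_IND_CJ_J2(5) = (5>>1)*12 + (5&1)*(4>>1) = 24 + 2 = 26, which is where
 * the x coordinates of atoms 10,11 start in the packed x4 y4 z4 layout.
 */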
/* The j-cluster size is matched to the SIMD width */
#if GMX_NBNXN_SIMD_BITWIDTH == 128
#ifdef GMX_DOUBLE
#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci)
#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci)
#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj)
#else
#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
#endif
#else
#if GMX_NBNXN_SIMD_BITWIDTH == 256
#ifdef GMX_DOUBLE
#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
#else
#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci)
#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci)
#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj)
/* Half-width SIMD: the j-cluster size is half the SIMD width */
#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
#endif
#else
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
#endif
#endif /* GMX_NBNXN_SIMD */
/* Interaction masks for 4xN atom interactions.
 * Bit i*CJ_SIZE + j tells if atom i and j interact.
 */
/* The "all" interaction mask is the same for all kernels */
#define NBNXN_INT_MASK_ALL 0xffffffff
/* 4x4 kernel diagonal mask */
#define NBNXN_INT_MASK_DIAG 0x08ce
/* 4x2 kernel diagonal masks */
#define NBNXN_INT_MASK_DIAG_J2_0 0x0002
#define NBNXN_INT_MASK_DIAG_J2_1 0x002F
/* 4x8 kernel diagonal masks */
#define NBNXN_INT_MASK_DIAG_J8_0 0xf0f8fcfe
#define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
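
/* Derivation sketch for the 4x4 diagonal mask (illustrative): with bit
 * i*CJ_SIZE+j and CJ_SIZE 4, keeping only pairs with j > i sets bits
 * 1,2,3 (i=0), 6,7 (i=1) and 11 (i=2), i.e. binary 0000 1000 1100 1110,
 * which is the 0x08ce above; self pairs and i > j pairs are masked out.
 */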
#ifdef NBNXN_SEARCH_SSE
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
/* Size of a bounding box corners quadruplet */
#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_8BB)
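/* Layout sketch (assuming NNBSBB_D == 2, DIM == 3 and STRIDE_8BB == 4,
 * so 24 floats per quadruplet):
 *   bb[0..3]   x lower bounds of 4 boxes    bb[12..15] x upper bounds
 *   bb[4..7]   y lower bounds               bb[16..19] y upper bounds
 *   bb[8..11]  z lower bounds               bb[20..23] z upper bounds
 * This matches the 0*STRIDE_8BB ... 5*STRIDE_8BB indexing used in
 * calc_bounding_box_xxxx below.
 */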
/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
#define NBNXN_SHIFT_BACKWARD

/* This define is a lazy way to avoid interdependence of the grid
 * and searching data structures.
 */
#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
static void nbs_cycle_clear(nbnxn_cycle_t *cc)
    for(i=0; i<enbsCCnr; i++)
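
/* Returns the average number of megacycles per call */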
static double Mcyc_av(const nbnxn_cycle_t *cc)
    return (double)cc->c*1e-6/cc->count;
static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
    fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
            nbs->cc[enbsCCgrid].count,
            Mcyc_av(&nbs->cc[enbsCCgrid]),
            Mcyc_av(&nbs->cc[enbsCCsearch]),
            Mcyc_av(&nbs->cc[enbsCCreducef]));

    if (nbs->nthread_max > 1)
        if (nbs->cc[enbsCCcombine].count > 0)
            fprintf(fp," comb %5.2f",
                    Mcyc_av(&nbs->cc[enbsCCcombine]));
        fprintf(fp," s. th");
        for(t=0; t<nbs->nthread_max; t++)
                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
static void nbnxn_grid_init(nbnxn_grid_t * grid)
    grid->cxy_ind = NULL;
    grid->cxy_nalloc = 0;

static int get_2log(int n)
    while ((1<<log2) < n)
        gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
    switch (nb_kernel_type)
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            return NBNXN_CPU_CLUSTER_I_SIZE;
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
            /* The cluster size for super/sub lists is only set here.
             * Any value should work for the pair-search and atomdata code.
             * The kernels, of course, might require a particular value.
             */
            return NBNXN_GPU_CLUSTER_SIZE;
            gmx_incons("unknown kernel type");
int nbnxn_kernel_to_cj_size(int nb_kernel_type)
    int nbnxn_simd_width = 0;

#ifdef GMX_NBNXN_SIMD
    nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
#endif

    switch (nb_kernel_type)
        case nbnxnk4x4_PlainC:
            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
        case nbnxnk4xN_SIMD_4xN:
            cj_size = nbnxn_simd_width;
        case nbnxnk4xN_SIMD_2xNN:
            cj_size = nbnxn_simd_width/2;
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
            cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
            gmx_incons("unknown kernel type");
static int ci_to_cj(int na_cj_2log,int ci)
        case 2: return ci; break;
        case 1: return (ci<<1); break;
        case 3: return (ci>>1); break;
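
/* Mapping sketch (illustrative; the i-cluster size is 4 here and na_cj_2log
 * is the log2 of the j-cluster size):
 *   na_cj_2log == 2 -> na_cj == 4: same size, cj == ci
 *   na_cj_2log == 1 -> na_cj == 2: half size, two j-clusters per i-cluster
 *   na_cj_2log == 3 -> na_cj == 8: double size, one j-cluster per two i-clusters
 */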
gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
    if (nb_kernel_type == nbnxnkNotSet)
        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");

    switch (nb_kernel_type)
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            gmx_incons("Invalid nonbonded kernel type passed!");
void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
                       gmx_domdec_zones_t *zones,
    nbs->DomDec = (n_dd_cells != NULL);

    clear_ivec(nbs->dd_dim);
            if ((*n_dd_cells)[d] > 1)

    /* Each grid matches a DD zone */
    snew(nbs->grid,nbs->ngrid);
    for(g=0; g<nbs->ngrid; g++)
        nbnxn_grid_init(&nbs->grid[g]);
    nbs->cell_nalloc = 0;

    nbs->nthread_max = nthread_max;

    /* Initialize the work data structures for each thread */
    snew(nbs->work,nbs->nthread_max);
    for(t=0; t<nbs->nthread_max; t++)
        nbs->work[t].cxy_na = NULL;
        nbs->work[t].cxy_na_nalloc = 0;
        nbs->work[t].sort_work = NULL;
        nbs->work[t].sort_work_nalloc = 0;

    /* Initialize detailed nbsearch cycle counting */
    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
    nbs->search_count = 0;
    nbs_cycle_clear(nbs->cc);
    for(t=0; t<nbs->nthread_max; t++)
        nbs_cycle_clear(nbs->work[t].cc);
static real grid_atom_density(int n,rvec corner0,rvec corner1)
    rvec_sub(corner1,corner0,size);

    return n/(size[XX]*size[YY]*size[ZZ]);
static int set_grid_size_xy(const nbnxn_search_t nbs,
                            int n,rvec corner0,rvec corner1,
    real adens,tlen,tlen_x,tlen_y,nc_max;

    rvec_sub(corner1,corner0,size);

    /* target cell length */
        /* To minimize the zero interactions, we should make
         * the largest of the i/j cell cubic.
         */
        na_c = max(grid->na_c,grid->na_cj);

        /* Approximately cubic cells */
        tlen = pow(na_c/atom_density,1.0/3.0);
        /* Approximately cubic sub cells */
        tlen = pow(grid->na_c/atom_density,1.0/3.0);
        tlen_x = tlen*GPU_NSUBCELL_X;
        tlen_y = tlen*GPU_NSUBCELL_Y;
    /* We round ncx and ncy down, because we get fewer cell pairs
     * in the pair list when the fixed cell dimensions (x,y) are
     * larger than the variable one (z) than the other way around.
     */
    grid->ncx = max(1,(int)(size[XX]/tlen_x));
    grid->ncy = max(1,(int)(size[YY]/tlen_y));
    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
        srenew(grid->cxy_na,grid->cxy_nalloc);
        srenew(grid->cxy_ind,grid->cxy_nalloc+1);
    for(t=0; t<nbs->nthread_max; t++)
        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
            srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);

    /* Worst case scenario of 1 atom in each last cell */
    if (grid->na_cj <= grid->na_c)
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;

    if (nc_max > grid->nc_nalloc)
        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc,grid->nc_nalloc);
        srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, which avoids possible
         * floating exceptions in SSE with the unused bb elements.
         */
        snew_aligned(grid->bb,bb_nalloc,16);

        if (grid->na_cj == grid->na_c)
            grid->bbj = grid->bb;
            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);

        srenew(grid->flags,grid->nc_nalloc);

    copy_rvec(corner0,grid->c0);
    copy_rvec(corner1,grid->c1);
    grid->sx = size[XX]/grid->ncx;
    grid->sy = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;
#define SORT_GRID_OVERSIZE 2
#define SGSF (SORT_GRID_OVERSIZE + 1)
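
/* Sizing sketch (illustrative): sort_atoms below bins up to n atoms into
 * roughly n*SGSF slots, i.e. an oversize factor of SORT_GRID_OVERSIZE for
 * spreading plus one extra slot per atom as headroom for collisions.
 * Oversizing keeps slot collisions, and thus insertion shifts, rare, while
 * the array stays small enough that clearing it each call is cheap.
 */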
static void sort_atoms(int dim,gmx_bool Backwards,
                       int *a,int n,rvec *x,
                       real h0,real invh,int nsort,int *sort)
    /* For small oversize factors clearing the whole area is fastest.
     * For large oversize we should clear the used elements after use.
     */
    for(i=0; i<nsort; i++)

    /* Sort the particles using a simple index sort */
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        zi = (int)((x[a[i]][dim] - h0)*invh);

#ifdef DEBUG_NBNXN_GRIDDING
        if (zi < 0 || zi >= nsort)
            gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
                      a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
#endif

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order,
             * to ensure binary reproducibility after restarts.
             */
            while(sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
                                     (x[a[i]][dim] == x[sort[zi]][dim] &&

                /* Shift all elements by one slot until we find an empty slot */
                while (sort[zim] >= 0)

    for(zi=0; zi<nsort; zi++)
        for(zi=nsort-1; zi>=0; zi--)
        gmx_incons("Lost particles while sorting");
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
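
/* These round a real down (R2F_D) or up (R2F_U) by one float epsilon before
 * the double-to-float conversion, so bounding boxes computed in double can
 * only grow, never shrink, when stored as floats.  Example (a sketch): for a
 * lower bound xl = 2.0, R2F_D gives (1-GMX_FLOAT_EPS)*2.0, slightly below 2.0;
 * for an upper bound xh = 2.0, R2F_U gives a value slightly above.  Negative
 * values swap the factors, since rounding "down" means more negative.
 */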
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na,int stride,const real *x,float *bb)
    real xl,xh,yl,yh,zl,zh;

        xl = min(xl,x[i+XX]);
        xh = max(xh,x[i+XX]);
        yl = min(yl,x[i+YY]);
        yh = max(yh,x[i+YY]);
        zl = min(zl,x[i+ZZ]);
        zh = max(zh,x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
    real xl,xh,yl,yh,zl,zh;

        xl = min(xl,x[j+XX*PACK_X4]);
        xh = max(xh,x[j+XX*PACK_X4]);
        yl = min(yl,x[j+YY*PACK_X4]);
        yh = max(yh,x[j+YY*PACK_X4]);
        zl = min(zl,x[j+ZZ*PACK_X4]);
        zh = max(zh,x[j+ZZ*PACK_X4]);

    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
    real xl,xh,yl,yh,zl,zh;

        xl = min(xl,x[j+XX*PACK_X8]);
        xh = max(xh,x[j+XX*PACK_X8]);
        yl = min(yl,x[j+YY*PACK_X8]);
        yh = max(yh,x[j+YY*PACK_X8]);
        zl = min(zl,x[j+ZZ*PACK_X8]);
        zh = max(zh,x[j+ZZ*PACK_X8]);

    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);
#ifdef NBNXN_SEARCH_SSE

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na,const real *x,
                                          float *bb,float *bbj)
    calc_bounding_box_x_x4(min(na,2),x,bbj);
        calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
        _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));

    _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
                                        _mm_load_ps(bbj+NNBSBB_B)));
    _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
                                        _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
    real xl,xh,yl,yh,zl,zh;

        xl = min(xl,x[i+XX]);
        xh = max(xh,x[i+XX]);
        yl = min(yl,x[i+YY]);
        yh = max(yh,x[i+YY]);
        zl = min(zl,x[i+ZZ]);
        zh = max(zh,x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb[0*STRIDE_8BB] = R2F_D(xl);
    bb[1*STRIDE_8BB] = R2F_D(yl);
    bb[2*STRIDE_8BB] = R2F_D(zl);
    bb[3*STRIDE_8BB] = R2F_U(xh);
    bb[4*STRIDE_8BB] = R2F_U(yh);
    bb[5*STRIDE_8BB] = R2F_U(zh);

#endif /* NBNXN_SEARCH_SSE */
#ifdef NBNXN_SEARCH_SSE_SINGLE

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_sse(int na,const float *x,float *bb)
    __m128 bb_0_SSE,bb_1_SSE;

    bb_0_SSE = _mm_load_ps(x);

        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
        bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
        bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);

    _mm_store_ps(bb  ,bb_0_SSE);
    _mm_store_ps(bb+4,bb_1_SSE);

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_sse(int na,const float *x,
    calc_bounding_box_sse(na,x,bb_work);

    bb[0*STRIDE_8BB] = bb_work[BBL_X];
    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
    bb[3*STRIDE_8BB] = bb_work[BBU_X];
    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
    bb[5*STRIDE_8BB] = bb_work[BBU_Z];

#endif /* NBNXN_SEARCH_SSE_SINGLE */
#ifdef NBNXN_SEARCH_SSE

/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
    __m128 min_SSE,max_SSE;

    for(i=0; i<grid->ncx*grid->ncy; i++)
        /* Starting bb in a column is expected to be 2-aligned */
        sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        nc2 = (grid->cxy_na[i]+3)>>(2+1);
        for(c2=sc2; c2<sc2+nc2; c2++)
            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
        if (((grid->cxy_na[i]+3)>>2) & 1)
            /* Copy the last bb for odd bb count in this column */
            for(j=0; j<NNBSBB_C; j++)
                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE *fp,
                                 const nbnxn_search_t nbs,
                                 const nbnxn_grid_t *grid)
    for(c=0; c<grid->nc; c++)
            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
    dsvmul(1.0/grid->nc,ba,ba);

    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/grid->ncx,
            nbs->box[YY][YY]/grid->ncy,
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
            ba[XX],ba[YY],ba[ZZ],
            ba[XX]*grid->ncx/nbs->box[XX][XX],
            ba[YY]*grid->ncy/nbs->box[YY][YY],
            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));

/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE *fp,
                                   const nbnxn_search_t nbs,
                                   const nbnxn_grid_t *grid)
    for(c=0; c<grid->nc; c++)
        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
            for(i=0; i<STRIDE_8BB; i++)
                    grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
                    grid->bb[cs_w*NNBSBB_XXXX+ d   *STRIDE_8BB+i];
        for(s=0; s<grid->nsubc[c]; s++)
            cs = c*GPU_NSUBCELL + s;
                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
                    grid->bb[cs*NNBSBB_B         +d];
        ns += grid->nsubc[c];
    dsvmul(1.0/ns,ba,ba);

    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
            ba[XX],ba[YY],ba[ZZ],
            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 * Also sets interaction flags.
 */
void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
                int a0,int a1,const int *atinfo,
    int subc,s,a,n1,n2,a_lj_max,i,j;
    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];

    for(s=a0; s<a1; s+=na_c)
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        for(a=s; a<min(s+na_c,a1); a++)
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
                sort1[n1++] = order[a];
                sort2[n2++] = order[a];

        /* If we don't have atoms with LJ, there's nothing to sort */
            *flags |= NBNXN_CI_DO_LJ(subc);

            /* Only sort when strictly necessary. Ordering particles
             * can lead to less accurate summation due to rounding,
             * both for LJ and Coulomb interactions.
             */
            if (2*(a_lj_max - s) >= na_c)
                    order[a0+i] = sort1[i];
                    order[a0+n1+j] = sort2[j];

                *flags |= NBNXN_CI_HALF_LJ(subc);
            *flags |= NBNXN_CI_DO_COUL(subc);
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
void fill_cell(const nbnxn_search_t nbs,
               nbnxn_atomdata_t *nbat,
               int sx,int sy,int sz,
        sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);

    /* Now we have sorted the atoms, set the cell indices */
    for(a=a0; a<a1; a++)
        nbs->cell[nbs->a[a]] = a;

    copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
                           nbat->XFormat,nbat->x,a0,

    if (nbat->XFormat == nbatX4)
        /* Store the bounding boxes as xyz.xyz. */
        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
        bb_ptr = grid->bb + offset;

#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
        if (2*grid->na_cj == grid->na_c)
            calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
                                          grid->bbj+offset*2);
            calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
    else if (nbat->XFormat == nbatX8)
        /* Store the bounding boxes as xyz.xyz. */
        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
    else if (!grid->bSimple)
        /* Store the bounding boxes in a format convenient
         * for SSE calculations: xxxxyyyyzzzz...
         */
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));

#ifdef NBNXN_SEARCH_SSE_SINGLE
        if (nbat->XFormat == nbatXYZQ)
            calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
            calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;

        calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,

        bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
        fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                (grid->bb+bbo*NNBSBB_B)[BBL_X],
                (grid->bb+bbo*NNBSBB_B)[BBU_X],
                (grid->bb+bbo*NNBSBB_B)[BBL_Y],
                (grid->bb+bbo*NNBSBB_B)[BBU_Y],
                (grid->bb+bbo*NNBSBB_B)[BBL_Z],
                (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start,int cxy_end,
    int cx,cy,cz,ncz,cfilled,c;

        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0,cxy_start,cxy_end,a0,a1);

    /* Sort the atoms within each x,y column in 3 dimensions */
    for(cxy=cxy_start; cxy<cxy_end; cxy++)
        cy = cxy - cx*grid->ncy;

        na = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ,FALSE,
                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
                   ncz*grid->na_sc*SGSF,sort_work);

        /* Fill the ncz cells in this column */
        cfilled = grid->cxy_ind[cxy];
        for(cz=0; cz<ncz; cz++)
            c = grid->cxy_ind[cxy] + cz;

            ash_c = ash + cz*grid->na_sc;
            na_c = min(grid->na_sc,na-(ash_c-ash));

            fill_cell(nbs,grid,nbat,
                      ash_c,ash_c+na_c,atinfo,x,
                      grid->na_sc*cx + (dd_zone >> 2),
                      grid->na_sc*cy + (dd_zone & 3),

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];

        /* Set the unused atom indices to -1 */
        for(ind=na; ind<ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start,int cxy_end,
    int cx,cy,cz=-1,c=-1,ncz;
    int na,ash,na_c,ind,a;
    int subdiv_z,sub_z,na_z,ash_z;
    int subdiv_y,sub_y,na_y,ash_y;
    int subdiv_x,sub_x,na_x,ash_x;

    /* cppcheck-suppress unassignedVariable */
    float bb_work_array[NNBSBB_B+3],*bb_work_align;

    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));

        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0,cxy_start,cxy_end,a0,a1);

    subdiv_x = grid->na_c;
    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for(cxy=cxy_start; cxy<cxy_end; cxy++)
        cy = cxy - cx*grid->ncy;

        na = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ,FALSE,
                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
                   ncz*grid->na_sc*SGSF,sort_work);

        /* This loop goes over the supercells and subcells along z at once */
        for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
            ash_z = ash + sub_z*subdiv_z;
            na_z = min(subdiv_z,na-(ash_z-ash));

            /* We have already sorted on z */
            if (sub_z % GPU_NSUBCELL_Z == 0)
                cz = sub_z/GPU_NSUBCELL_Z;
                c = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this supercell */
                na_c = min(grid->na_sc,na-(ash_z-ash));

                grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];

#if GPU_NSUBCELL_Y > 1
            /* Sort the atoms along y */
            sort_atoms(YY,(sub_z & 1),
                       nbs->a+ash_z,na_z,x,
                       grid->c0[YY]+cy*grid->sy,grid->inv_sy,
                       subdiv_y*SGSF,sort_work);
#endif

            for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
                ash_y = ash_z + sub_y*subdiv_y;
                na_y = min(subdiv_y,na-(ash_y-ash));

#if GPU_NSUBCELL_X > 1
                /* Sort the atoms along x */
                sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
                           nbs->a+ash_y,na_y,x,
                           grid->c0[XX]+cx*grid->sx,grid->inv_sx,
                           subdiv_x*SGSF,sort_work);
#endif

                for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
                    ash_x = ash_y + sub_x*subdiv_x;
                    na_x = min(subdiv_x,na-(ash_x-ash));

                    fill_cell(nbs,grid,nbat,
                              ash_x,ash_x+na_x,atinfo,x,
                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),

        /* Set the unused atom indices to -1 */
        for(ind=na; ind<ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                rvec *x,const int *move,
                                int thread,int nthread,
    /* We add one extra cell for particles which moved during DD */
    for(i=0; i<grid->ncx*grid->ncy+1; i++)

    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
    for(i=n0; i<n1; i++)
        if (move == NULL || move[i] >= 0)
            /* We need to be careful with rounding,
             * particles might be a few bits outside the local box.
             * The int cast takes care of the lower bound,
             * we need to explicitly take care of the upper bound.
             */
            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            if (cx == grid->ncx)
            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
            if (cy == grid->ncy)
            /* For the moment cell contains only the grid-local
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;

#ifdef DEBUG_NBNXN_GRIDDING
            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
                    "grid cell cx %d cy %d out of range (max %d %d)\n"
                    "atom %f %f %f, grid->c0 %f %f",
                    cx,cy,grid->ncx,grid->ncy,
                    x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
#endif
            /* Put this moved particle after the end of the grid,
             * so we can process it later without using conditionals.
             */
            cell[i] = grid->ncx*grid->ncy;
/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              nbnxn_atomdata_t *nbat)
    int cx,cy,cxy,ncz_max,ncz;
    int *cxy_na,cxy_na_i;

    nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for(thread=0; thread<nthread; thread++)
        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
                            nbs->cell,nbs->work[thread].cxy_na);

    /* Make the cell index as a function of x and y */
    grid->cxy_ind[0] = 0;
    for(i=0; i<grid->ncx*grid->ncy+1; i++)
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
         * that do not need to be ordered on the grid.
         */
        cxy_na_i = nbs->work[0].cxy_na[i];
        for(thread=1; thread<nthread; thread++)
            cxy_na_i += nbs->work[thread].cxy_na[i];
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

        fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc,grid->na_c,grid->nc,
                grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
        for(cy=0; cy<grid->ncy; cy++)
            for(cx=0; cx<grid->ncx; cx++)
                fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
            fprintf(debug,"\n");

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
        for(thread=0; thread<nbs->nthread_max; thread++)
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for(i=a0; i<a1; i++)
        /* At this point nbs->cell contains the local grid x,y indices */
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;

    /* Set the cell indices for the moved particles */
    n0 = grid->nc*grid->na_sc;
    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
    for(i=n0; i<n1; i++)
        nbs->cell[nbs->a[i]] = i;

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
    for(thread=0; thread<nbs->nthread_max; thread++)
            sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                nbs->work[thread].sort_work);
            sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                  nbs->work[thread].sort_work);

#ifdef NBNXN_SEARCH_SSE
    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid,grid->bb);
#endif

        grid->nsubc_tot = 0;
        for(i=0; i<grid->nc; i++)
            grid->nsubc_tot += grid->nsubc[i];

            print_bbsizes_simple(debug,nbs,grid);

            fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug,nbs,grid);
static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag,flags->flag_nalloc);
    for(b=0; b<flags->nflag; b++)
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t nbs,
                       int ePBC,matrix box,
                       rvec corner0,rvec corner1,
                       int nmoved,int *move,
                       nbnxn_atomdata_t *nbat)
    int nc_max_grid,nc_max;

    grid = &nbs->grid[dd_zone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c = nbnxn_kernel_to_ci_size(nb_kernel_type);
    grid->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
    grid->na_sc = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
            nbs->grid[dd_zone-1].na_sc/grid->na_sc;

        copy_mat(box,nbs->box);

        if (atom_density >= 0)
            grid->atom_density = atom_density;
            grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);

        nbs->natoms_local = a1 - nmoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (dd_zone=0).
         */
        nbs->natoms_nonlocal = a1 - nmoved;
        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);

    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
                                   nbs->grid[0].atom_density,

    nc_max = grid->cell0 + nc_max_grid;

    if (a1 > nbs->cell_nalloc)
        nbs->cell_nalloc = over_alloc_large(a1);
        srenew(nbs->cell,nbs->cell_nalloc);

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
        srenew(nbs->a,nbs->a_nalloc);

    /* We need padding up to a multiple of the buffer flag size: simply add */
    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
        nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);

    calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);

        nbat->natoms_local = nbat->natoms;

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
                                const gmx_domdec_zones_t *zones,
                                nbnxn_atomdata_t *nbat)
    for(zone=1; zone<zones->n; zone++)
        for(d=0; d<DIM; d++)
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];

        nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
/* Add simple grid type information to the local super/sub grid */
void nbnxn_grid_add_simple(nbnxn_search_t nbs,
                           nbnxn_atomdata_t *nbat)
    grid = &nbs->grid[0];

        gmx_incons("nbnxn_grid_simple called with a simple grid");

    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;

    if (grid->nc*ncd > grid->nc_nalloc_simple)
        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
        srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
        srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
        srenew(grid->flags_simple,grid->nc_nalloc_simple);
            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);

    bbcz = grid->bbcz_simple;
    bb = grid->bb_simple;

#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
    for(sc=0; sc<grid->nc; sc++)
        for(c=0; c<ncd; c++)
                na = NBNXN_CPU_CLUSTER_I_SIZE;
                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)

            switch (nbat->XFormat)
                /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
                    calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
                /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
                    calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
                    calc_bounding_box(na,nbat->xstride,
                                      nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,

                bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
                bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];

                /* No interaction optimization yet here */
                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
                grid->flags_simple[tx] = 0;

#ifdef NBNXN_SEARCH_SSE
    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid,grid->bb_simple);
#endif
void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;

void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
    const nbnxn_grid_t *grid;

    grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;

void nbnxn_set_atomorder(nbnxn_search_t nbs)
    int ao,cx,cy,cxy,cz,j;

    /* Set the atom order for the home cell (index 0) */
    grid = &nbs->grid[0];

    for(cx=0; cx<grid->ncx; cx++)
        for(cy=0; cy<grid->ncy; cy++)
            cxy = cx*grid->ncy + cy;
            j = grid->cxy_ind[cxy]*grid->na_sc;
            for(cz=0; cz<grid->cxy_na[cxy]; cz++)
/* Determines the cell range along one dimension that
 * the bounding box b0 - b1 sees.
 */
static void get_cell_range(real b0,real b1,
                           int nc,real c0,real s,real invs,
                           real d2,real r2,int *cf,int *cl)
    *cf = max((int)((b0 - c0)*invs),0);

    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)

    *cl = min((int)((b1 - c0)*invs),nc-1);
    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
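
/* Illustrative example: with cells of size s starting at c0, the first
 * candidate cell is found by flooring (b0-c0)/s.  The range is then extended
 * downward while d2, the squared distance contribution from the other
 * dimensions, plus the squared gap between b0 and the next lower cell edge is
 * still below the cut-off r2; *cl is extended upward symmetrically.
 */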
/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0,float bx1,float by0,
                       float by1,float bz0,float bz1,
    dl = bx0 - bb[BBU_X];
    dh = bb[BBL_X] - bx1;

    dl = by0 - bb[BBU_Y];
    dh = bb[BBL_Y] - by1;

    dl = bz0 - bb[BBU_Z];
    dh = bb[BBL_Z] - bz1;

/* Plain C code calculating the distance^2 between two bounding boxes */
static float subc_bb_dist2(int si,const float *bb_i_ci,
                           int csj,const float *bb_j_all)
    const float *bb_i,*bb_j;

    bb_i = bb_i_ci + si*NNBSBB_B;
    bb_j = bb_j_all + csj*NNBSBB_B;

    dl = bb_i[BBL_X] - bb_j[BBU_X];
    dh = bb_j[BBL_X] - bb_i[BBU_X];

    dl = bb_i[BBL_Y] - bb_j[BBU_Y];
    dh = bb_j[BBL_Y] - bb_i[BBU_Y];

    dl = bb_i[BBL_Z] - bb_j[BBU_Z];
    dh = bb_j[BBL_Z] - bb_i[BBU_Z];
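
/* In both routines above, each dimension contributes
 * max(0, lo_i - hi_j, lo_j - hi_i)^2 to the total: the term is positive only
 * when the boxes are disjoint along that axis, so touching or overlapping
 * boxes correctly give a distance^2 of zero.
 */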
#ifdef NBNXN_SEARCH_SSE

/* SSE code for bb distance for bb format xyz0 */
static float subc_bb_dist2_sse(int na_c,
                               int si,const float *bb_i_ci,
                               int csj,const float *bb_j_all)
    const float *bb_i,*bb_j;

    __m128 bb_i_SSE0,bb_i_SSE1;
    __m128 bb_j_SSE0,bb_j_SSE1;

#ifndef GMX_X86_SSE4_1
    float d2_array[7],*d2_align;

    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
#endif

    bb_i = bb_i_ci + si*NNBSBB_B;
    bb_j = bb_j_all + csj*NNBSBB_B;

    bb_i_SSE0 = _mm_load_ps(bb_i);
    bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
    bb_j_SSE0 = _mm_load_ps(bb_j);
    bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);

    dl_SSE = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
    dh_SSE = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);

    dm_SSE = _mm_max_ps(dl_SSE,dh_SSE);
    dm0_SSE = _mm_max_ps(dm_SSE,_mm_setzero_ps());
#ifndef GMX_X86_SSE4_1
    d2_SSE = _mm_mul_ps(dm0_SSE,dm0_SSE);

    _mm_store_ps(d2_align,d2_SSE);

    return d2_align[0] + d2_align[1] + d2_align[2];
#else
    /* SSE4.1 dot product of components 0,1,2 */
    d2_SSE = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);

    _mm_store_ss(&d2,d2_SSE);
#endif
/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
#define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
    __m128 dx_0,dy_0,dz_0; \
    __m128 dx_1,dy_1,dz_1; \
    __m128 m0x,m0y,m0z; \
    __m128 d2x,d2y,d2z; \
    shi = si*NNBSBB_D*DIM; \
    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB); \
    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB); \
    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB); \
    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB); \
    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB); \
    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB); \
    dx_0 = _mm_sub_ps(xi_l,xj_h); \
    dy_0 = _mm_sub_ps(yi_l,yj_h); \
    dz_0 = _mm_sub_ps(zi_l,zj_h); \
    dx_1 = _mm_sub_ps(xj_l,xi_h); \
    dy_1 = _mm_sub_ps(yj_l,yi_h); \
    dz_1 = _mm_sub_ps(zj_l,zi_h); \
    mx = _mm_max_ps(dx_0,dx_1); \
    my = _mm_max_ps(dy_0,dy_1); \
    mz = _mm_max_ps(dz_0,dz_1); \
    m0x = _mm_max_ps(mx,zero); \
    m0y = _mm_max_ps(my,zero); \
    m0z = _mm_max_ps(mz,zero); \
    d2x = _mm_mul_ps(m0x,m0x); \
    d2y = _mm_mul_ps(m0y,m0y); \
    d2z = _mm_mul_ps(m0z,m0z); \
    d2s = _mm_add_ps(d2x,d2y); \
    d2t = _mm_add_ps(d2s,d2z); \
    _mm_store_ps(d2+si,d2t); \
/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
static void subc_bb_dist2_sse_xxxx(const float *bb_j,
                                   int nsi,const float *bb_i,
    __m128 xj_l,yj_l,zj_l;
    __m128 xj_h,yj_h,zj_h;
    __m128 xi_l,yi_l,zi_l;
    __m128 xi_h,yi_h,zi_h;

    zero = _mm_setzero_ps();

    xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
    yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
    zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
    xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
    yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
    zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);

    /* Here we "loop" over si from 0 to nsi with step STRIDE_8BB.
     * As we know the number of iterations is 1 or 2, we unroll manually.
     */
    SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
    if (STRIDE_8BB < nsi)
        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);

#endif /* NBNXN_SEARCH_SSE */
/* Plain C function which determines if any atom pair between two cells
 * is within distance sqrt(rl2).
 */
static gmx_bool subc_in_range_x(int na_c,
                                int si,const real *x_i,
                                int csj,int stride,const real *x_j,
    for(i=0; i<na_c; i++)
        i0 = (si*na_c + i)*DIM;
        for(j=0; j<na_c; j++)
            j0 = (csj*na_c + j)*stride;

            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
                 sqr(x_i[i0+1] - x_j[j0+1]) +
                 sqr(x_i[i0+2] - x_j[j0+2]);
/* SSE function which determines if any atom pair between two cells,
 * both with 8 atoms, is within distance sqrt(rl2).
 */
static gmx_bool subc_in_range_sse8(int na_c,
                                   int si,const real *x_i,
                                   int csj,int stride,const real *x_j,
#ifdef NBNXN_SEARCH_SSE_SINGLE
    __m128 ix_SSE0,iy_SSE0,iz_SSE0;
    __m128 ix_SSE1,iy_SSE1,iz_SSE1;

    rc2_SSE = _mm_set1_ps(rl2);

    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);

    /* We loop from the outer to the inner particles to maximize
     * the chance that we find a pair in range quickly and return.
     */
        __m128 jx0_SSE,jy0_SSE,jz0_SSE;
        __m128 jx1_SSE,jy1_SSE,jz1_SSE;

        __m128 dx_SSE0,dy_SSE0,dz_SSE0;
        __m128 dx_SSE1,dy_SSE1,dz_SSE1;
        __m128 dx_SSE2,dy_SSE2,dz_SSE2;
        __m128 dx_SSE3,dy_SSE3,dz_SSE3;

        __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;

        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);

        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);

        /* Calculate distance */
        dx_SSE0 = _mm_sub_ps(ix_SSE0,jx0_SSE);
        dy_SSE0 = _mm_sub_ps(iy_SSE0,jy0_SSE);
        dz_SSE0 = _mm_sub_ps(iz_SSE0,jz0_SSE);
        dx_SSE1 = _mm_sub_ps(ix_SSE1,jx0_SSE);
        dy_SSE1 = _mm_sub_ps(iy_SSE1,jy0_SSE);
        dz_SSE1 = _mm_sub_ps(iz_SSE1,jz0_SSE);
        dx_SSE2 = _mm_sub_ps(ix_SSE0,jx1_SSE);
        dy_SSE2 = _mm_sub_ps(iy_SSE0,jy1_SSE);
        dz_SSE2 = _mm_sub_ps(iz_SSE0,jz1_SSE);
        dx_SSE3 = _mm_sub_ps(ix_SSE1,jx1_SSE);
        dy_SSE3 = _mm_sub_ps(iy_SSE1,jy1_SSE);
        dz_SSE3 = _mm_sub_ps(iz_SSE1,jz1_SSE);

        /* rsq = dx*dx+dy*dy+dz*dz */
        rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
        rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
        rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
        rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

        wco_SSE0 = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
        wco_SSE1 = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
        wco_SSE2 = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
        wco_SSE3 = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);

        wco_any_SSE01 = _mm_or_ps(wco_SSE0,wco_SSE1);
        wco_any_SSE23 = _mm_or_ps(wco_SSE2,wco_SSE3);
        wco_any_SSE = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);

        if (_mm_movemask_ps(wco_any_SSE))
#else
    gmx_incons("SSE function called without SSE support");
#endif
/* Returns the j sub-cell for index cj_ind */
static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
    return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];

/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
    return nbl->cj4[cj_ind>>2].imei[0].imask;
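
/* Packing sketch (illustrative): four j-subcells are stored per nbnxn_cj4_t,
 * so cj_ind>>2 selects the cj4 group and cj_ind & 3 the slot within it.
 * The imask holds one bit per (j-slot, i-subcell) pair, bit
 * j*GPU_NSUBCELL + si, as used elsewhere in this file.
 */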
/* Ensures there is enough space for extra exclusion masks */
static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
    if (nbl->nexcl+extra > nbl->excl_nalloc)
        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
        nbnxn_realloc_void((void **)&nbl->excl,
                           nbl->nexcl*sizeof(*nbl->excl),
                           nbl->excl_nalloc*sizeof(*nbl->excl),
                           nbl->alloc,nbl->free);
/* Ensures there is enough space for ncell extra j-cells in the list */
static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
    cj_max = nbl->ncj + ncell;

    if (cj_max > nbl->cj_nalloc)
        nbl->cj_nalloc = over_alloc_small(cj_max);
        nbnxn_realloc_void((void **)&nbl->cj,
                           nbl->ncj*sizeof(*nbl->cj),
                           nbl->cj_nalloc*sizeof(*nbl->cj),
                           nbl->alloc,nbl->free);
/* Ensures there is enough space for ncell extra j-subcells in the list */
static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
    int ncj4_max,j4,j,w,t;

#define WARP_SIZE 32

    /* We can have at most nsupercell*GPU_NSUBCELL sj lists */
    /* We can store 4 j-subcell - i-supercell pairs in one struct.
     * Since we round down, we need one extra entry.
     */
    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);

    if (ncj4_max > nbl->cj4_nalloc)
        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
        nbnxn_realloc_void((void **)&nbl->cj4,
                           nbl->work->cj4_init*sizeof(*nbl->cj4),
                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
                           nbl->alloc,nbl->free);

    if (ncj4_max > nbl->work->cj4_init)
        for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
            /* No i-subcells and no excl's in the list initially */
            for(w=0; w<NWARP; w++)
                nbl->cj4[j4].imei[w].imask = 0U;
                nbl->cj4[j4].imei[w].excl_ind = 0;

        nbl->work->cj4_init = ncj4_max;
/* Sets all excl masks for one GPU warp to no exclusions */
static void set_no_excls(nbnxn_excl_t *excl)
    for(t=0; t<WARP_SIZE; t++)
        /* Turn all interaction bits on */
        excl->pair[t] = NBNXN_INT_MASK_ALL;
/* Initializes a single nbnxn_pairlist_t data structure */
static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
                                nbnxn_alloc_t *alloc,
        nbl->alloc = nbnxn_alloc_aligned;
        nbl->free = nbnxn_free_aligned;

    nbl->bSimple = bSimple;

    /* We need one element extra in sj, so allocate initially with 1 */
    nbl->cj4_nalloc = 0;

    nbl->excl_nalloc = 0;
    check_excl_space(nbl,1);
    set_no_excls(&nbl->excl[0]);

        snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
        snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
#ifdef GMX_NBNXN_SIMD
    snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
    snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
#endif
    snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
                             gmx_bool bSimple, gmx_bool bCombined,
                             nbnxn_alloc_t *alloc,
    nbl_list->bSimple = bSimple;
    nbl_list->bCombined = bCombined;

    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);

    if (!nbl_list->bCombined &&
        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
        gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or fewer OpenMP threads.",
                  nbl_list->nnbl,NBNXN_BUFFERFLAG_MAX_THREADS,NBNXN_BUFFERFLAG_MAX_THREADS);

    snew(nbl_list->nbl,nbl_list->nnbl);
    /* Execute in order to avoid memory interleaving between threads */
#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
    for(i=0; i<nbl_list->nnbl; i++)
        /* Allocate the nblist data structure locally on each thread
         * to optimize memory access for NUMA architectures.
         */
        snew(nbl_list->nbl[i],1);

        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
                                           const nbnxn_search_t nbs,real rl)
    const nbnxn_grid_t *grid;

    /* This code only produces correct statistics with domain decomposition */
    grid = &nbs->grid[0];

    fprintf(fp,"nbl nci %d ncj %d\n",
    fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
            nbl->ncj/(double)grid->nc*grid->na_sc,
            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));

    fprintf(fp,"nbl average j cell list length %.1f\n",
            0.25*nbl->ncj/(double)nbl->nci);

    for(s=0; s<SHIFTS; s++)
    for(i=0; i<nbl->nci; i++)
        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;

        j = nbl->ci[i].cj_ind_start;
        while (j < nbl->ci[i].cj_ind_end &&
               nbl->cj[j].excl != NBNXN_INT_MASK_ALL)

    fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
            nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
    for(s=0; s<SHIFTS; s++)
            fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
                                             const nbnxn_search_t nbs,real rl)
    const nbnxn_grid_t *grid;

    int c[GPU_NSUBCELL+1];

    /* This code only produces correct statistics with domain decomposition */
    grid = &nbs->grid[0];

    fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
            nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
    fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));

    fprintf(fp,"nbl average j super cell list length %.1f\n",
            0.25*nbl->ncj4/(double)nbl->nsci);
    fprintf(fp,"nbl average i sub cell list length %.1f\n",
            nbl->nci_tot/(0.25*nbl->ncj4));

    for(si=0; si<=GPU_NSUBCELL; si++)
    for(i=0; i<nbl->nsci; i++)
        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
            for(si=0; si<GPU_NSUBCELL; si++)
                if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
    for(b=0; b<=GPU_NSUBCELL; b++)
        fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
                b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
/* Print the full pair list, used for debug output */
static void print_supersub_nsp(const char *fn,
                               const nbnxn_pairlist_t *nbl,
    sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
    fp = ffopen(buf,"w");

    for(i=0; i<nbl->nci; i++)
        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
            for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
                nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
        fprintf(fp,"%4d %3d %3d\n",
                nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
                                   int warp,nbnxn_excl_t **excl)
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
        /* No exclusions set, make a new list entry */
        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
        set_no_excls(*excl);
        /* We already have some exclusions, new ones can be added to the list */
        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];

/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 * allocates extra memory, if necessary.
 */
static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
                                 int warp,nbnxn_excl_t **excl)
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
        /* We need to make a new list entry, check if we have space */
        check_excl_space(nbl,1);
    low_get_nbl_exclusions(nbl,cj4,warp,excl);

/* Returns pointers to the exclusion masks for cj4-unit cj4 for both warps,
 * allocates extra memory, if necessary.
 */
static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
                                 nbnxn_excl_t **excl_w0,
                                 nbnxn_excl_t **excl_w1)
    /* Check for space we might need */
    check_excl_space(nbl,2);

    low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
    low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
/* Sets the self exclusions i=j and pair exclusions i>j */
static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
                                               int cj4_ind,int sj_offset,
    nbnxn_excl_t *excl[2];

    /* Here we only set the self and double pair exclusions */

    get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);

    /* Only minor < major bits set */
    for(ej=0; ej<nbl->na_ci; ej++)
        for(ei=ej; ei<nbl->na_ci; ei++)
            excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
                ~(1U << (sj_offset*GPU_NSUBCELL+si));
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);

/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
#ifndef GMX_DOUBLE /* cj-size = 4 */
    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
#else /* cj-size = 2 */
    return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
            (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
             NBNXN_INT_MASK_ALL));
#endif

/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
#ifndef GMX_DOUBLE /* cj-size = 8 */
    return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
            (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
             NBNXN_INT_MASK_ALL));
#else /* cj-size = 4 */
    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
#endif
#ifdef GMX_NBNXN_SIMD
#if GMX_NBNXN_SIMD_BITWIDTH == 128
#define get_imask_x86_simd_4xn get_imask_x86_simd128
#else
#if GMX_NBNXN_SIMD_BITWIDTH == 256
#define get_imask_x86_simd_4xn get_imask_x86_simd256
#define get_imask_x86_simd_2xnn get_imask_x86_simd128
#else
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
#endif
#endif
2680 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2681 * Checks bounding box distances and possibly atom pair distances.
2683 static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2684 nbnxn_pairlist_t *nbl,
2685 int ci,int cjf,int cjl,
2686 gmx_bool remove_sub_diag,
2688 real rl2,float rbb2,
2691 const nbnxn_list_work_t *work;
2698 int cjf_gl,cjl_gl,cj;
2702 bb_ci = nbl->work->bb_ci;
2703 x_ci = nbl->work->x_ci;
2706 while (!InRange && cjf <= cjl)
2708 d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
2711             /* The pair is within range if the bounding-box distance is
2712              * below rbb, in which case we rely on the boxes alone,
2713              * or if it is within the cut-off and at least one atom pair
2714              * is within the cut-off.
2724 cjf_gl = gridj->cell0 + cjf;
2725 for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2727 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2729 InRange = InRange ||
2730 (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2731 sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2732 sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2735 *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2748 while (!InRange && cjl > cjf)
2750 d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
2753             /* The pair is within range if the bounding-box distance is
2754              * below rbb, in which case we rely on the boxes alone,
2755              * or if it is within the cut-off and at least one atom pair
2756              * is within the cut-off.
2766 cjl_gl = gridj->cell0 + cjl;
2767 for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2769 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2771 InRange = InRange ||
2772 (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2773 sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2774 sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2777 *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2787 for(cj=cjf; cj<=cjl; cj++)
2789 /* Store cj and the interaction mask */
2790 nbl->cj[nbl->ncj].cj = gridj->cell0 + cj;
2791 nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
2794     /* Increase the closing index in the i-cell list */
2795 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
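/* A guarded-out sketch of the trimming above: cjf is advanced and cjl
 * retreated until each end is in range, then the whole contiguous stretch
 * between them is stored. in_range() stands in for the bounding-box plus
 * atom-pair distance tests.
 */
#if 0
#include <stdio.h>

static int in_range(int cj) { return (cj >= 5 && cj <= 11); }

int main(void)
{
    int cjf = 3, cjl = 14;

    while (cjf <= cjl && !in_range(cjf)) { cjf++; }
    while (cjl >  cjf && !in_range(cjl)) { cjl--; }
    if (cjf <= cjl)
    {
        printf("store j-clusters %d..%d\n", cjf, cjl);   /* 5..11 */
    }
    return 0;
}
#endif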
2799 #ifdef GMX_NBNXN_SIMD_4XN
2800 #include "nbnxn_search_simd_4xn.h"
2802 #ifdef GMX_NBNXN_SIMD_2XNN
2803 #include "nbnxn_search_simd_2xnn.h"
2806 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
2807 * Checks bounding box distances and possibly atom pair distances.
2809 static void make_cluster_list_supersub(const nbnxn_search_t nbs,
2810 const nbnxn_grid_t *gridi,
2811 const nbnxn_grid_t *gridj,
2812 nbnxn_pairlist_t *nbl,
2814 gmx_bool sci_equals_scj,
2815 int stride,const real *x,
2816 real rl2,float rbb2,
2821 int cjo,ci1,ci,cj,cj_gl;
2822 int cj4_ind,cj_offset;
2829 #define PRUNE_LIST_CPU_ONE
2830 #ifdef PRUNE_LIST_CPU_ONE
2834 d2l = nbl->work->d2;
2836 bb_ci = nbl->work->bb_ci;
2837 x_ci = nbl->work->x_ci;
2841 for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
2843 cj4_ind = (nbl->work->cj_ind >> 2);
2844 cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
2845 cj4 = &nbl->cj4[cj4_ind];
2847 cj = scj*GPU_NSUBCELL + cjo;
2849 cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
2851         /* Initialize this j-subcell's i-subcell list */
2852 cj4->cj[cj_offset] = cj_gl;
2861 ci1 = gridi->nsubc[sci];
2865 /* Determine all ci1 bb distances in one call with SSE */
2866 subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
2872 /* We use a fixed upper-bound instead of ci1 to help optimization */
2873 for(ci=0; ci<GPU_NSUBCELL; ci++)
2880 #ifndef NBNXN_BBXXXX
2881 /* Determine the bb distance between ci and cj */
2882 d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
2887 #ifdef PRUNE_LIST_CPU_ALL
2888             /* The pair is within range if the bounding-box distance is
2889              * below rbb, in which case we rely on the boxes alone,
2890              * or if it is within the cut-off and at least one atom pair
2891              * is within the cut-off. This check is very costly.
2893 *ndistc += na_c*na_c;
2895 (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
2897 /* Check if the distance between the two bounding boxes
2898              * is within the pair-list cut-off.
2903 /* Flag this i-subcell to be taken into account */
2904 imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
2906 #ifdef PRUNE_LIST_CPU_ONE
2914 #ifdef PRUNE_LIST_CPU_ONE
2915 /* If we only found 1 pair, check if any atoms are actually
2916          * within the cut-off, so that we can get rid of the pair if not.
2918 if (npair == 1 && d2l[ci_last] >= rbb2)
2920 /* Avoid using function pointers here, as it's slower */
2922 #ifdef NBNXN_8BB_SSE
2927 (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
2929 imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
2937 /* We have a useful sj entry, close it now */
2939         /* Set the exclusions for the ci == sj entry.
2940 * Here we don't bother to check if this entry is actually flagged,
2941 * as it will nearly always be in the list.
2945 set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
2948 /* Copy the cluster interaction mask to the list */
2949 for(w=0; w<NWARP; w++)
2951 cj4->imei[w].imask |= imask;
2954 nbl->work->cj_ind++;
2956 /* Keep the count */
2957 nbl->nci_tot += npair;
2959     /* Increase the closing index in the i super-cell list */
2960 nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
2965 /* Set all atom-pair exclusions from the topology stored in excl
2966 * as masks in the pair-list for simple list i-entry nbl_ci
2968 static void set_ci_top_excls(const nbnxn_search_t nbs,
2969 nbnxn_pairlist_t *nbl,
2970 gmx_bool diagRemoved,
2973 const nbnxn_ci_t *nbl_ci,
2974 const t_blocka *excl)
2978 int cj_ind_first,cj_ind_last;
2979 int cj_first,cj_last;
2981 int i,ai,aj,si,eind,ge,se;
2982 int found,cj_ind_0,cj_ind_1,cj_ind_m;
2986 nbnxn_excl_t *nbl_excl;
2987 int inner_i,inner_e;
2991 if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
2999 cj_ind_first = nbl_ci->cj_ind_start;
3000 cj_ind_last = nbl->ncj - 1;
3002 cj_first = nbl->cj[cj_ind_first].cj;
3003 cj_last = nbl->cj[cj_ind_last].cj;
3005 /* Determine how many contiguous j-cells we have starting
3006 * from the first i-cell. This number can be used to directly
3007 * calculate j-cell indices for excluded atoms.
3010 if (na_ci_2log == na_cj_2log)
3012 while (cj_ind_first + ndirect <= cj_ind_last &&
3013 nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
3018 #ifdef NBNXN_SEARCH_SSE
3021 while (cj_ind_first + ndirect <= cj_ind_last &&
3022 nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
3029 /* Loop over the atoms in the i super-cell */
3030 for(i=0; i<nbl->na_sc; i++)
3032 ai = nbs->a[ci*nbl->na_sc+i];
3035 si = (i>>na_ci_2log);
3037 /* Loop over the topology-based exclusions for this i-atom */
3038 for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3044             /* The self exclusions are already set, so save some time */
3050 /* Without shifts we only calculate interactions j>i
3051 * for one-way pair-lists.
3053 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3058 se = (ge >> na_cj_2log);
3060 /* Could the cluster se be in our list? */
3061 if (se >= cj_first && se <= cj_last)
3063 if (se < cj_first + ndirect)
3065 /* We can calculate cj_ind directly from se */
3066 found = cj_ind_first + se - cj_first;
3070 /* Search for se using bisection */
3072 cj_ind_0 = cj_ind_first + ndirect;
3073 cj_ind_1 = cj_ind_last + 1;
3074 while (found == -1 && cj_ind_0 < cj_ind_1)
3076 cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3078 cj_m = nbl->cj[cj_ind_m].cj;
3086 cj_ind_1 = cj_ind_m;
3090 cj_ind_0 = cj_ind_m + 1;
3097 inner_i = i - (si << na_ci_2log);
3098 inner_e = ge - (se << na_cj_2log);
3100 nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
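/* A guarded-out sketch of the lookup above: list positions of the first
 * ndirect contiguous j-clusters follow by arithmetic, the rest are found
 * by bisection in the ascending remainder. Plain ints stand in for the
 * cj entries.
 */
#if 0
#include <stdio.h>

int main(void)
{
    int cj[]    = { 10, 11, 12, 13, 20, 27, 31 };  /* sorted j-clusters */
    int ncj     = 7;
    int ndirect = 4;       /* cj[0..3] are contiguous starting at 10 */
    int se      = 27;      /* j-cluster holding the excluded atom    */
    int found   = -1;

    if (se >= cj[0] && se < cj[0] + ndirect)
    {
        found = se - cj[0];              /* direct: no search needed */
    }
    else
    {
        int lo = ndirect, hi = ncj;

        while (found == -1 && lo < hi)
        {
            int m = (lo + hi) >> 1;

            if      (se == cj[m]) { found = m;  }
            else if (se <  cj[m]) { hi = m;     }
            else                  { lo = m + 1; }
        }
    }
    printf("found at list index %d\n", found);   /* prints 5 */
    return 0;
}
#endif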
3108 /* Set all atom-pair exclusions from the topology stored in excl
3109 * as masks in the pair-list for i-super-cell entry nbl_sci
3111 static void set_sci_top_excls(const nbnxn_search_t nbs,
3112 nbnxn_pairlist_t *nbl,
3113 gmx_bool diagRemoved,
3115 const nbnxn_sci_t *nbl_sci,
3116 const t_blocka *excl)
3121 int cj_ind_first,cj_ind_last;
3122 int cj_first,cj_last;
3124 int i,ai,aj,si,eind,ge,se;
3125 int found,cj_ind_0,cj_ind_1,cj_ind_m;
3129 nbnxn_excl_t *nbl_excl;
3130 int inner_i,inner_e,w;
3136 if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3144 cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3145 cj_ind_last = nbl->work->cj_ind - 1;
3147 cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3148 cj_last = nbl_cj(nbl,cj_ind_last);
3150 /* Determine how many contiguous j-clusters we have starting
3151 * from the first i-cluster. This number can be used to directly
3152 * calculate j-cluster indices for excluded atoms.
3155 while (cj_ind_first + ndirect <= cj_ind_last &&
3156 nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3161 /* Loop over the atoms in the i super-cell */
3162 for(i=0; i<nbl->na_sc; i++)
3164 ai = nbs->a[sci*nbl->na_sc+i];
3167 si = (i>>na_c_2log);
3169 /* Loop over the topology-based exclusions for this i-atom */
3170 for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3176             /* The self exclusions are already set, so save some time */
3182 /* Without shifts we only calculate interactions j>i
3183 * for one-way pair-lists.
3185 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3191 /* Could the cluster se be in our list? */
3192 if (se >= cj_first && se <= cj_last)
3194 if (se < cj_first + ndirect)
3196 /* We can calculate cj_ind directly from se */
3197 found = cj_ind_first + se - cj_first;
3201 /* Search for se using bisection */
3203 cj_ind_0 = cj_ind_first + ndirect;
3204 cj_ind_1 = cj_ind_last + 1;
3205 while (found == -1 && cj_ind_0 < cj_ind_1)
3207 cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3209 cj_m = nbl_cj(nbl,cj_ind_m);
3217 cj_ind_1 = cj_ind_m;
3221 cj_ind_0 = cj_ind_m + 1;
3228 inner_i = i - si*na_c;
3229 inner_e = ge - se*na_c;
3231 /* Macro for getting the index of atom a within a cluster */
3232 #define AMODI(a) ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3233 /* Macro for converting an atom number to a cluster number */
3234 #define A2CI(a) ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3236 if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
3240 get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
3242 nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
3243 ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
3255 /* Reallocate the simple ci list for at least n entries */
3256 static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
3258 nbl->ci_nalloc = over_alloc_small(n);
3259 nbnxn_realloc_void((void **)&nbl->ci,
3260 nbl->nci*sizeof(*nbl->ci),
3261 nbl->ci_nalloc*sizeof(*nbl->ci),
3262 nbl->alloc,nbl->free);
3265 /* Reallocate the super-cell sci list for at least n entries */
3266 static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
3268 nbl->sci_nalloc = over_alloc_small(n);
3269 nbnxn_realloc_void((void **)&nbl->sci,
3270 nbl->nsci*sizeof(*nbl->sci),
3271 nbl->sci_nalloc*sizeof(*nbl->sci),
3272 nbl->alloc,nbl->free);
3275 /* Make a new ci entry at index nbl->nci */
3276 static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
3277 nbnxn_list_work_t *work)
3279 if (nbl->nci + 1 > nbl->ci_nalloc)
3281 nb_realloc_ci(nbl,nbl->nci+1);
3283 nbl->ci[nbl->nci].ci = ci;
3284 nbl->ci[nbl->nci].shift = shift;
3285 /* Store the interaction flags along with the shift */
3286 nbl->ci[nbl->nci].shift |= flags;
3287 nbl->ci[nbl->nci].cj_ind_start = nbl->ncj;
3288 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
3291 /* Make a new sci entry at index nbl->nsci */
3292 static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
3293 nbnxn_list_work_t *work)
3295 if (nbl->nsci + 1 > nbl->sci_nalloc)
3297 nb_realloc_sci(nbl,nbl->nsci+1);
3299 nbl->sci[nbl->nsci].sci = sci;
3300 nbl->sci[nbl->nsci].shift = shift;
3301 nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3302 nbl->sci[nbl->nsci].cj4_ind_end = nbl->ncj4;
3305 /* Sort the simple j-list cj on exclusions.
3306 * Entries with exclusions will all be sorted to the beginning of the list.
3308 static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
3309 nbnxn_list_work_t *work)
3313 if (ncj > work->cj_nalloc)
3315 work->cj_nalloc = over_alloc_large(ncj);
3316 srenew(work->cj,work->cj_nalloc);
3319 /* Make a list of the j-cells involving exclusions */
3321 for(j=0; j<ncj; j++)
3323 if (cj[j].excl != NBNXN_INT_MASK_ALL)
3325 work->cj[jnew++] = cj[j];
3328     /* Only resort when there are exclusions and they are not already confined to the first entry */
3329 if (!((jnew == 0) ||
3330 (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
3332 for(j=0; j<ncj; j++)
3334 if (cj[j].excl == NBNXN_INT_MASK_ALL)
3336 work->cj[jnew++] = cj[j];
3339 for(j=0; j<ncj; j++)
3341 cj[j] = work->cj[j];
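/* A guarded-out sketch of the stable two-pass partition above: entries
 * with exclusions are copied first, plain entries after, preserving the
 * relative order within each group.
 */
#if 0
#include <stdio.h>

#define ALL 0xffffffffU

int main(void)
{
    unsigned int excl[6] = { ALL, 0x7U, ALL, 0x3U, ALL, ALL };
    int          order[6], n = 0, j;

    for (j = 0; j < 6; j++) { if (excl[j] != ALL) { order[n++] = j; } }
    for (j = 0; j < 6; j++) { if (excl[j] == ALL) { order[n++] = j; } }

    for (j = 0; j < 6; j++) { printf("%d ", order[j]); }  /* 1 3 0 2 4 5 */
    printf("\n");
    return 0;
}
#endif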
3346 /* Close this simple list i entry */
3347 static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3351     /* All content of the new ci entry has already been filled correctly;
3352      * we only need to increase the count here (for non-empty lists).
3354 jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3357 sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
3359 if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
3361 nbl->work->ncj_hlj += jlen;
3363 else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3365 nbl->work->ncj_noq += jlen;
3372 /* Split sci entry for load balancing on the GPU.
3373  * As we only know the current count on our own thread,
3374 * we will need to estimate the current total amount of i-entries.
3375 * As the lists get concatenated later, this estimate depends
3376 * both on nthread and our own thread index thread.
3378 static void split_sci_entry(nbnxn_pairlist_t *nbl,
3379 int nsp_max_av,gmx_bool progBal,int nc_bal,
3380 int thread,int nthread)
3384 int cj4_start,cj4_end,j4len,cj4;
3386 int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
3389 /* Estimate the total numbers of ci's of the nblist combined
3390 * over all threads using the target number of ci's.
3392 nsci_est = nc_bal*thread/nthread + nbl->nsci;
3395 /* The first ci blocks should be larger, to avoid overhead.
3396 * The last ci blocks should be smaller, to improve load balancing.
3399 nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3403 nsp_max = nsp_max_av;
3406 cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3407 cj4_end = nbl->sci[nbl->nsci-1].cj4_ind_end;
3408 j4len = cj4_end - cj4_start;
3410 if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3412 /* Remove the last ci entry and process the cj4's again */
3421 while (cj4 < cj4_end)
3423 nsp_cj4_p = nsp_cj4;
3425 for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3427 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3431 if (nsp > nsp_max && nsp > nsp_cj4)
3433 nbl->sci[sci].cj4_ind_end = cj4;
3436 if (nbl->nsci+1 > nbl->sci_nalloc)
3438 nb_realloc_sci(nbl,nbl->nsci+1);
3440 nbl->sci[sci].sci = nbl->sci[nbl->nsci-1].sci;
3441 nbl->sci[sci].shift = nbl->sci[nbl->nsci-1].shift;
3442 nbl->sci[sci].cj4_ind_start = cj4;
3443 nsp_sci = nsp - nsp_cj4;
3444 nsp_cj4_e = nsp_cj4_p;
3451 /* Put the remaining cj4's in a new ci entry */
3452 nbl->sci[sci].cj4_ind_end = cj4_end;
3454 /* Possibly balance out the last two ci's
3455      * by moving the last cj4 of the second-last ci.
3457 if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3459 nbl->sci[sci-1].cj4_ind_end--;
3460 nbl->sci[sci].cj4_ind_start--;
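/* A guarded-out sketch of the splitting idea above (simplified: the real
 * code above also compares against nsp_cj4): a new entry is started
 * whenever adding the next cj4 block would push the pair count past
 * nsp_max.
 */
#if 0
#include <stdio.h>

int main(void)
{
    int nsp_cj4[5] = { 9, 14, 6, 12, 8 };   /* pairs per cj4 block */
    int nsp_max    = 20;
    int nsp        = 0, c;

    for (c = 0; c < 5; c++)
    {
        if (nsp > 0 && nsp + nsp_cj4[c] > nsp_max)
        {
            printf("| ");       /* close this entry, open a new one */
            nsp = 0;
        }
        nsp += nsp_cj4[c];
        printf("%d ", nsp_cj4[c]);
    }
    printf("\n");               /* prints: 9 | 14 6 | 12 8 */
    return 0;
}
#endif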
3468 /* Close this super/sub-list i-entry */
3469 static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
3471 gmx_bool progBal,int nc_bal,
3472 int thread,int nthread)
3477     /* All content of the new ci entry has already been filled correctly;
3478      * we only need to increase the count here (for non-empty lists).
3480 j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
3483 /* We can only have complete blocks of 4 j-entries in a list,
3484 * so round the count up before closing.
3486 nbl->ncj4 = ((nbl->work->cj_ind + 4-1) >> 2);
3487 nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3493 split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
3498 /* Syncs the working array before adding another grid pair to the list */
3499 static void sync_work(nbnxn_pairlist_t *nbl)
3503 nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3504 nbl->work->cj4_init = nbl->ncj4;
3508 /* Clears an nbnxn_pairlist_t data structure */
3509 static void clear_pairlist(nbnxn_pairlist_t *nbl)
3518 nbl->work->ncj_noq = 0;
3519 nbl->work->ncj_hlj = 0;
3522 /* Sets a simple list i-cell bounding box, including PBC shift */
3523 static void set_icell_bb_simple(const float *bb,int ci,
3524 real shx,real shy,real shz,
3530 bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
3531 bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
3532 bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
3533 bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
3534 bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
3535 bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
3538 /* Sets a super-cell and its sub-cell bounding boxes, including the PBC shift */
3539 static void set_icell_bb_supersub(const float *bb,int ci,
3540 real shx,real shy,real shz,
3546 ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
3547 for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
3549 for(i=0; i<STRIDE_8BB; i++)
3551 bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
3552 bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
3553 bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
3554 bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
3555 bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
3556 bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
3560 ia = ci*GPU_NSUBCELL*NNBSBB_B;
3561 for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
3563 bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
3564 bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
3565 bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
3566 bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
3567 bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
3568 bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
3573 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
3574 static void icell_set_x_simple(int ci,
3575 real shx,real shy,real shz,
3577 int stride,const real *x,
3578 nbnxn_list_work_t *work)
3582 ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
3584 for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
3586 work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
3587 work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
3588 work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
3592 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
3593 static void icell_set_x_supersub(int ci,
3594 real shx,real shy,real shz,
3596 int stride,const real *x,
3597 nbnxn_list_work_t *work)
3604 ia = ci*GPU_NSUBCELL*na_c;
3605 for(i=0; i<GPU_NSUBCELL*na_c; i++)
3607 x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
3608 x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
3609 x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
3613 #ifdef NBNXN_SEARCH_SSE
3614 /* Copies PBC shifted super-cell packed atom coordinates to working array */
3615 static void icell_set_x_supersub_sse8(int ci,
3616 real shx,real shy,real shz,
3618 int stride,const real *x,
3619 nbnxn_list_work_t *work)
3626 for(si=0; si<GPU_NSUBCELL; si++)
3628 for(i=0; i<na_c; i+=STRIDE_8BB)
3631 ia = ci*GPU_NSUBCELL*na_c + io;
3632 for(j=0; j<STRIDE_8BB; j++)
3634 x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
3635 x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
3636 x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
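/* A guarded-out sketch of the packed layout written above: atoms are
 * stored in groups of 4 as xxxx yyyy zzzz, so one aligned SIMD load
 * fetches the same component of 4 atoms at once. Sizes are illustrative.
 */
#if 0
#include <stdio.h>

int main(void)
{
    float x[8][3];      /* plain xyz input for 8 atoms */
    float p[24];        /* packed xxxxyyyyzzzz output  */
    int   i, j, d;

    for (i = 0; i < 8; i++)
    {
        for (d = 0; d < 3; d++) { x[i][d] = 10.0f*i + d; }
    }
    for (i = 0; i < 8; i += 4)              /* one 4-atom group at a time */
    {
        for (j = 0; j < 4; j++)
        {
            for (d = 0; d < 3; d++)
            {
                p[i*3 + d*4 + j] = x[i+j][d];
            }
        }
    }
    printf("x of atoms 0-3: %g %g %g %g\n", p[0], p[1], p[2], p[3]);
    return 0;
}
#endif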
3643 static real nbnxn_rlist_inc_nonloc_fac = 0.6;
3645 /* Due to the cluster size, the effective pair-list cut-off is longer than
3646  * that of a simple atom pair-list. This function returns the extra distance.
3648 real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
3650 return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
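/* A guarded-out worked example of the formula above, assuming a
 * water-like atom density of 100 nm^-3 and cluster size 4: the effective
 * list cut-off then grows by roughly 0.21 nm.
 */
#if 0
#include <math.h>
#include <stdio.h>

int main(void)
{
    double fac     = 0.5 + 0.6;      /* 0.5 + non-local factor above */
    double csize   = 4.0;            /* atoms per cluster            */
    double density = 100.0;          /* atoms per nm^3 (assumed)     */
    double rinc;

    rinc = fac*pow((csize - 1.0)/csize, 2)*pow(csize/density, 1.0/3.0);
    printf("rlist increase: %.3f nm\n", rinc);   /* ~0.212 nm */
    return 0;
}
#endif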
3653 /* Estimates the interaction volume^2 for non-local interactions */
3654 static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
3663     /* Here we simply add up the squared non-home interaction volumes of
3664      * the 1, 2 or 3 1D-decomposition zones. As these volumes are not additive,
3665 * this is an overestimate, but it would only be significant in the limit
3666 * of small cells, where we anyhow need to split the lists into
3667 * as small parts as possible.
3670 for(z=0; z<zones->n; z++)
3672 if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
3677 for(d=0; d<DIM; d++)
3679 if (zones->shift[z][d] == 0)
3683 za *= zones->size[z].x1[d] - zones->size[z].x0[d];
3687 /* 4 octants of a sphere */
3688 vold_est = 0.25*M_PI*r*r*r*r;
3689 /* 4 quarter pie slices on the edges */
3690 vold_est += 4*cl*M_PI/6.0*r*r*r;
3691 /* One rectangular volume on a face */
3692 vold_est += ca*0.5*r*r;
3694 vol2_est_tot += vold_est*za;
3698 return vol2_est_tot;
3701 /* Estimates the average size of a full j-list for super/sub setup */
3702 static int get_nsubpair_max(const nbnxn_search_t nbs,
3705 int min_ci_balanced)
3707 const nbnxn_grid_t *grid;
3709 real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
3712 grid = &nbs->grid[0];
3714 ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
3715 ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
3716 ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
3718 /* The average squared length of the diagonal of a sub cell */
3719 xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
3721     /* The formulas below are a heuristic estimate of the average nsj per si */
3722 r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
3724 if (!nbs->DomDec || nbs->zones->n == 1)
3731 sqr(grid->atom_density/grid->na_c)*
3732 nonlocal_vol2(nbs->zones,ls,r_eff_sup);
3737 /* Sub-cell interacts with itself */
3738 vol_est = ls[XX]*ls[YY]*ls[ZZ];
3739 /* 6/2 rectangular volume on the faces */
3740 vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
3741 /* 12/2 quarter pie slices on the edges */
3742 vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
3743 /* 4 octants of a sphere */
3744 vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
3746 nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
3748 /* Subtract the non-local pair count */
3749 nsp_est -= nsp_est_nl;
3753 fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
3754 nsp_est,nsp_est_nl);
3759 nsp_est = nsp_est_nl;
3762 if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
3764         /* We don't need to worry about balancing the lists */
3769 /* Thus the (average) maximum j-list size should be as follows */
3770 nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
3772         /* Since the target value is a maximum (this avoids high outliers,
3773          * which lead to load imbalance), not an average, we get more lists
3774          * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
3775          * But more importantly, the optimal GPU performance moves
3776          * to a lower number of blocks for very small blocks.
3777 * To compensate we add the maximum pair count per cj4.
3779 nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
3784 fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
3785 nsp_est,nsubpair_max);
3788 return nsubpair_max;
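/* A guarded-out numeric sketch of the target computation above, with
 * illustrative values: 1.2e5 estimated pairs balanced over 360 lists,
 * GPU_NSUBCELL 8 and 4-atom j-groups.
 */
#if 0
#include <stdio.h>

int main(void)
{
    double nsp_est         = 1.2e5;
    int    min_ci_balanced = 360;
    int    nsubpair_max    = (int)(nsp_est/min_ci_balanced + 0.5);

    nsubpair_max += 8*4;    /* the maximum pair count per cj4 */
    printf("nsubpair_max %d\n", nsubpair_max);   /* 333 + 32 = 365 */
    return 0;
}
#endif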
3791 /* Debug list print function */
3792 static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3796 for(i=0; i<nbl->nci; i++)
3798 fprintf(fp,"ci %4d shift %2d ncj %3d\n",
3799 nbl->ci[i].ci,nbl->ci[i].shift,
3800 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
3802 for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
3804 fprintf(fp," cj %5d imask %x\n",
3811 /* Debug list print function */
3812 static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3816 for(i=0; i<nbl->nsci; i++)
3818 fprintf(fp,"ci %4d shift %2d ncj4 %2d\n",
3819 nbl->sci[i].sci,nbl->sci[i].shift,
3820 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
3822 for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
3826 fprintf(fp," sj %5d imask %x\n",
3828 nbl->cj4[j4].imei[0].imask);
3834 /* Combine the pair lists *nbl generated on multiple threads into nblc */
3835 static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
3836 nbnxn_pairlist_t *nblc)
3838 int nsci,ncj4,nexcl;
3843 gmx_incons("combine_nblists does not support simple lists");
3848 nexcl = nblc->nexcl;
3849 for(i=0; i<nnbl; i++)
3851 nsci += nbl[i]->nsci;
3852 ncj4 += nbl[i]->ncj4;
3853 nexcl += nbl[i]->nexcl;
3856 if (nsci > nblc->sci_nalloc)
3858 nb_realloc_sci(nblc,nsci);
3860 if (ncj4 > nblc->cj4_nalloc)
3862 nblc->cj4_nalloc = over_alloc_small(ncj4);
3863 nbnxn_realloc_void((void **)&nblc->cj4,
3864 nblc->ncj4*sizeof(*nblc->cj4),
3865 nblc->cj4_nalloc*sizeof(*nblc->cj4),
3866 nblc->alloc,nblc->free);
3868 if (nexcl > nblc->excl_nalloc)
3870 nblc->excl_nalloc = over_alloc_small(nexcl);
3871 nbnxn_realloc_void((void **)&nblc->excl,
3872 nblc->nexcl*sizeof(*nblc->excl),
3873 nblc->excl_nalloc*sizeof(*nblc->excl),
3874 nblc->alloc,nblc->free);
3877 /* Each thread should copy its own data to the combined arrays,
3878 * as otherwise data will go back and forth between different caches.
3880 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
3881 for(n=0; n<nnbl; n++)
3888 const nbnxn_pairlist_t *nbli;
3890 /* Determine the offset in the combined data for our thread */
3891 sci_offset = nblc->nsci;
3892 cj4_offset = nblc->ncj4;
3893 ci_offset = nblc->nci_tot;
3894 excl_offset = nblc->nexcl;
3898 sci_offset += nbl[i]->nsci;
3899 cj4_offset += nbl[i]->ncj4;
3900 ci_offset += nbl[i]->nci_tot;
3901 excl_offset += nbl[i]->nexcl;
3906 for(i=0; i<nbli->nsci; i++)
3908 nblc->sci[sci_offset+i] = nbli->sci[i];
3909 nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
3910 nblc->sci[sci_offset+i].cj4_ind_end += cj4_offset;
3913 for(j4=0; j4<nbli->ncj4; j4++)
3915 nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
3916 nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
3917 nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
3920 for(j4=0; j4<nbli->nexcl; j4++)
3922 nblc->excl[excl_offset+j4] = nbli->excl[j4];
3926 for(n=0; n<nnbl; n++)
3928 nblc->nsci += nbl[n]->nsci;
3929 nblc->ncj4 += nbl[n]->ncj4;
3930 nblc->nci_tot += nbl[n]->nci_tot;
3931 nblc->nexcl += nbl[n]->nexcl;
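/* A guarded-out sketch of the offset computation above: each thread sums
 * the sizes of all preceding lists, so the copies into the combined
 * arrays are disjoint and can run fully in parallel.
 */
#if 0
#include <stdio.h>

int main(void)
{
    int nsci[4] = { 7, 5, 9, 4 };   /* per-thread list sizes */
    int n, i;

    for (n = 0; n < 4; n++)
    {
        int offset = 0;

        for (i = 0; i < n; i++) { offset += nsci[i]; }
        printf("thread %d copies %d entries at offset %d\n",
               n, nsci[n], offset);
    }
    return 0;
}
#endif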
3935 /* Returns the next ci to be processed by our thread */
3936 static gmx_bool next_ci(const nbnxn_grid_t *grid,
3938 int nth,int ci_block,
3939 int *ci_x,int *ci_y,
3945 if (*ci_b == ci_block)
3947 /* Jump to the next block assigned to this task */
3948 *ci += (nth - 1)*ci_block;
3952 if (*ci >= grid->nc*conv)
3957 while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
3960 if (*ci_y == grid->ncy)
3970 /* Returns the distance^2 for which we put cell pairs in the list
3971 * without checking atom pair distances. This is usually < rlist^2.
3973 static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
3974 const nbnxn_grid_t *gridj,
3978 /* If the distance between two sub-cell bounding boxes is less
3979 * than this distance, do not check the distance between
3980 * all particle pairs in the sub-cell, since then it is likely
3981 * that the box pair has atom pairs within the cut-off.
3982 * We use the nblist cut-off minus 0.5 times the average x/y diagonal
3983 * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
3984 * Using more than 0.5 gains at most 0.5%.
3985 * If forces are calculated more than twice, the performance gain
3986 * in the force calculation outweighs the cost of checking.
3987      * Note that with sub-cell lists, the atom-pair distance check
3988      * is only performed when only 1 out of 8 sub-cells is within range;
3989      * this is because the GPU is much faster than the CPU.
3994 bbx = 0.5*(gridi->sx + gridj->sx);
3995 bby = 0.5*(gridi->sy + gridj->sy);
3998 bbx /= GPU_NSUBCELL_X;
3999 bby /= GPU_NSUBCELL_Y;
4002 rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
4007 return (float)((1+GMX_FLOAT_EPS)*rbb2);
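/* A guarded-out worked example of the bounding-box-only distance above,
 * assuming rlist 1.0 nm and 0.4 nm average cell sizes in x and y.
 */
#if 0
#include <math.h>
#include <stdio.h>

int main(void)
{
    double rlist = 1.0, bbx = 0.4, bby = 0.4;
    double rbb   = rlist - 0.5*sqrt(bbx*bbx + bby*bby);
    double rbb2  = (rbb > 0 ? rbb*rbb : 0);

    printf("rbb2 %.3f nm^2\n", rbb2);   /* ~0.514 nm^2 */
    return 0;
}
#endif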
4011 static int get_ci_block_size(const nbnxn_grid_t *gridi,
4012 gmx_bool bDomDec, int nth)
4014 const int ci_block_enum = 5;
4015 const int ci_block_denom = 11;
4016 const int ci_block_min_atoms = 16;
4019 /* Here we decide how to distribute the blocks over the threads.
4020 * We use prime numbers to try to avoid that the grid size becomes
4021 * a multiple of the number of threads, which would lead to some
4022 * threads getting "inner" pairs and others getting boundary pairs,
4023      * which in turn leads to load imbalance between threads.
4024 * Set the block size as 5/11/ntask times the average number of cells
4025 * in a y,z slab. This should ensure a quite uniform distribution
4026      * of the grid parts of the different threads along all three grid
4027 * zone boundaries with 3D domain decomposition. At the same time
4028 * the blocks will not become too small.
4030 ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
4032 /* Ensure the blocks are not too small: avoids cache invalidation */
4033 if (ci_block*gridi->na_sc < ci_block_min_atoms)
4035 ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
4038 /* Without domain decomposition
4039 * or with less than 3 blocks per task, divide in nth blocks.
4041 if (!bDomDec || ci_block*3*nth > gridi->nc)
4043 ci_block = (gridi->nc + nth - 1)/nth;
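/* A guarded-out numeric sketch of the 5/11 block-size rule above, with
 * illustrative grid values: 4000 cells, 20 x-columns, 8 threads and
 * 4 atoms per cell.
 */
#if 0
#include <stdio.h>

int main(void)
{
    int nc = 4000, ncx = 20, nth = 8, na_sc = 4;
    int ci_block = (nc*5)/(11*ncx*nth);   /* 5/11 of a y,z slab per task */

    if (ci_block*na_sc < 16)              /* keep blocks >= 16 atoms */
    {
        ci_block = (16 + na_sc - 1)/na_sc;
    }
    printf("ci_block %d\n", ci_block);    /* prints 11 */
    return 0;
}
#endif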
4049 /* Generates the part of pair-list nbl assigned to our thread */
4050 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
4051 const nbnxn_grid_t *gridi,
4052 const nbnxn_grid_t *gridj,
4053 nbnxn_search_work_t *work,
4054 const nbnxn_atomdata_t *nbat,
4055 const t_blocka *excl,
4059 gmx_bool bFBufferFlag,
4062 int min_ci_balanced,
4064 nbnxn_pairlist_t *nbl)
4071 int ci_b,ci,ci_x,ci_y,ci_xy,cj;
4078 const float *bb_i,*bbcz_i,*bbcz_j;
4080 real bx0,bx1,by0,by1,bz0,bz1;
4082 real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
4083 int cxf,cxl,cyf,cyf_x,cyl;
4088 int gridi_flag_shift=0,gridj_flag_shift=0;
4089 unsigned *gridj_flag=NULL;
4090 int ncj_old_i,ncj_old_j;
4092 nbs_cycle_start(&work->cc[enbsCCsearch]);
4094 if (gridj->bSimple != nbl->bSimple)
4096 gmx_incons("Grid incompatible with pair-list");
4100 nbl->na_sc = gridj->na_sc;
4101 nbl->na_ci = gridj->na_c;
4102 nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4103 na_cj_2log = get_2log(nbl->na_cj);
4109 /* Determine conversion of clusters to flag blocks */
4110 gridi_flag_shift = 0;
4111 while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4115 gridj_flag_shift = 0;
4116 while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4121 gridj_flag = work->buffer_flags.flag;
4124 copy_mat(nbs->box,box);
4126 rl2 = nbl->rlist*nbl->rlist;
4128 rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
4132 fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
4135 /* Set the shift range */
4136 for(d=0; d<DIM; d++)
4138 /* Check if we need periodicity shifts.
4139 * Without PBC or with domain decomposition we don't need them.
4141 if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
4148 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4159 if (nbl->bSimple && !gridi->bSimple)
4161 conv_i = gridi->na_sc/gridj->na_sc;
4162 bb_i = gridi->bb_simple;
4163 bbcz_i = gridi->bbcz_simple;
4164 flags_i = gridi->flags_simple;
4170 bbcz_i = gridi->bbcz;
4171 flags_i = gridi->flags;
4173 cell0_i = gridi->cell0*conv_i;
4175 bbcz_j = gridj->bbcz;
4179 /* Blocks of the conversion factor - 1 give a large repeat count
4180 * combined with a small block size. This should result in good
4181 * load balancing for both small and large domains.
4183 ci_block = conv_i - 1;
4187 fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4188 gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
4194     /* Initialize ci_b and ci to one position before where we want them to start,
4195 * as they will both be incremented in next_ci.
4198 ci = th*ci_block - 1;
4201 while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
4203 if (nbl->bSimple && flags_i[ci] == 0)
4208 ncj_old_i = nbl->ncj;
4211 if (gridj != gridi && shp[XX] == 0)
4215 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
4219 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4221 if (bx1 < gridj->c0[XX])
4223 d2cx = sqr(gridj->c0[XX] - bx1);
4232 ci_xy = ci_x*gridi->ncy + ci_y;
4234 /* Loop over shift vectors in three dimensions */
4235 for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
4237 shz = tz*box[ZZ][ZZ];
4239 bz0 = bbcz_i[ci*NNBSBB_D ] + shz;
4240 bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4252 d2z = sqr(bz0 - box[ZZ][ZZ]);
4255 d2z_cx = d2z + d2cx;
4263 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
4268 /* The check with bz1_frac close to or larger than 1 comes later */
4270 for (ty=-shp[YY]; ty<=shp[YY]; ty++)
4272 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
4276 by0 = bb_i[ci*NNBSBB_B +YY] + shy;
4277 by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
4281 by0 = gridi->c0[YY] + (ci_y )*gridi->sy + shy;
4282 by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
4285 get_cell_range(by0,by1,
4286 gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
4296 if (by1 < gridj->c0[YY])
4298 d2z_cy += sqr(gridj->c0[YY] - by1);
4300 else if (by0 > gridj->c1[YY])
4302 d2z_cy += sqr(by0 - gridj->c1[YY]);
4305 for (tx=-shp[XX]; tx<=shp[XX]; tx++)
4307 shift = XYZ2IS(tx,ty,tz);
4309 #ifdef NBNXN_SHIFT_BACKWARD
4310 if (gridi == gridj && shift > CENTRAL)
4316 shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
4320 bx0 = bb_i[ci*NNBSBB_B +XX] + shx;
4321 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
4325 bx0 = gridi->c0[XX] + (ci_x )*gridi->sx + shx;
4326 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
4329 get_cell_range(bx0,bx1,
4330 gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
4341 new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4346 new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4350 #ifndef NBNXN_SHIFT_BACKWARD
4353 if (shift == CENTRAL && gridi == gridj &&
4357 /* Leave the pairs with i > j.
4358 * x is the major index, so skip half of it.
4365 set_icell_bb_simple(bb_i,ci,shx,shy,shz,
4370 set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
4374 nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
4375 gridi->na_c,nbat->xstride,nbat->x,
4378 for(cx=cxf; cx<=cxl; cx++)
4381 if (gridj->c0[XX] + cx*gridj->sx > bx1)
4383 d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
4385 else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
4387 d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
4390 #ifndef NBNXN_SHIFT_BACKWARD
4391 if (gridi == gridj &&
4392 cx == 0 && cyf < ci_y)
4394 if (gridi == gridj &&
4395 cx == 0 && shift == CENTRAL && cyf < ci_y)
4398 /* Leave the pairs with i > j.
4399 * Skip half of y when i and j have the same x.
4408 for(cy=cyf_x; cy<=cyl; cy++)
4410 c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
4411 c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
4412 #ifdef NBNXN_SHIFT_BACKWARD
4413 if (gridi == gridj &&
4414 shift == CENTRAL && c0 < ci)
4421 if (gridj->c0[YY] + cy*gridj->sy > by1)
4423 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
4425 else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
4427 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
4429 if (c1 > c0 && d2zxy < rl2)
4431 cs = c0 + (int)(bz1_frac*(c1 - c0));
4439                                     /* Find the lowest cell that can possibly be within range */
4444 (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
4445 d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
4450                                     /* Find the highest cell that can possibly be within range */
4455 (bbcz_j[cl*NNBSBB_D] <= bz1 ||
4456 d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
4461 #ifdef NBNXN_REFCODE
4463 /* Simple reference code, for debugging,
4464 * overrides the more complex code above.
4469 for(k=c0; k<c1; k++)
4471 if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4472 bb+k*NNBSBB_B) < rl2 &&
4477 if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4478 bb+k*NNBSBB_B) < rl2 &&
4489 /* We want each atom/cell pair only once,
4490 * only use cj >= ci.
4492 #ifndef NBNXN_SHIFT_BACKWARD
4495 if (shift == CENTRAL)
4504 /* For f buffer flags with simple lists */
4505 ncj_old_j = nbl->ncj;
4507 switch (nb_kernel_type)
4509 case nbnxnk4x4_PlainC:
4510 check_subcell_list_space_simple(nbl,cl-cf+1);
4512 make_cluster_list_simple(gridj,
4514 (gridi == gridj && shift == CENTRAL),
4519 #ifdef GMX_NBNXN_SIMD_4XN
4520 case nbnxnk4xN_SIMD_4xN:
4521 check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4522 make_cluster_list_simd_4xn(gridj,
4524 (gridi == gridj && shift == CENTRAL),
4530 #ifdef GMX_NBNXN_SIMD_2XNN
4531 case nbnxnk4xN_SIMD_2xNN:
4532 check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4533 make_cluster_list_simd_2xnn(gridj,
4535 (gridi == gridj && shift == CENTRAL),
4541 case nbnxnk8x8x8_PlainC:
4542 case nbnxnk8x8x8_CUDA:
4543 check_subcell_list_space_supersub(nbl,cl-cf+1);
4544 for(cj=cf; cj<=cl; cj++)
4546 make_cluster_list_supersub(nbs,gridi,gridj,
4548 (gridi == gridj && shift == CENTRAL && ci == cj),
4549 nbat->xstride,nbat->x,
4555 ncpcheck += cl - cf + 1;
4557 if (bFBufferFlag && nbl->ncj > ncj_old_j)
4561 cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
4562 cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
4563 for(cb=cbf; cb<=cbl; cb++)
4565 gridj_flag[cb] = 1U<<th;
4573 /* Set the exclusions for this ci list */
4576 set_ci_top_excls(nbs,
4578 shift == CENTRAL && gridi == gridj,
4581 &(nbl->ci[nbl->nci]),
4586 set_sci_top_excls(nbs,
4588 shift == CENTRAL && gridi == gridj,
4590 &(nbl->sci[nbl->nsci]),
4594 /* Close this ci list */
4597 close_ci_entry_simple(nbl);
4601 close_ci_entry_supersub(nbl,
4603 progBal,min_ci_balanced,
4610 if (bFBufferFlag && nbl->ncj > ncj_old_i)
4612 work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
4616 work->ndistc = ndistc;
4618 nbs_cycle_stop(&work->cc[enbsCCsearch]);
4622 fprintf(debug,"number of distance checks %d\n",ndistc);
4623 fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
4628 print_nblist_statistics_simple(debug,nbl,nbs,rlist);
4632 print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
4638 static void reduce_buffer_flags(const nbnxn_search_t nbs,
4640 const nbnxn_buffer_flags_t *dest)
4643 const unsigned *flag;
4645 for(s=0; s<nsrc; s++)
4647 flag = nbs->work[s].buffer_flags.flag;
4649 for(b=0; b<dest->nflag; b++)
4651 dest->flag[b] |= flag[b];
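/* A guarded-out sketch of the reduction above: each search thread marks
 * the force-buffer blocks it touched with its own bit, and the
 * per-thread masks are OR-ed into one mask per block.
 */
#if 0
#include <stdio.h>

int main(void)
{
    unsigned int src[3][4] = { { 1, 0, 1, 0 },     /* thread 0: bit 0 */
                               { 0, 2, 2, 0 },     /* thread 1: bit 1 */
                               { 4, 0, 0, 0 } };   /* thread 2: bit 2 */
    unsigned int dest[4]   = { 0, 0, 0, 0 };
    int          s, b;

    for (s = 0; s < 3; s++)
    {
        for (b = 0; b < 4; b++) { dest[b] |= src[s][b]; }
    }
    for (b = 0; b < 4; b++) { printf("block %d mask %x\n", b, dest[b]); }
    return 0;
}
#endif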
4656 static void print_reduction_cost(const nbnxn_buffer_flags_t *flags,int nout)
4658 int nelem,nkeep,ncopy,nred,b,c,out;
4664 for(b=0; b<flags->nflag; b++)
4666 if (flags->flag[b] == 1)
4668             /* Only flag 0 is set, no copy or reduction required */
4672 else if (flags->flag[b] > 0)
4675 for(out=0; out<nout; out++)
4677 if (flags->flag[b] & (1U<<out))
4694 fprintf(debug,"nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
4696 nelem/(double)(flags->nflag),
4697 nkeep/(double)(flags->nflag),
4698 ncopy/(double)(flags->nflag),
4699 nred/(double)(flags->nflag));
4702 /* Make a local or non-local pair-list, depending on iloc */
4703 void nbnxn_make_pairlist(const nbnxn_search_t nbs,
4704 nbnxn_atomdata_t *nbat,
4705 const t_blocka *excl,
4707 int min_ci_balanced,
4708 nbnxn_pairlist_set_t *nbl_list,
4713 nbnxn_grid_t *gridi,*gridj;
4714 int nzi,zi,zj0,zj1,zj;
4718 nbnxn_pairlist_t **nbl;
4720 gmx_bool CombineNBLists;
4721 int np_tot,np_noq,np_hlj,nap;
4723 nnbl = nbl_list->nnbl;
4724 nbl = nbl_list->nbl;
4725 CombineNBLists = nbl_list->bCombined;
4729 fprintf(debug,"ns making %d nblists\n", nnbl);
4732 nbat->bUseBufferFlags = (nbat->nout > 1);
4733 if (nbat->bUseBufferFlags && LOCAL_I(iloc))
4735 init_buffer_flags(&nbat->buffer_flags,nbat->natoms);
4738 if (nbl_list->bSimple)
4740 switch (nb_kernel_type)
4742 #ifdef GMX_NBNXN_SIMD_4XN
4743 case nbnxnk4xN_SIMD_4xN:
4744 nbs->icell_set_x = icell_set_x_simd_4xn;
4747 #ifdef GMX_NBNXN_SIMD_2XNN
4748 case nbnxnk4xN_SIMD_2xNN:
4749 nbs->icell_set_x = icell_set_x_simd_2xnn;
4753 nbs->icell_set_x = icell_set_x_simple;
4759 #ifdef NBNXN_SEARCH_SSE
4760 nbs->icell_set_x = icell_set_x_supersub_sse8;
4762 nbs->icell_set_x = icell_set_x_supersub;
4768 /* Only zone (grid) 0 vs 0 */
4775 nzi = nbs->zones->nizone;
4778 if (!nbl_list->bSimple && min_ci_balanced > 0)
4780 nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
4787 /* Clear all pair-lists */
4788 for(th=0; th<nnbl; th++)
4790 clear_pairlist(nbl[th]);
4793 for(zi=0; zi<nzi; zi++)
4795 gridi = &nbs->grid[zi];
4797 if (NONLOCAL_I(iloc))
4799 zj0 = nbs->zones->izone[zi].j0;
4800 zj1 = nbs->zones->izone[zi].j1;
4806 for(zj=zj0; zj<zj1; zj++)
4808 gridj = &nbs->grid[zj];
4812 fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
4815 nbs_cycle_start(&nbs->cc[enbsCCsearch]);
4817 if (nbl[0]->bSimple && !gridi->bSimple)
4819 /* Hybrid list, determine blocking later */
4824 ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl);
4827 #pragma omp parallel for num_threads(nnbl) schedule(static)
4828 for(th=0; th<nnbl; th++)
4830 if (nbat->bUseBufferFlags && zi == 0 && zj == 0)
4832 init_buffer_flags(&nbs->work[th].buffer_flags,nbat->natoms);
4835 if (CombineNBLists && th > 0)
4837 clear_pairlist(nbl[th]);
4840                 /* Divide the i super cells equally over the nblists */
4841 nbnxn_make_pairlist_part(nbs,gridi,gridj,
4842 &nbs->work[th],nbat,excl,
4846 nbat->bUseBufferFlags,
4848 (LOCAL_I(iloc) || nbs->zones->n <= 2),
4853 nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
4858 for(th=0; th<nnbl; th++)
4860 inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
4862 if (nbl_list->bSimple)
4864 np_tot += nbl[th]->ncj;
4865 np_noq += nbl[th]->work->ncj_noq;
4866 np_hlj += nbl[th]->work->ncj_hlj;
4870 /* This count ignores potential subsequent pair pruning */
4871 np_tot += nbl[th]->nci_tot;
4874 nap = nbl[0]->na_ci*nbl[0]->na_cj;
4875 nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
4876 nbl_list->natpair_lj = np_noq*nap;
4877 nbl_list->natpair_q = np_hlj*nap/2;
4879 if (CombineNBLists && nnbl > 1)
4881 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
4883 combine_nblists(nnbl-1,nbl+1,nbl[0]);
4885 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
4890 if (nbat->bUseBufferFlags)
4892 reduce_buffer_flags(nbs,nnbl,&nbat->buffer_flags);
4896 print_supersub_nsp("nsubpair",nbl[0],iloc);
4899 /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4902 nbs->search_count++;
4904 if (nbs->print_cycles &&
4905 (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
4906 nbs->search_count % 100 == 0)
4908 nbs_cycle_print(stderr,nbs);
4911 if (debug && (CombineNBLists && nnbl > 1))
4913 if (nbl[0]->bSimple)
4915 print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
4919 print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
4927 if (nbl[0]->bSimple)
4929 print_nblist_ci_cj(debug,nbl[0]);
4933 print_nblist_sci_cj(debug,nbl[0]);
4937 if (nbat->bUseBufferFlags)
4939 print_reduction_cost(&nbat->buffer_flags,nnbl);