Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_search.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  *          GROningen MAchine for Chemical Simulations
9  *
10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12  * Copyright (c) 2001-2012, The GROMACS development team,
13  * check out http://www.gromacs.org for more information.
14
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  *
20  * If you want to redistribute modifications, please consider that
21  * scientific software is very special. Version control is crucial -
22  * bugs must be traceable. We will be happy to consider code for
23  * inclusion in the official distribution, but derived work must not
24  * be called official GROMACS. Details are found in the README & COPYING
25  * files - if they are missing, get the official version at www.gromacs.org.
26  *
27  * To help us fund GROMACS development, we humbly ask that you cite
28  * the papers on the package - you can find them in the top README file.
29  *
30  * For more info, check our website at http://www.gromacs.org
31  */
32
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif
36
37 #include <math.h>
38 #include <string.h>
39 #include "sysstuff.h"
40 #include "smalloc.h"
41 #include "macros.h"
42 #include "maths.h"
43 #include "vec.h"
44 #include "pbc.h"
45 #include "nbnxn_consts.h"
46 #include "nbnxn_internal.h"
47 #include "nbnxn_atomdata.h"
48 #include "nbnxn_search.h"
49 #include "gmx_cyclecounter.h"
50 #include "gmxfio.h"
51 #include "gmx_omp_nthreads.h"
52 #include "nrnb.h"
53
54
55 /* Pair search box lower and upper corner in x,y,z.
56  * Store this in 4 iso 3 reals, which is useful with SSE.
57  * To avoid complicating the code we also use 4 without SSE.
58  */
59 #define NNBSBB_C         4
60 #define NNBSBB_B         (2*NNBSBB_C)
61 /* Pair search box lower and upper bound in z only. */
62 #define NNBSBB_D         2
63 /* Pair search box lower and upper corner x,y,z indices */
64 #define BBL_X  0
65 #define BBL_Y  1
66 #define BBL_Z  2
67 #define BBU_X  4
68 #define BBU_Y  5
69 #define BBU_Z  6
70
71
72 #ifdef NBNXN_SEARCH_SSE
73
74 #ifndef GMX_DOUBLE
75 #define NBNXN_SEARCH_SSE_SINGLE
76 #include "gmx_x86_simd_single.h"
77 #else
78 #include "gmx_x86_simd_double.h"
79 #endif
80
81 #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
82 #define NBNXN_8BB_SSE
83 #endif
84
85 /* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
86  * Here AVX-256 turns out to be slightly slower than AVX-128.
87  */
88 #define STRIDE_8BB        4
89 #define STRIDE_8BB_2LOG   2
90
91
92 /* The functions below are macros as they are performance sensitive */
93
94 /* 4x4 list, pack=4: no complex conversion required */
95 /* i-cluster to j-cluster conversion */
96 #define CI_TO_CJ_J4(ci)   (ci)
97 /* cluster index to coordinate array index conversion */
98 #define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
99 #define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
100
101 /* 4x2 list, pack=4: j-cluster size is half the packing width */
102 /* i-cluster to j-cluster conversion */
103 #define CI_TO_CJ_J2(ci)  ((ci)<<1)
104 /* cluster index to coordinate array index conversion */
105 #define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
106 #define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
107
108 /* 4x8 list, pack=8: i-cluster size is half the packing width */
109 /* i-cluster to j-cluster conversion */
110 #define CI_TO_CJ_J8(ci)  ((ci)>>1)
111 /* cluster index to coordinate array index conversion */
112 #define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
113 #define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
114
115 /* The j-cluster size is matched to the SIMD width */
116 #ifndef GMX_DOUBLE
117 /* 128 bits can hold 4 floats */
118 #define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
119 #define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
120 #define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
121 /* 256 bits can hold 8 floats */
122 #define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
123 #define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
124 #define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
125 #else
126 /* 128 bits can hold 2 doubles */
127 #define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
128 #define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
129 #define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
130 /* 256 bits can hold 4 doubles */
131 #define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
132 #define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
133 #define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
134 #endif
135
136 #endif /* NBNXN_SEARCH_SSE */
137
138
139 /* Interaction masks for 4xN atom interactions.
140  * Bit i*CJ_SIZE + j tells if atom i and j interact.
141  */
142 /* All interaction mask is the same for all kernels */
143 #define NBNXN_INT_MASK_ALL        0xffffffff
144 /* 4x4 kernel diagonal mask */
145 #define NBNXN_INT_MASK_DIAG       0x08ce
146 /* 4x2 kernel diagonal masks */
147 #define NBNXN_INT_MASK_DIAG_J2_0  0x0002
148 #define NBNXN_INT_MASK_DIAG_J2_1  0x002F
149 /* 4x8 kernel diagonal masks */
150 #define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
151 #define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
152
153
154 #ifdef NBNXN_SEARCH_SSE
155 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
156 #define NBNXN_BBXXXX
157 /* Size of bounding box corners quadruplet */
158 #define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
159 #endif
160
161 /* We shift the i-particles backward for PBC.
162  * This leads to more conditionals than shifting forward.
163  * We do this to get more balanced pair lists.
164  */
165 #define NBNXN_SHIFT_BACKWARD
166
167
168 /* This define is a lazy way to avoid interdependence of the grid
169  * and searching data structures.
170  */
171 #define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
172
173
174 static void nbs_cycle_clear(nbnxn_cycle_t *cc)
175 {
176     int i;
177
178     for(i=0; i<enbsCCnr; i++)
179     {
180         cc[i].count = 0;
181         cc[i].c     = 0;
182     }
183 }
184
185 static double Mcyc_av(const nbnxn_cycle_t *cc)
186 {
187     return (double)cc->c*1e-6/cc->count;
188 }
189
190 static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
191 {
192     int n;
193     int t;
194
195     fprintf(fp,"\n");
196     fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
197             nbs->cc[enbsCCgrid].count,
198             Mcyc_av(&nbs->cc[enbsCCgrid]),
199             Mcyc_av(&nbs->cc[enbsCCsearch]),
200             Mcyc_av(&nbs->cc[enbsCCreducef]));
201
202     if (nbs->nthread_max > 1)
203     {
204         if (nbs->cc[enbsCCcombine].count > 0)
205         {
206             fprintf(fp," comb %5.2f",
207                     Mcyc_av(&nbs->cc[enbsCCcombine]));
208         }
209         fprintf(fp," s. th");
210         for(t=0; t<nbs->nthread_max; t++)
211         {
212             fprintf(fp," %4.1f",
213                     Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
214         }
215     }
216     fprintf(fp,"\n");
217 }
218
219 static void nbnxn_grid_init(nbnxn_grid_t * grid)
220 {
221     grid->cxy_na      = NULL;
222     grid->cxy_ind     = NULL;
223     grid->cxy_nalloc  = 0;
224     grid->bb          = NULL;
225     grid->bbj         = NULL;
226     grid->nc_nalloc   = 0;
227 }
228
229 static int get_2log(int n)
230 {
231     int log2;
232
233     log2 = 0;
234     while ((1<<log2) < n)
235     {
236         log2++;
237     }
238     if ((1<<log2) != n)
239     {
240         gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
241     }
242
243     return log2;
244 }
245
246 static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
247 {
248     switch (nb_kernel_type)
249     {
250     case nbk4x4_PlainC:
251     case nbk4xN_X86_SIMD128:
252     case nbk4xN_X86_SIMD256:
253         return NBNXN_CPU_CLUSTER_I_SIZE;
254     case nbk8x8x8_CUDA:
255     case nbk8x8x8_PlainC:
256         /* The cluster size for super/sub lists is only set here.
257          * Any value should work for the pair-search and atomdata code.
258          * The kernels, of course, might require a particular value.
259          */
260         return NBNXN_GPU_CLUSTER_SIZE;
261     default:
262         gmx_incons("unknown kernel type");
263     }
264
265     return 0;
266 }
267
268 int nbnxn_kernel_to_cj_size(int nb_kernel_type)
269 {
270     switch (nb_kernel_type)
271     {
272     case nbk4x4_PlainC:
273         return NBNXN_CPU_CLUSTER_I_SIZE;
274     case nbk4xN_X86_SIMD128:
275         /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
276         return 16/sizeof(real);
277     case nbk4xN_X86_SIMD256:
278         /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
279         return 32/sizeof(real);
280     case nbk8x8x8_CUDA:
281     case nbk8x8x8_PlainC:
282         return nbnxn_kernel_to_ci_size(nb_kernel_type);
283     default:
284         gmx_incons("unknown kernel type");
285     }
286
287     return 0;
288 }
289
290 static int ci_to_cj(int na_cj_2log,int ci)
291 {
292     switch (na_cj_2log)
293     {
294     case 2: return  ci;     break;
295     case 1: return (ci<<1); break;
296     case 3: return (ci>>1); break;
297     }
298
299     return 0;
300 }
301
302 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
303 {
304     if (nb_kernel_type == nbkNotSet)
305     {
306         gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
307     }
308
309     switch (nb_kernel_type)
310     {
311     case nbk8x8x8_CUDA:
312     case nbk8x8x8_PlainC:
313         return FALSE;
314
315     case nbk4x4_PlainC:
316     case nbk4xN_X86_SIMD128:
317     case nbk4xN_X86_SIMD256:
318         return TRUE;
319
320     default:
321         gmx_incons("Invalid nonbonded kernel type passed!");
322         return FALSE;
323     }
324 }
325
326 void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
327                        ivec *n_dd_cells,
328                        gmx_domdec_zones_t *zones,
329                        int nthread_max)
330 {
331     nbnxn_search_t nbs;
332     int d,g,t;
333
334     snew(nbs,1);
335     *nbs_ptr = nbs;
336
337     nbs->DomDec = (n_dd_cells != NULL);
338
339     clear_ivec(nbs->dd_dim);
340     nbs->ngrid = 1;
341     if (nbs->DomDec)
342     {
343         nbs->zones = zones;
344
345         for(d=0; d<DIM; d++)
346         {
347             if ((*n_dd_cells)[d] > 1)
348             {
349                 nbs->dd_dim[d] = 1;
350                 /* Each grid matches a DD zone */
351                 nbs->ngrid *= 2;
352             }
353         }
354     }
355
356     snew(nbs->grid,nbs->ngrid);
357     for(g=0; g<nbs->ngrid; g++)
358     {
359         nbnxn_grid_init(&nbs->grid[g]);
360     }
361     nbs->cell        = NULL;
362     nbs->cell_nalloc = 0;
363     nbs->a           = NULL;
364     nbs->a_nalloc    = 0;
365
366     nbs->nthread_max = nthread_max;
367
368     /* Initialize the work data structures for each thread */
369     snew(nbs->work,nbs->nthread_max);
370     for(t=0; t<nbs->nthread_max; t++)
371     {
372         nbs->work[t].cxy_na           = NULL;
373         nbs->work[t].cxy_na_nalloc    = 0;
374         nbs->work[t].sort_work        = NULL;
375         nbs->work[t].sort_work_nalloc = 0;
376     }
377
378     /* Initialize detailed nbsearch cycle counting */
379     nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
380     nbs->search_count = 0;
381     nbs_cycle_clear(nbs->cc);
382     for(t=0; t<nbs->nthread_max; t++)
383     {
384         nbs_cycle_clear(nbs->work[t].cc);
385     }
386 }
387
388 static real grid_atom_density(int n,rvec corner0,rvec corner1)
389 {
390     rvec size;
391
392     rvec_sub(corner1,corner0,size);
393
394     return n/(size[XX]*size[YY]*size[ZZ]);
395 }
396
397 static int set_grid_size_xy(const nbnxn_search_t nbs,
398                             nbnxn_grid_t *grid,
399                             int n,rvec corner0,rvec corner1,
400                             real atom_density,
401                             int XFormat)
402 {
403     rvec size;
404     int  na_c;
405     real adens,tlen,tlen_x,tlen_y,nc_max;
406     int  t;
407
408     rvec_sub(corner1,corner0,size);
409
410     if (n > grid->na_sc)
411     {
412         /* target cell length */
413         if (grid->bSimple)
414         {
415             /* To minimize the zero interactions, we should make
416              * the largest of the i/j cell cubic.
417              */
418             na_c = max(grid->na_c,grid->na_cj);
419
420             /* Approximately cubic cells */
421             tlen   = pow(na_c/atom_density,1.0/3.0);
422             tlen_x = tlen;
423             tlen_y = tlen;
424         }
425         else
426         {
427             /* Approximately cubic sub cells */
428             tlen   = pow(grid->na_c/atom_density,1.0/3.0);
429             tlen_x = tlen*GPU_NSUBCELL_X;
430             tlen_y = tlen*GPU_NSUBCELL_Y;
431         }
432         /* We round ncx and ncy down, because we get less cell pairs
433          * in the nbsist when the fixed cell dimensions (x,y) are
434          * larger than the variable one (z) than the other way around.
435          */
436         grid->ncx = max(1,(int)(size[XX]/tlen_x));
437         grid->ncy = max(1,(int)(size[YY]/tlen_y));
438     }
439     else
440     {
441         grid->ncx = 1;
442         grid->ncy = 1;
443     }
444
445     /* We need one additional cell entry for particles moved by DD */
446     if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
447     {
448         grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
449         srenew(grid->cxy_na,grid->cxy_nalloc);
450         srenew(grid->cxy_ind,grid->cxy_nalloc+1);
451     }
452     for(t=0; t<nbs->nthread_max; t++)
453     {
454         if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
455         {
456             nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
457             srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
458         }
459     }
460
461     /* Worst case scenario of 1 atom in each last cell */
462     if (grid->na_cj <= grid->na_c)
463     {
464         nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
465     }
466     else
467     {
468         nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
469     }
470
471     if (nc_max > grid->nc_nalloc)
472     {
473         int bb_nalloc;
474
475         grid->nc_nalloc = over_alloc_large(nc_max);
476         srenew(grid->nsubc,grid->nc_nalloc);
477         srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
478 #ifdef NBNXN_8BB_SSE
479         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
480 #else
481         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
482 #endif
483         sfree_aligned(grid->bb);
484         /* This snew also zeros the contents, this avoid possible
485          * floating exceptions in SSE with the unused bb elements.
486          */
487         snew_aligned(grid->bb,bb_nalloc,16);
488
489         if (grid->bSimple)
490         {
491             if (grid->na_cj == grid->na_c)
492             {
493                 grid->bbj = grid->bb;
494             }
495             else
496             {
497                 sfree_aligned(grid->bbj);
498                 snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
499             }
500         }
501
502         srenew(grid->flags,grid->nc_nalloc);
503     }
504
505     copy_rvec(corner0,grid->c0);
506     copy_rvec(corner1,grid->c1);
507     grid->sx = size[XX]/grid->ncx;
508     grid->sy = size[YY]/grid->ncy;
509     grid->inv_sx = 1/grid->sx;
510     grid->inv_sy = 1/grid->sy;
511
512     return nc_max;
513 }
514
515 #define SORT_GRID_OVERSIZE 2
516 #define SGSF (SORT_GRID_OVERSIZE + 1)
517
518 static void sort_atoms(int dim,gmx_bool Backwards,
519                        int *a,int n,rvec *x,
520                        real h0,real invh,int nsort,int *sort)
521 {
522     int i,c;
523     int zi,zim;
524     int cp,tmp;
525
526     if (n <= 1)
527     {
528         /* Nothing to do */
529         return;
530     }
531
532     /* For small oversize factors clearing the whole area is fastest.
533      * For large oversize we should clear the used elements after use.
534      */
535     for(i=0; i<nsort; i++)
536     {
537         sort[i] = -1;
538     }
539     /* Sort the particles using a simple index sort */
540     for(i=0; i<n; i++)
541     {
542         /* The cast takes care of float-point rounding effects below zero.
543          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
544          * times the box height out of the box.
545          */
546         zi = (int)((x[a[i]][dim] - h0)*invh);
547
548 #ifdef DEBUG_NBNXN_GRIDDING
549         if (zi < 0 || zi >= nsort)
550         {
551             gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
552                       a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
553         }
554 #endif
555
556         /* Ideally this particle should go in sort cell zi,
557          * but that might already be in use,
558          * in that case find the first empty cell higher up
559          */
560         if (sort[zi] < 0)
561         {
562             sort[zi] = a[i];
563         }
564         else
565         {
566             /* We have multiple atoms in the same sorting slot.
567              * Sort on real z for minimal bounding box size.
568              * There is an extra check for identical z to ensure
569              * well-defined output order, independent of input order
570              * to ensure binary reproducibility after restarts.
571              */
572             while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
573                                     (x[a[i]][dim] == x[sort[zi]][dim] &&
574                                      a[i] > sort[zi])))
575             {
576                 zi++;
577             }
578
579             if (sort[zi] >= 0)
580             {
581                 /* Shift all elements by one slot until we find an empty slot */
582                 cp = sort[zi];
583                 zim = zi + 1;
584                 while (sort[zim] >= 0)
585                 {
586                     tmp = sort[zim];
587                     sort[zim] = cp;
588                     cp  = tmp;
589                     zim++;
590                 }
591                 sort[zim] = cp;
592             }
593             sort[zi] = a[i];
594         }
595     }
596
597     c = 0;
598     if (!Backwards)
599     {
600         for(zi=0; zi<nsort; zi++)
601         {
602             if (sort[zi] >= 0)
603             {
604                 a[c++] = sort[zi];
605             }
606         }
607     }
608     else
609     {
610         for(zi=nsort-1; zi>=0; zi--)
611         {
612             if (sort[zi] >= 0)
613             {
614                 a[c++] = sort[zi];
615             }
616         }
617     }
618     if (c < n)
619     {
620         gmx_incons("Lost particles while sorting");
621     }
622 }
623
624 #ifdef GMX_DOUBLE
625 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
626 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
627 #else
628 #define R2F_D(x) (x)
629 #define R2F_U(x) (x)
630 #endif
631
632 /* Coordinate order x,y,z, bb order xyz0 */
633 static void calc_bounding_box(int na,int stride,const real *x,float *bb)
634 {
635     int  i,j;
636     real xl,xh,yl,yh,zl,zh;
637
638     i = 0;
639     xl = x[i+XX];
640     xh = x[i+XX];
641     yl = x[i+YY];
642     yh = x[i+YY];
643     zl = x[i+ZZ];
644     zh = x[i+ZZ];
645     i += stride;
646     for(j=1; j<na; j++)
647     {
648         xl = min(xl,x[i+XX]);
649         xh = max(xh,x[i+XX]);
650         yl = min(yl,x[i+YY]);
651         yh = max(yh,x[i+YY]);
652         zl = min(zl,x[i+ZZ]);
653         zh = max(zh,x[i+ZZ]);
654         i += stride;
655     }
656     /* Note: possible double to float conversion here */
657     bb[BBL_X] = R2F_D(xl);
658     bb[BBL_Y] = R2F_D(yl);
659     bb[BBL_Z] = R2F_D(zl);
660     bb[BBU_X] = R2F_U(xh);
661     bb[BBU_Y] = R2F_U(yh);
662     bb[BBU_Z] = R2F_U(zh);
663 }
664
665 /* Packed coordinates, bb order xyz0 */
666 static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
667 {
668     int  j;
669     real xl,xh,yl,yh,zl,zh;
670
671     xl = x[XX*PACK_X4];
672     xh = x[XX*PACK_X4];
673     yl = x[YY*PACK_X4];
674     yh = x[YY*PACK_X4];
675     zl = x[ZZ*PACK_X4];
676     zh = x[ZZ*PACK_X4];
677     for(j=1; j<na; j++)
678     {
679         xl = min(xl,x[j+XX*PACK_X4]);
680         xh = max(xh,x[j+XX*PACK_X4]);
681         yl = min(yl,x[j+YY*PACK_X4]);
682         yh = max(yh,x[j+YY*PACK_X4]);
683         zl = min(zl,x[j+ZZ*PACK_X4]);
684         zh = max(zh,x[j+ZZ*PACK_X4]);
685     }
686     /* Note: possible double to float conversion here */
687     bb[BBL_X] = R2F_D(xl);
688     bb[BBL_Y] = R2F_D(yl);
689     bb[BBL_Z] = R2F_D(zl);
690     bb[BBU_X] = R2F_U(xh);
691     bb[BBU_Y] = R2F_U(yh);
692     bb[BBU_Z] = R2F_U(zh);
693 }
694
695 /* Packed coordinates, bb order xyz0 */
696 static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
697 {
698     int  j;
699     real xl,xh,yl,yh,zl,zh;
700
701     xl = x[XX*PACK_X8];
702     xh = x[XX*PACK_X8];
703     yl = x[YY*PACK_X8];
704     yh = x[YY*PACK_X8];
705     zl = x[ZZ*PACK_X8];
706     zh = x[ZZ*PACK_X8];
707     for(j=1; j<na; j++)
708     {
709         xl = min(xl,x[j+XX*PACK_X8]);
710         xh = max(xh,x[j+XX*PACK_X8]);
711         yl = min(yl,x[j+YY*PACK_X8]);
712         yh = max(yh,x[j+YY*PACK_X8]);
713         zl = min(zl,x[j+ZZ*PACK_X8]);
714         zh = max(zh,x[j+ZZ*PACK_X8]);
715     }
716     /* Note: possible double to float conversion here */
717     bb[BBL_X] = R2F_D(xl);
718     bb[BBL_Y] = R2F_D(yl);
719     bb[BBL_Z] = R2F_D(zl);
720     bb[BBU_X] = R2F_U(xh);
721     bb[BBU_Y] = R2F_U(yh);
722     bb[BBU_Z] = R2F_U(zh);
723 }
724
725 #ifdef NBNXN_SEARCH_SSE
726
727 /* Packed coordinates, bb order xyz0 */
728 static void calc_bounding_box_x_x4_halves(int na,const real *x,
729                                           float *bb,float *bbj)
730 {
731     calc_bounding_box_x_x4(min(na,2),x,bbj);
732
733     if (na > 2)
734     {
735         calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
736     }
737     else
738     {
739         /* Set the "empty" bounding box to the same as the first one,
740          * so we don't need to treat special cases in the rest of the code.
741          */
742         _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
743         _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
744     }
745
746     _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
747                                         _mm_load_ps(bbj+NNBSBB_B)));
748     _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
749                                         _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
750 }
751
752 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
753 static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
754 {
755     int  i,j;
756     real xl,xh,yl,yh,zl,zh;
757
758     i = 0;
759     xl = x[i+XX];
760     xh = x[i+XX];
761     yl = x[i+YY];
762     yh = x[i+YY];
763     zl = x[i+ZZ];
764     zh = x[i+ZZ];
765     i += stride;
766     for(j=1; j<na; j++)
767     {
768         xl = min(xl,x[i+XX]);
769         xh = max(xh,x[i+XX]);
770         yl = min(yl,x[i+YY]);
771         yh = max(yh,x[i+YY]);
772         zl = min(zl,x[i+ZZ]);
773         zh = max(zh,x[i+ZZ]);
774         i += stride;
775     }
776     /* Note: possible double to float conversion here */
777     bb[0*STRIDE_8BB] = R2F_D(xl);
778     bb[1*STRIDE_8BB] = R2F_D(yl);
779     bb[2*STRIDE_8BB] = R2F_D(zl);
780     bb[3*STRIDE_8BB] = R2F_U(xh);
781     bb[4*STRIDE_8BB] = R2F_U(yh);
782     bb[5*STRIDE_8BB] = R2F_U(zh);
783 }
784
785 #endif /* NBNXN_SEARCH_SSE */
786
787 #ifdef NBNXN_SEARCH_SSE_SINGLE
788
789 /* Coordinate order xyz?, bb order xyz0 */
790 static void calc_bounding_box_sse(int na,const float *x,float *bb)
791 {
792     __m128 bb_0_SSE,bb_1_SSE;
793     __m128 x_SSE;
794
795     int  i;
796
797     bb_0_SSE = _mm_load_ps(x);
798     bb_1_SSE = bb_0_SSE;
799
800     for(i=1; i<na; i++)
801     {
802         x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
803         bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
804         bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
805     }
806
807     _mm_store_ps(bb  ,bb_0_SSE);
808     _mm_store_ps(bb+4,bb_1_SSE);
809 }
810
811 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
812 static void calc_bounding_box_xxxx_sse(int na,const float *x,
813                                        float *bb_work,
814                                        real *bb)
815 {
816     calc_bounding_box_sse(na,x,bb_work);
817
818     bb[0*STRIDE_8BB] = bb_work[BBL_X];
819     bb[1*STRIDE_8BB] = bb_work[BBL_Y];
820     bb[2*STRIDE_8BB] = bb_work[BBL_Z];
821     bb[3*STRIDE_8BB] = bb_work[BBU_X];
822     bb[4*STRIDE_8BB] = bb_work[BBU_Y];
823     bb[5*STRIDE_8BB] = bb_work[BBU_Z];
824 }
825
826 #endif /* NBNXN_SEARCH_SSE_SINGLE */
827
828 #ifdef NBNXN_SEARCH_SSE
829
830 /* Combines pairs of consecutive bounding boxes */
831 static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
832 {
833     int    i,j,sc2,nc2,c2;
834     __m128 min_SSE,max_SSE;
835
836     for(i=0; i<grid->ncx*grid->ncy; i++)
837     {
838         /* Starting bb in a column is expected to be 2-aligned */
839         sc2 = grid->cxy_ind[i]>>1;
840         /* For odd numbers skip the last bb here */
841         nc2 = (grid->cxy_na[i]+3)>>(2+1);
842         for(c2=sc2; c2<sc2+nc2; c2++)
843         {
844             min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
845                                  _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
846             max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
847                                  _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
848             _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
849             _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
850         }
851         if (((grid->cxy_na[i]+3)>>2) & 1)
852         {
853             /* Copy the last bb for odd bb count in this column */
854             for(j=0; j<NNBSBB_C; j++)
855             {
856                 grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
857                 grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
858             }
859         }
860     }
861 }
862
863 #endif
864
865
866 /* Prints the average bb size, used for debug output */
867 static void print_bbsizes_simple(FILE *fp,
868                                  const nbnxn_search_t nbs,
869                                  const nbnxn_grid_t *grid)
870 {
871     int  c,d;
872     dvec ba;
873
874     clear_dvec(ba);
875     for(c=0; c<grid->nc; c++)
876     {
877         for(d=0; d<DIM; d++)
878         {
879             ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
880         }
881     }
882     dsvmul(1.0/grid->nc,ba,ba);
883
884     fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
885             nbs->box[XX][XX]/grid->ncx,
886             nbs->box[YY][YY]/grid->ncy,
887             nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
888             ba[XX],ba[YY],ba[ZZ],
889             ba[XX]*grid->ncx/nbs->box[XX][XX],
890             ba[YY]*grid->ncy/nbs->box[YY][YY],
891             ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
892 }
893
894 /* Prints the average bb size, used for debug output */
895 static void print_bbsizes_supersub(FILE *fp,
896                                    const nbnxn_search_t nbs,
897                                    const nbnxn_grid_t *grid)
898 {
899     int  ns,c,s;
900     dvec ba;
901
902     clear_dvec(ba);
903     ns = 0;
904     for(c=0; c<grid->nc; c++)
905     {
906 #ifdef NBNXN_BBXXXX
907         for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
908         {
909             int cs_w,i,d;
910
911             cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
912             for(i=0; i<STRIDE_8BB; i++)
913             {
914                 for(d=0; d<DIM; d++)
915                 {
916                     ba[d] +=
917                         grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
918                         grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
919                 }
920             }
921         }
922 #else
923         for(s=0; s<grid->nsubc[c]; s++)
924         {
925             int cs,d;
926
927             cs = c*GPU_NSUBCELL + s;
928             for(d=0; d<DIM; d++)
929             {
930                 ba[d] +=
931                     grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
932                     grid->bb[cs*NNBSBB_B         +d];
933             }
934         }
935 #endif
936         ns += grid->nsubc[c];
937     }
938     dsvmul(1.0/ns,ba,ba);
939
940     fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
941             nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
942             nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
943             nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
944             ba[XX],ba[YY],ba[ZZ],
945             ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
946             ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
947             ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
948 }
949
950 /* Potentially sorts atoms on LJ coefficients !=0 and ==0.
951  * Also sets interaction flags.
952  */
953 void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
954                 int a0,int a1,const int *atinfo,
955                 int *order,
956                 int *flags)
957 {
958     int subc,s,a,n1,n2,a_lj_max,i,j;
959     int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
960     int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
961     gmx_bool haveQ;
962
963     *flags = 0;
964
965     subc = 0;
966     for(s=a0; s<a1; s+=na_c)
967     {
968         /* Make lists for this (sub-)cell on atoms with and without LJ */
969         n1 = 0;
970         n2 = 0;
971         haveQ = FALSE;
972         a_lj_max = -1;
973         for(a=s; a<min(s+na_c,a1); a++)
974         {
975             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
976
977             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
978             {
979                 sort1[n1++] = order[a];
980                 a_lj_max = a;
981             }
982             else
983             {
984                 sort2[n2++] = order[a];
985             }
986         }
987
988         /* If we don't have atom with LJ, there's nothing to sort */
989         if (n1 > 0)
990         {
991             *flags |= NBNXN_CI_DO_LJ(subc);
992
993             if (2*n1 <= na_c)
994             {
995                 /* Only sort when strictly necessary. Ordering particles
996                  * Ordering particles can lead to less accurate summation
997                  * due to rounding, both for LJ and Coulomb interactions.
998                  */
999                 if (2*(a_lj_max - s) >= na_c)
1000                 {
1001                     for(i=0; i<n1; i++)
1002                     {
1003                         order[a0+i] = sort1[i];
1004                     }
1005                     for(j=0; j<n2; j++)
1006                     {
1007                         order[a0+n1+j] = sort2[j];
1008                     }
1009                 }
1010
1011                 *flags |= NBNXN_CI_HALF_LJ(subc);
1012             }
1013         }
1014         if (haveQ)
1015         {
1016             *flags |= NBNXN_CI_DO_COUL(subc);
1017         }
1018         subc++;
1019     }
1020 }
1021
1022 /* Fill a pair search cell with atoms.
1023  * Potentially sorts atoms and sets the interaction flags.
1024  */
1025 void fill_cell(const nbnxn_search_t nbs,
1026                nbnxn_grid_t *grid,
1027                nbnxn_atomdata_t *nbat,
1028                int a0,int a1,
1029                const int *atinfo,
1030                rvec *x,
1031                int sx,int sy, int sz,
1032                float *bb_work)
1033 {
1034     int    na,a;
1035     size_t offset;
1036     float  *bb_ptr;
1037
1038     na = a1 - a0;
1039
1040     if (grid->bSimple)
1041     {
1042         sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
1043                    grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
1044     }
1045
1046     /* Now we have sorted the atoms, set the cell indices */
1047     for(a=a0; a<a1; a++)
1048     {
1049         nbs->cell[nbs->a[a]] = a;
1050     }
1051
1052     copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
1053                            nbat->XFormat,nbat->x,a0,
1054                            sx,sy,sz);
1055
1056     if (nbat->XFormat == nbatX4)
1057     {
1058         /* Store the bounding boxes as xyz.xyz. */
1059         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1060         bb_ptr = grid->bb + offset;
1061
1062 #if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
1063         if (2*grid->na_cj == grid->na_c)
1064         {
1065             calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
1066                                           grid->bbj+offset*2);
1067         }
1068         else
1069 #endif
1070         {
1071             calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
1072         }
1073     }
1074     else if (nbat->XFormat == nbatX8)
1075     {
1076         /* Store the bounding boxes as xyz.xyz. */
1077         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1078         bb_ptr = grid->bb + offset;
1079
1080         calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
1081     }
1082 #ifdef NBNXN_BBXXXX
1083     else if (!grid->bSimple)
1084     {
1085         /* Store the bounding boxes in a format convenient
1086          * for SSE calculations: xxxxyyyyzzzz...
1087                              */
1088         bb_ptr =
1089             grid->bb +
1090             ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
1091             (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
1092
1093 #ifdef NBNXN_SEARCH_SSE_SINGLE
1094         if (nbat->XFormat == nbatXYZQ)
1095         {
1096             calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
1097                                        bb_work,bb_ptr);
1098         }
1099         else
1100 #endif
1101         {
1102             calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
1103                                    bb_ptr);
1104         }
1105         if (gmx_debug_at)
1106         {
1107             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1108                     sx,sy,sz,
1109                     bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
1110                     bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
1111                     bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
1112         }
1113     }
1114 #endif
1115     else
1116     {
1117         /* Store the bounding boxes as xyz.xyz. */
1118         bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1119
1120         calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
1121                           bb_ptr);
1122
1123         if (gmx_debug_at)
1124         {
1125             int bbo;
1126             bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
1127             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1128                     sx,sy,sz,
1129                     (grid->bb+bbo*NNBSBB_B)[BBL_X],
1130                     (grid->bb+bbo*NNBSBB_B)[BBU_X],
1131                     (grid->bb+bbo*NNBSBB_B)[BBL_Y],
1132                     (grid->bb+bbo*NNBSBB_B)[BBU_Y],
1133                     (grid->bb+bbo*NNBSBB_B)[BBL_Z],
1134                     (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
1135         }
1136     }
1137 }
1138
1139 /* Spatially sort the atoms within one grid column */
1140 static void sort_columns_simple(const nbnxn_search_t nbs,
1141                                 int dd_zone,
1142                                 nbnxn_grid_t *grid,
1143                                 int a0,int a1,
1144                                 const int *atinfo,
1145                                 rvec *x,
1146                                 nbnxn_atomdata_t *nbat,
1147                                 int cxy_start,int cxy_end,
1148                                 int *sort_work)
1149 {
1150     int  cxy;
1151     int  cx,cy,cz,ncz,cfilled,c;
1152     int  na,ash,ind,a;
1153     int  na_c,ash_c;
1154
1155     if (debug)
1156     {
1157         fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1158                 grid->cell0,cxy_start,cxy_end,a0,a1);
1159     }
1160
1161     /* Sort the atoms within each x,y column in 3 dimensions */
1162     for(cxy=cxy_start; cxy<cxy_end; cxy++)
1163     {
1164         cx = cxy/grid->ncy;
1165         cy = cxy - cx*grid->ncy;
1166
1167         na  = grid->cxy_na[cxy];
1168         ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1169         ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1170
1171         /* Sort the atoms within each x,y column on z coordinate */
1172         sort_atoms(ZZ,FALSE,
1173                    nbs->a+ash,na,x,
1174                    grid->c0[ZZ],
1175                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
1176                    ncz*grid->na_sc*SGSF,sort_work);
1177
1178         /* Fill the ncz cells in this column */
1179         cfilled = grid->cxy_ind[cxy];
1180         for(cz=0; cz<ncz; cz++)
1181         {
1182             c  = grid->cxy_ind[cxy] + cz ;
1183
1184             ash_c = ash + cz*grid->na_sc;
1185             na_c  = min(grid->na_sc,na-(ash_c-ash));
1186
1187             fill_cell(nbs,grid,nbat,
1188                       ash_c,ash_c+na_c,atinfo,x,
1189                       grid->na_sc*cx + (dd_zone >> 2),
1190                       grid->na_sc*cy + (dd_zone & 3),
1191                       grid->na_sc*cz,
1192                       NULL);
1193
1194             /* This copy to bbcz is not really necessary.
1195              * But it allows to use the same grid search code
1196              * for the simple and supersub cell setups.
1197              */
1198             if (na_c > 0)
1199             {
1200                 cfilled = c;
1201             }
1202             grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
1203             grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
1204         }
1205
1206         /* Set the unused atom indices to -1 */
1207         for(ind=na; ind<ncz*grid->na_sc; ind++)
1208         {
1209             nbs->a[ash+ind] = -1;
1210         }
1211     }
1212 }
1213
1214 /* Spatially sort the atoms within one grid column */
1215 static void sort_columns_supersub(const nbnxn_search_t nbs,
1216                                   int dd_zone,
1217                                   nbnxn_grid_t *grid,
1218                                   int a0,int a1,
1219                                   const int *atinfo,
1220                                   rvec *x,
1221                                   nbnxn_atomdata_t *nbat,
1222                                   int cxy_start,int cxy_end,
1223                                   int *sort_work)
1224 {
1225     int  cxy;
1226     int  cx,cy,cz=-1,c=-1,ncz;
1227     int  na,ash,na_c,ind,a;
1228     int  subdiv_z,sub_z,na_z,ash_z;
1229     int  subdiv_y,sub_y,na_y,ash_y;
1230     int  subdiv_x,sub_x,na_x,ash_x;
1231
1232     /* cppcheck-suppress unassignedVariable */
1233     float bb_work_array[NNBSBB_B+3],*bb_work_align;
1234
1235     bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
1236
1237     if (debug)
1238     {
1239         fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1240                 grid->cell0,cxy_start,cxy_end,a0,a1);
1241     }
1242
1243     subdiv_x = grid->na_c;
1244     subdiv_y = GPU_NSUBCELL_X*subdiv_x;
1245     subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
1246
1247     /* Sort the atoms within each x,y column in 3 dimensions */
1248     for(cxy=cxy_start; cxy<cxy_end; cxy++)
1249     {
1250         cx = cxy/grid->ncy;
1251         cy = cxy - cx*grid->ncy;
1252
1253         na  = grid->cxy_na[cxy];
1254         ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1255         ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1256
1257         /* Sort the atoms within each x,y column on z coordinate */
1258         sort_atoms(ZZ,FALSE,
1259                    nbs->a+ash,na,x,
1260                    grid->c0[ZZ],
1261                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
1262                    ncz*grid->na_sc*SGSF,sort_work);
1263
1264         /* This loop goes over the supercells and subcells along z at once */
1265         for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
1266         {
1267             ash_z = ash + sub_z*subdiv_z;
1268             na_z  = min(subdiv_z,na-(ash_z-ash));
1269
1270             /* We have already sorted on z */
1271
1272             if (sub_z % GPU_NSUBCELL_Z == 0)
1273             {
1274                 cz = sub_z/GPU_NSUBCELL_Z;
1275                 c  = grid->cxy_ind[cxy] + cz ;
1276
1277                 /* The number of atoms in this supercell */
1278                 na_c = min(grid->na_sc,na-(ash_z-ash));
1279
1280                 grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);
1281
1282                 /* Store the z-boundaries of the super cell */
1283                 grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1284                 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
1285             }
1286
1287 #if GPU_NSUBCELL_Y > 1
1288             /* Sort the atoms along y */
1289             sort_atoms(YY,(sub_z & 1),
1290                        nbs->a+ash_z,na_z,x,
1291                        grid->c0[YY]+cy*grid->sy,grid->inv_sy,
1292                        subdiv_y*SGSF,sort_work);
1293 #endif
1294
1295             for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
1296             {
1297                 ash_y = ash_z + sub_y*subdiv_y;
1298                 na_y  = min(subdiv_y,na-(ash_y-ash));
1299
1300 #if GPU_NSUBCELL_X > 1
1301                 /* Sort the atoms along x */
1302                 sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
1303                            nbs->a+ash_y,na_y,x,
1304                            grid->c0[XX]+cx*grid->sx,grid->inv_sx,
1305                            subdiv_x*SGSF,sort_work);
1306 #endif
1307
1308                 for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
1309                 {
1310                     ash_x = ash_y + sub_x*subdiv_x;
1311                     na_x  = min(subdiv_x,na-(ash_x-ash));
1312
1313                     fill_cell(nbs,grid,nbat,
1314                               ash_x,ash_x+na_x,atinfo,x,
1315                               grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
1316                               grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
1317                               grid->na_c*sub_z,
1318                               bb_work_align);
1319                 }
1320             }
1321         }
1322
1323         /* Set the unused atom indices to -1 */
1324         for(ind=na; ind<ncz*grid->na_sc; ind++)
1325         {
1326             nbs->a[ash+ind] = -1;
1327         }
1328     }
1329 }
1330
1331 /* Determine in which grid column atoms should go */
1332 static void calc_column_indices(nbnxn_grid_t *grid,
1333                                 int a0,int a1,
1334                                 rvec *x,const int *move,
1335                                 int thread,int nthread,
1336                                 int *cell,
1337                                 int *cxy_na)
1338 {
1339     int  n0,n1,i;
1340     int  cx,cy;
1341
1342     /* We add one extra cell for particles which moved during DD */
1343     for(i=0; i<grid->ncx*grid->ncy+1; i++)
1344     {
1345         cxy_na[i] = 0;
1346     }
1347
1348     n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
1349     n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
1350     for(i=n0; i<n1; i++)
1351     {
1352         if (move == NULL || move[i] >= 0)
1353         {
1354             /* We need to be careful with rounding,
1355              * particles might be a few bits outside the local box.
1356              * The int cast takes care of the lower bound,
1357              * we need to explicitly take care of the upper bound.
1358              */
1359             cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1360             if (cx == grid->ncx)
1361             {
1362                 cx = grid->ncx - 1;
1363             }
1364             cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1365             if (cy == grid->ncy)
1366             {
1367                 cy = grid->ncy - 1;
1368             }
1369             /* For the moment cell contains only the, grid local,
1370              * x and y indices, not z.
1371              */
1372             cell[i] = cx*grid->ncy + cy;
1373
1374 #ifdef DEBUG_NBNXN_GRIDDING
1375             if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
1376             {
1377                 gmx_fatal(FARGS,
1378                           "grid cell cx %d cy %d out of range (max %d %d)",
1379                           cx,cy,grid->ncx,grid->ncy);
1380             }
1381 #endif
1382         }
1383         else
1384         {
1385             /* Put this moved particle after the end of the grid,
1386              * so we can process it later without using conditionals.
1387              */
1388             cell[i] = grid->ncx*grid->ncy;
1389         }
1390
1391         cxy_na[cell[i]]++;
1392     }
1393 }
1394
1395 /* Determine in which grid cells the atoms should go */
1396 static void calc_cell_indices(const nbnxn_search_t nbs,
1397                               int dd_zone,
1398                               nbnxn_grid_t *grid,
1399                               int a0,int a1,
1400                               const int *atinfo,
1401                               rvec *x,
1402                               const int *move,
1403                               nbnxn_atomdata_t *nbat)
1404 {
1405     int  n0,n1,i;
1406     int  cx,cy,cxy,ncz_max,ncz;
1407     int  nthread,thread;
1408     int  *cxy_na,cxy_na_i;
1409
1410     nthread = gmx_omp_nthreads_get(emntPairsearch);
1411
1412 #pragma omp parallel for num_threads(nthread) schedule(static)
1413     for(thread=0; thread<nthread; thread++)
1414     {
1415         calc_column_indices(grid,a0,a1,x,move,thread,nthread,
1416                             nbs->cell,nbs->work[thread].cxy_na);
1417     }
1418
1419     /* Make the cell index as a function of x and y */
1420     ncz_max = 0;
1421     ncz = 0;
1422     grid->cxy_ind[0] = 0;
1423     for(i=0; i<grid->ncx*grid->ncy+1; i++)
1424     {
1425         /* We set ncz_max at the beginning of the loop iso at the end
1426          * to skip i=grid->ncx*grid->ncy which are moved particles
1427          * that do not need to be ordered on the grid.
1428          */
1429         if (ncz > ncz_max)
1430         {
1431             ncz_max = ncz;
1432         }
1433         cxy_na_i = nbs->work[0].cxy_na[i];
1434         for(thread=1; thread<nthread; thread++)
1435         {
1436             cxy_na_i += nbs->work[thread].cxy_na[i];
1437         }
1438         ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1439         if (nbat->XFormat == nbatX8)
1440         {
1441             /* Make the number of cell a multiple of 2 */
1442             ncz = (ncz + 1) & ~1;
1443         }
1444         grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1445         /* Clear cxy_na, so we can reuse the array below */
1446         grid->cxy_na[i] = 0;
1447     }
1448     grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1449
1450     nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1451
1452     if (debug)
1453     {
1454         fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1455                 grid->na_sc,grid->na_c,grid->nc,
1456                 grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
1457                 ncz_max);
1458         if (gmx_debug_at)
1459         {
1460             i = 0;
1461             for(cy=0; cy<grid->ncy; cy++)
1462             {
1463                 for(cx=0; cx<grid->ncx; cx++)
1464                 {
1465                     fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1466                     i++;
1467                 }
1468                 fprintf(debug,"\n");
1469             }
1470         }
1471     }
1472
1473     /* Make sure the work array for sorting is large enough */
1474     if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1475     {
1476         for(thread=0; thread<nbs->nthread_max; thread++)
1477         {
1478             nbs->work[thread].sort_work_nalloc =
1479                 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1480             srenew(nbs->work[thread].sort_work,
1481                    nbs->work[thread].sort_work_nalloc);
1482         }
1483     }
1484
1485     /* Now we know the dimensions we can fill the grid.
1486      * This is the first, unsorted fill. We sort the columns after this.
1487      */
1488     for(i=a0; i<a1; i++)
1489     {
1490         /* At this point nbs->cell contains the local grid x,y indices */
1491         cxy = nbs->cell[i];
1492         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1493     }
1494
1495     /* Set the cell indices for the moved particles */
1496     n0 = grid->nc*grid->na_sc;
1497     n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1498     for(i=n0; i<n1; i++)
1499     {
1500         nbs->cell[nbs->a[i]] = i;
1501     }
1502
1503     /* Sort the super-cell columns along z into the sub-cells. */
1504 #pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
1505     for(thread=0; thread<nbs->nthread_max; thread++)
1506     {
1507         if (grid->bSimple)
1508         {
1509             sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
1510                                 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1511                                 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1512                                 nbs->work[thread].sort_work);
1513         }
1514         else
1515         {
1516             sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
1517                                   ((thread+0)*grid->ncx*grid->ncy)/nthread,
1518                                   ((thread+1)*grid->ncx*grid->ncy)/nthread,
1519                                   nbs->work[thread].sort_work);
1520         }
1521     }
1522
1523 #ifdef NBNXN_SEARCH_SSE
1524     if (grid->bSimple && nbat->XFormat == nbatX8)
1525     {
1526         combine_bounding_box_pairs(grid,grid->bb);
1527     }
1528 #endif
1529
1530     if (!grid->bSimple)
1531     {
1532         grid->nsubc_tot = 0;
1533         for(i=0; i<grid->nc; i++)
1534         {
1535             grid->nsubc_tot += grid->nsubc[i];
1536         }
1537     }
1538
1539     if (debug)
1540     {
1541         if (grid->bSimple)
1542         {
1543             print_bbsizes_simple(debug,nbs,grid);
1544         }
1545         else
1546         {
1547             fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
1548                     grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);
1549
1550             print_bbsizes_supersub(debug,nbs,grid);
1551         }
1552     }
1553 }
1554
1555 static void init_grid_flags(nbnxn_cellblock_flags *flags,
1556                             const nbnxn_grid_t *grid)
1557 {
1558     int cb;
1559
1560     flags->ncb = (grid->nc + NBNXN_CELLBLOCK_SIZE - 1)/NBNXN_CELLBLOCK_SIZE;
1561     if (flags->ncb > flags->flag_nalloc)
1562     {
1563         flags->flag_nalloc = over_alloc_large(flags->ncb);
1564         srenew(flags->flag,flags->flag_nalloc);
1565     }
1566     for(cb=0; cb<flags->ncb; cb++)
1567     {
1568         flags->flag[cb] = 0;
1569     }
1570
1571     flags->bUse = TRUE;
1572 }
1573
1574 /* Sets up a grid and puts the atoms on the grid.
1575  * This function only operates on one domain of the domain decompostion.
1576  * Note that without domain decomposition there is only one domain.
1577  */
1578 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1579                        int ePBC,matrix box,
1580                        int dd_zone,
1581                        rvec corner0,rvec corner1,
1582                        int a0,int a1,
1583                        real atom_density,
1584                        const int *atinfo,
1585                        rvec *x,
1586                        int nmoved,int *move,
1587                        int nb_kernel_type,
1588                        nbnxn_atomdata_t *nbat)
1589 {
1590     nbnxn_grid_t *grid;
1591     int n;
1592     int nc_max_grid,nc_max;
1593
1594     grid = &nbs->grid[dd_zone];
1595
1596     nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1597
1598     grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1599
1600     grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
1601     grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
1602     grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
1603     grid->na_c_2log = get_2log(grid->na_c);
1604
1605     nbat->na_c = grid->na_c;
1606
1607     if (dd_zone == 0)
1608     {
1609         grid->cell0 = 0;
1610     }
1611     else
1612     {
1613         grid->cell0 =
1614             (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1615             nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1616     }
1617
1618     n = a1 - a0;
1619
1620     if (dd_zone == 0)
1621     {
1622         nbs->ePBC = ePBC;
1623         copy_mat(box,nbs->box);
1624
1625         if (atom_density >= 0)
1626         {
1627             grid->atom_density = atom_density;
1628         }
1629         else
1630         {
1631             grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
1632         }
1633
1634         grid->cell0 = 0;
1635
1636         nbs->natoms_local    = a1 - nmoved;
1637         /* We assume that nbnxn_put_on_grid is called first
1638          * for the local atoms (dd_zone=0).
1639          */
1640         nbs->natoms_nonlocal = a1 - nmoved;
1641     }
1642     else
1643     {
1644         nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
1645     }
1646
1647     nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
1648                                    nbs->grid[0].atom_density,
1649                                    nbat->XFormat);
1650
1651     nc_max = grid->cell0 + nc_max_grid;
1652
1653     if (a1 > nbs->cell_nalloc)
1654     {
1655         nbs->cell_nalloc = over_alloc_large(a1);
1656         srenew(nbs->cell,nbs->cell_nalloc);
1657     }
1658
1659     /* To avoid conditionals we store the moved particles at the end of a,
1660      * make sure we have enough space.
1661      */
1662     if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1663     {
1664         nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1665         srenew(nbs->a,nbs->a_nalloc);
1666     }
1667
1668     if (nc_max*grid->na_sc > nbat->nalloc)
1669     {
1670         nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc);
1671     }
1672
1673     calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);
1674
1675     if (dd_zone == 0)
1676     {
1677         nbat->natoms_local = nbat->natoms;
1678     }
1679
1680     init_grid_flags(&grid->cellblock_flags,grid);
1681
1682     nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1683 }
1684
1685 /* Calls nbnxn_put_on_grid for all non-local domains */
1686 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1687                                 const gmx_domdec_zones_t *zones,
1688                                 const int *atinfo,
1689                                 rvec *x,
1690                                 int nb_kernel_type,
1691                                 nbnxn_atomdata_t *nbat)
1692 {
1693     int  zone,d;
1694     rvec c0,c1;
1695
1696     for(zone=1; zone<zones->n; zone++)
1697     {
1698         for(d=0; d<DIM; d++)
1699         {
1700             c0[d] = zones->size[zone].bb_x0[d];
1701             c1[d] = zones->size[zone].bb_x1[d];
1702         }
1703
1704         nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
1705                           zone,c0,c1,
1706                           zones->cg_range[zone],
1707                           zones->cg_range[zone+1],
1708                           -1,
1709                           atinfo,
1710                           x,
1711                           0,NULL,
1712                           nb_kernel_type,
1713                           nbat);
1714     }
1715 }
1716
1717 /* Add simple grid type information to the local super/sub grid */
1718 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1719                            nbnxn_atomdata_t *nbat)
1720 {
1721     nbnxn_grid_t *grid;
1722     float *bbcz,*bb;
1723     int ncd,sc;
1724
1725     grid = &nbs->grid[0];
1726
1727     if (grid->bSimple)
1728     {
1729         gmx_incons("nbnxn_grid_simple called with a simple grid");
1730     }
1731
1732     ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1733
1734     if (grid->nc*ncd > grid->nc_nalloc_simple)
1735     {
1736         grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1737         srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
1738         srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
1739         srenew(grid->flags_simple,grid->nc_nalloc_simple);
1740         if (nbat->XFormat)
1741         {
1742             sfree_aligned(grid->bbj);
1743             snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
1744         }
1745     }
1746
1747     bbcz = grid->bbcz_simple;
1748     bb   = grid->bb_simple;
1749
1750 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
1751     for(sc=0; sc<grid->nc; sc++)
1752     {
1753         int c,tx,na;
1754
1755         for(c=0; c<ncd; c++)
1756         {
1757             tx = sc*ncd + c;
1758
1759             na = NBNXN_CPU_CLUSTER_I_SIZE;
1760             while (na > 0 &&
1761                    nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1762             {
1763                 na--;
1764             }
1765
1766             if (na > 0)
1767             {
1768                 switch (nbat->XFormat)
1769                 {
1770                 case nbatX4:
1771                     /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1772                     calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
1773                                            bb+tx*NNBSBB_B);
1774                     break;
1775                 case nbatX8:
1776                     /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1777                     calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1778                                            bb+tx*NNBSBB_B);
1779                     break;
1780                 default:
1781                     calc_bounding_box(na,nbat->xstride,
1782                                       nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1783                                       bb+tx*NNBSBB_B);
1784                     break;
1785                 }
1786                 bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
1787                 bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
1788
1789                 /* No interaction optimization yet here */
1790                 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1791             }
1792             else
1793             {
1794                 grid->flags_simple[tx] = 0;
1795             }
1796         }
1797     }
1798
1799 #ifdef NBNXN_SEARCH_SSE
1800     if (grid->bSimple && nbat->XFormat == nbatX8)
1801     {
1802         combine_bounding_box_pairs(grid,grid->bb_simple);
1803     }
1804 #endif
1805 }
1806
1807 void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
1808 {
1809     *ncx = nbs->grid[0].ncx;
1810     *ncy = nbs->grid[0].ncy;
1811 }
1812
1813 void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
1814 {
1815     const nbnxn_grid_t *grid;
1816
1817     grid = &nbs->grid[0];
1818
1819     /* Return the atom order for the home cell (index 0) */
1820     *a  = nbs->a;
1821
1822     *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1823 }
1824
1825 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1826 {
1827     nbnxn_grid_t *grid;
1828     int ao,cx,cy,cxy,cz,j;
1829
1830     /* Set the atom order for the home cell (index 0) */
1831     grid = &nbs->grid[0];
1832
1833     ao = 0;
1834     for(cx=0; cx<grid->ncx; cx++)
1835     {
1836         for(cy=0; cy<grid->ncy; cy++)
1837         {
1838             cxy = cx*grid->ncy + cy;
1839             j   = grid->cxy_ind[cxy]*grid->na_sc;
1840             for(cz=0; cz<grid->cxy_na[cxy]; cz++)
1841             {
1842                 nbs->a[j]     = ao;
1843                 nbs->cell[ao] = j;
1844                 ao++;
1845                 j++;
1846             }
1847         }
1848     }
1849 }
1850
1851 /* Determines the cell range along one dimension that
1852  * the bounding box b0 - b1 sees.
1853  */
1854 static void get_cell_range(real b0,real b1,
1855                            int nc,real c0,real s,real invs,
1856                            real d2,real r2,int *cf,int *cl)
1857 {
1858     *cf = max((int)((b0 - c0)*invs),0);
1859
1860     while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
1861     {
1862         (*cf)--;
1863     }
1864
1865     *cl = min((int)((b1 - c0)*invs),nc-1);
1866     while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
1867     {
1868         (*cl)++;
1869     }
1870 }
1871
1872 /* Reference code calculating the distance^2 between two bounding boxes */
1873 static float box_dist2(float bx0,float bx1,float by0,
1874                        float by1,float bz0,float bz1,
1875                        const float *bb)
1876 {
1877     float d2;
1878     float dl,dh,dm,dm0;
1879
1880     d2 = 0;
1881
1882     dl  = bx0 - bb[BBU_X];
1883     dh  = bb[BBL_X] - bx1;
1884     dm  = max(dl,dh);
1885     dm0 = max(dm,0);
1886     d2 += dm0*dm0;
1887
1888     dl  = by0 - bb[BBU_Y];
1889     dh  = bb[BBL_Y] - by1;
1890     dm  = max(dl,dh);
1891     dm0 = max(dm,0);
1892     d2 += dm0*dm0;
1893
1894     dl  = bz0 - bb[BBU_Z];
1895     dh  = bb[BBL_Z] - bz1;
1896     dm  = max(dl,dh);
1897     dm0 = max(dm,0);
1898     d2 += dm0*dm0;
1899
1900     return d2;
1901 }
1902
1903 /* Plain C code calculating the distance^2 between two bounding boxes */
1904 static float subc_bb_dist2(int si,const float *bb_i_ci,
1905                            int csj,const float *bb_j_all)
1906 {
1907     const float *bb_i,*bb_j;
1908     float d2;
1909     float dl,dh,dm,dm0;
1910
1911     bb_i = bb_i_ci  +  si*NNBSBB_B;
1912     bb_j = bb_j_all + csj*NNBSBB_B;
1913
1914     d2 = 0;
1915
1916     dl  = bb_i[BBL_X] - bb_j[BBU_X];
1917     dh  = bb_j[BBL_X] - bb_i[BBU_X];
1918     dm  = max(dl,dh);
1919     dm0 = max(dm,0);
1920     d2 += dm0*dm0;
1921
1922     dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
1923     dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
1924     dm  = max(dl,dh);
1925     dm0 = max(dm,0);
1926     d2 += dm0*dm0;
1927
1928     dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
1929     dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
1930     dm  = max(dl,dh);
1931     dm0 = max(dm,0);
1932     d2 += dm0*dm0;
1933
1934     return d2;
1935 }
1936
1937 #ifdef NBNXN_SEARCH_SSE
1938
1939 /* SSE code for bb distance for bb format xyz0 */
1940 static float subc_bb_dist2_sse(int na_c,
1941                               int si,const float *bb_i_ci,
1942                               int csj,const float *bb_j_all)
1943 {
1944     const float *bb_i,*bb_j;
1945
1946     __m128 bb_i_SSE0,bb_i_SSE1;
1947     __m128 bb_j_SSE0,bb_j_SSE1;
1948     __m128 dl_SSE;
1949     __m128 dh_SSE;
1950     __m128 dm_SSE;
1951     __m128 dm0_SSE;
1952     __m128 d2_SSE;
1953 #ifndef GMX_X86_SSE4_1
1954     float d2_array[7],*d2_align;
1955
1956     d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
1957 #else
1958     float d2;
1959 #endif
1960
1961     bb_i = bb_i_ci  +  si*NNBSBB_B;
1962     bb_j = bb_j_all + csj*NNBSBB_B;
1963
1964     bb_i_SSE0 = _mm_load_ps(bb_i);
1965     bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
1966     bb_j_SSE0 = _mm_load_ps(bb_j);
1967     bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
1968
1969     dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
1970     dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
1971
1972     dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
1973     dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
1974 #ifndef GMX_X86_SSE4_1
1975     d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
1976
1977     _mm_store_ps(d2_align,d2_SSE);
1978
1979     return d2_align[0] + d2_align[1] + d2_align[2];
1980 #else
1981     /* SSE4.1 dot product of components 0,1,2 */
1982     d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
1983
1984     _mm_store_ss(&d2,d2_SSE);
1985
1986     return d2;
1987 #endif
1988 }
1989
1990 /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
1991 #define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
1992 {                                                \
1993     int    shi;                                  \
1994                                                  \
1995     __m128 dx_0,dy_0,dz_0;                       \
1996     __m128 dx_1,dy_1,dz_1;                       \
1997                                                  \
1998     __m128 mx,my,mz;                             \
1999     __m128 m0x,m0y,m0z;                          \
2000                                                  \
2001     __m128 d2x,d2y,d2z;                          \
2002     __m128 d2s,d2t;                              \
2003                                                  \
2004     shi = si*NNBSBB_D*DIM;                       \
2005                                                  \
2006     xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
2007     yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
2008     zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
2009     xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
2010     yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
2011     zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
2012                                                  \
2013     dx_0 = _mm_sub_ps(xi_l,xj_h);                \
2014     dy_0 = _mm_sub_ps(yi_l,yj_h);                \
2015     dz_0 = _mm_sub_ps(zi_l,zj_h);                \
2016                                                  \
2017     dx_1 = _mm_sub_ps(xj_l,xi_h);                \
2018     dy_1 = _mm_sub_ps(yj_l,yi_h);                \
2019     dz_1 = _mm_sub_ps(zj_l,zi_h);                \
2020                                                  \
2021     mx   = _mm_max_ps(dx_0,dx_1);                \
2022     my   = _mm_max_ps(dy_0,dy_1);                \
2023     mz   = _mm_max_ps(dz_0,dz_1);                \
2024                                                  \
2025     m0x  = _mm_max_ps(mx,zero);                  \
2026     m0y  = _mm_max_ps(my,zero);                  \
2027     m0z  = _mm_max_ps(mz,zero);                  \
2028                                                  \
2029     d2x  = _mm_mul_ps(m0x,m0x);                  \
2030     d2y  = _mm_mul_ps(m0y,m0y);                  \
2031     d2z  = _mm_mul_ps(m0z,m0z);                  \
2032                                                  \
2033     d2s  = _mm_add_ps(d2x,d2y);                  \
2034     d2t  = _mm_add_ps(d2s,d2z);                  \
2035                                                  \
2036     _mm_store_ps(d2+si,d2t);                     \
2037 }
2038
2039 /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
2040 static void subc_bb_dist2_sse_xxxx(const float *bb_j,
2041                                    int nsi,const float *bb_i,
2042                                    float *d2)
2043 {
2044     __m128 xj_l,yj_l,zj_l;
2045     __m128 xj_h,yj_h,zj_h;
2046     __m128 xi_l,yi_l,zi_l;
2047     __m128 xi_h,yi_h,zi_h;
2048
2049     __m128 zero;
2050
2051     zero = _mm_setzero_ps();
2052
2053     xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
2054     yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
2055     zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
2056     xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
2057     yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
2058     zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
2059
2060     /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
2061      * But as we know the number of iterations is 1 or 2, we unroll manually.
2062      */
2063     SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
2064     if (STRIDE_8BB < nsi)
2065     {
2066         SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
2067     }
2068 }
2069
2070 #endif /* NBNXN_SEARCH_SSE */
2071
2072 /* Plain C function which determines if any atom pair between two cells
2073  * is within distance sqrt(rl2).
2074  */
2075 static gmx_bool subc_in_range_x(int na_c,
2076                                 int si,const real *x_i,
2077                                 int csj,int stride,const real *x_j,
2078                                 real rl2)
2079 {
2080     int  i,j,i0,j0;
2081     real d2;
2082
2083     for(i=0; i<na_c; i++)
2084     {
2085         i0 = (si*na_c + i)*DIM;
2086         for(j=0; j<na_c; j++)
2087         {
2088             j0 = (csj*na_c + j)*stride;
2089
2090             d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
2091                  sqr(x_i[i0+1] - x_j[j0+1]) +
2092                  sqr(x_i[i0+2] - x_j[j0+2]);
2093
2094             if (d2 < rl2)
2095             {
2096                 return TRUE;
2097             }
2098         }
2099     }
2100
2101     return FALSE;
2102 }
2103
2104 /* SSE function which determines if any atom pair between two cells,
2105  * both with 8 atoms, is within distance sqrt(rl2).
2106  */
2107 static gmx_bool subc_in_range_sse8(int na_c,
2108                                    int si,const real *x_i,
2109                                    int csj,int stride,const real *x_j,
2110                                    real rl2)
2111 {
2112 #ifdef NBNXN_SEARCH_SSE_SINGLE
2113     __m128 ix_SSE0,iy_SSE0,iz_SSE0;
2114     __m128 ix_SSE1,iy_SSE1,iz_SSE1;
2115
2116     __m128 rc2_SSE;
2117
2118     int na_c_sse;
2119     int j0,j1;
2120
2121     rc2_SSE   = _mm_set1_ps(rl2);
2122
2123     na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
2124     ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
2125     iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
2126     iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
2127     ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
2128     iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
2129     iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
2130
2131     /* We loop from the outer to the inner particles to maximize
2132      * the chance that we find a pair in range quickly and return.
2133      */
2134     j0 = csj*na_c;
2135     j1 = j0 + na_c - 1;
2136     while (j0 < j1)
2137     {
2138         __m128 jx0_SSE,jy0_SSE,jz0_SSE;
2139         __m128 jx1_SSE,jy1_SSE,jz1_SSE;
2140
2141         __m128 dx_SSE0,dy_SSE0,dz_SSE0;
2142         __m128 dx_SSE1,dy_SSE1,dz_SSE1;
2143         __m128 dx_SSE2,dy_SSE2,dz_SSE2;
2144         __m128 dx_SSE3,dy_SSE3,dz_SSE3;
2145
2146         __m128 rsq_SSE0;
2147         __m128 rsq_SSE1;
2148         __m128 rsq_SSE2;
2149         __m128 rsq_SSE3;
2150
2151         __m128 wco_SSE0;
2152         __m128 wco_SSE1;
2153         __m128 wco_SSE2;
2154         __m128 wco_SSE3;
2155         __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
2156
2157         jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
2158         jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
2159         jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
2160
2161         jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
2162         jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
2163         jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
2164
2165         /* Calculate distance */
2166         dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
2167         dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
2168         dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
2169         dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
2170         dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
2171         dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
2172         dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
2173         dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
2174         dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
2175         dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
2176         dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
2177         dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
2178
2179         /* rsq = dx*dx+dy*dy+dz*dz */
2180         rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2181         rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2182         rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2183         rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2184
2185         wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
2186         wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
2187         wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
2188         wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
2189
2190         wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
2191         wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
2192         wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
2193
2194         if (_mm_movemask_ps(wco_any_SSE))
2195         {
2196             return TRUE;
2197         }
2198
2199         j0++;
2200         j1--;
2201     }
2202     return FALSE;
2203
2204 #else
2205     /* No SSE */
2206     gmx_incons("SSE function called without SSE support");
2207
2208     return TRUE;
2209 #endif
2210 }
2211
2212 /* Returns the j sub-cell for index cj_ind */
2213 static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
2214 {
2215     return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
2216 }
2217
2218 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2219 static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
2220 {
2221     return nbl->cj4[cj_ind>>2].imei[0].imask;
2222 }
2223
2224 /* Ensures there is enough space for extra extra exclusion masks */
2225 static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
2226 {
2227     if (nbl->nexcl+extra > nbl->excl_nalloc)
2228     {
2229         nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
2230         nbnxn_realloc_void((void **)&nbl->excl,
2231                            nbl->nexcl*sizeof(*nbl->excl),
2232                            nbl->excl_nalloc*sizeof(*nbl->excl),
2233                            nbl->alloc,nbl->free);
2234     }
2235 }
2236
2237 /* Ensures there is enough space for ncell extra j-cells in the list */
2238 static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
2239                                             int ncell)
2240 {
2241     int cj_max;
2242
2243     cj_max = nbl->ncj + ncell;
2244
2245     if (cj_max > nbl->cj_nalloc)
2246     {
2247         nbl->cj_nalloc = over_alloc_small(cj_max);
2248         nbnxn_realloc_void((void **)&nbl->cj,
2249                            nbl->ncj*sizeof(*nbl->cj),
2250                            nbl->cj_nalloc*sizeof(*nbl->cj),
2251                            nbl->alloc,nbl->free);
2252     }
2253 }
2254
2255 /* Ensures there is enough space for ncell extra j-subcells in the list */
2256 static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
2257                                               int nsupercell)
2258 {
2259     int ncj4_max,j4,j,w,t;
2260
2261 #define NWARP       2
2262 #define WARP_SIZE  32
2263
2264     /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
2265     /* We can store 4 j-subcell - i-supercell pairs in one struct.
2266      * since we round down, we need one extra entry.
2267      */
2268     ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
2269
2270     if (ncj4_max > nbl->cj4_nalloc)
2271     {
2272         nbl->cj4_nalloc = over_alloc_small(ncj4_max);
2273         nbnxn_realloc_void((void **)&nbl->cj4,
2274                            nbl->work->cj4_init*sizeof(*nbl->cj4),
2275                            nbl->cj4_nalloc*sizeof(*nbl->cj4),
2276                            nbl->alloc,nbl->free);
2277     }
2278
2279     if (ncj4_max > nbl->work->cj4_init)
2280     {
2281         for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
2282         {
2283             /* No i-subcells and no excl's in the list initially */
2284             for(w=0; w<NWARP; w++)
2285             {
2286                 nbl->cj4[j4].imei[w].imask    = 0U;
2287                 nbl->cj4[j4].imei[w].excl_ind = 0;
2288
2289             }
2290         }
2291         nbl->work->cj4_init = ncj4_max;
2292     }
2293 }
2294
2295 /* Set all excl masks for one GPU warp no exclusions */
2296 static void set_no_excls(nbnxn_excl_t *excl)
2297 {
2298     int t;
2299
2300     for(t=0; t<WARP_SIZE; t++)
2301     {
2302         /* Turn all interaction bits on */
2303         excl->pair[t] = NBNXN_INT_MASK_ALL;
2304     }
2305 }
2306
2307 /* Initializes a single nbnxn_pairlist_t data structure */
2308 static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
2309                                 gmx_bool bSimple,
2310                                 nbnxn_alloc_t *alloc,
2311                                 nbnxn_free_t  *free)
2312 {
2313     if (alloc == NULL)
2314     {
2315         nbl->alloc = nbnxn_alloc_aligned;
2316     }
2317     else
2318     {
2319         nbl->alloc = alloc;
2320     }
2321     if (free == NULL)
2322     {
2323         nbl->free = nbnxn_free_aligned;
2324     }
2325     else
2326     {
2327         nbl->free = free;
2328     }
2329
2330     nbl->bSimple     = bSimple;
2331     nbl->na_sc       = 0;
2332     nbl->na_ci       = 0;
2333     nbl->na_cj       = 0;
2334     nbl->nci         = 0;
2335     nbl->ci          = NULL;
2336     nbl->ci_nalloc   = 0;
2337     nbl->ncj         = 0;
2338     nbl->cj          = NULL;
2339     nbl->cj_nalloc   = 0;
2340     nbl->ncj4        = 0;
2341     /* We need one element extra in sj, so alloc initially with 1 */
2342     nbl->cj4_nalloc  = 0;
2343     nbl->cj4         = NULL;
2344     nbl->nci_tot     = 0;
2345
2346     if (!nbl->bSimple)
2347     {
2348         nbl->excl        = NULL;
2349         nbl->excl_nalloc = 0;
2350         nbl->nexcl       = 0;
2351         check_excl_space(nbl,1);
2352         nbl->nexcl       = 1;
2353         set_no_excls(&nbl->excl[0]);
2354     }
2355
2356     snew(nbl->work,1);
2357 #ifdef NBNXN_BBXXXX
2358     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
2359 #else
2360     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
2361 #endif
2362     snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
2363 #ifdef NBNXN_SEARCH_SSE
2364     snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
2365 #ifdef GMX_X86_AVX_256
2366     snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
2367 #endif
2368 #endif
2369     snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
2370 }
2371
2372 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
2373                              gmx_bool bSimple, gmx_bool bCombined,
2374                              nbnxn_alloc_t *alloc,
2375                              nbnxn_free_t  *free)
2376 {
2377     int i;
2378
2379     nbl_list->bSimple   = bSimple;
2380     nbl_list->bCombined = bCombined;
2381
2382     nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
2383
2384     if (!nbl_list->bCombined &&
2385         nbl_list->nnbl > NBNXN_CELLBLOCK_MAX_THREADS)
2386     {
2387         gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
2388                   nbl_list->nnbl,NBNXN_CELLBLOCK_MAX_THREADS,NBNXN_CELLBLOCK_MAX_THREADS);
2389     }
2390
2391     snew(nbl_list->nbl,nbl_list->nnbl);
2392     /* Execute in order to avoid memory interleaving between threads */
2393 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2394     for(i=0; i<nbl_list->nnbl; i++)
2395     {
2396         /* Allocate the nblist data structure locally on each thread
2397          * to optimize memory access for NUMA architectures.
2398          */
2399         snew(nbl_list->nbl[i],1);
2400
2401         /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2402         if (i == 0)
2403         {
2404             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
2405         }
2406         else
2407         {
2408             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
2409         }
2410     }
2411 }
2412
2413 /* Print statistics of a pair list, used for debug output */
2414 static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
2415                                            const nbnxn_search_t nbs,real rl)
2416 {
2417     const nbnxn_grid_t *grid;
2418     int cs[SHIFTS];
2419     int s,i,j;
2420     int npexcl;
2421
2422     /* This code only produces correct statistics with domain decomposition */
2423     grid = &nbs->grid[0];
2424
2425     fprintf(fp,"nbl nci %d ncj %d\n",
2426             nbl->nci,nbl->ncj);
2427     fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2428             nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
2429             nbl->ncj/(double)grid->nc*grid->na_sc,
2430             nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
2431
2432     fprintf(fp,"nbl average j cell list length %.1f\n",
2433             0.25*nbl->ncj/(double)nbl->nci);
2434
2435     for(s=0; s<SHIFTS; s++)
2436     {
2437         cs[s] = 0;
2438     }
2439     npexcl = 0;
2440     for(i=0; i<nbl->nci; i++)
2441     {
2442         cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
2443             nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
2444
2445         j = nbl->ci[i].cj_ind_start;
2446         while (j < nbl->ci[i].cj_ind_end &&
2447                nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
2448         {
2449             npexcl++;
2450             j++;
2451         }
2452     }
2453     fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
2454             nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
2455     for(s=0; s<SHIFTS; s++)
2456     {
2457         if (cs[s] > 0)
2458         {
2459             fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
2460         }
2461     }
2462 }
2463
2464 /* Print statistics of a pair lists, used for debug output */
2465 static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
2466                                              const nbnxn_search_t nbs,real rl)
2467 {
2468     const nbnxn_grid_t *grid;
2469     int i,j4,j,si,b;
2470     int c[GPU_NSUBCELL+1];
2471
2472     /* This code only produces correct statistics with domain decomposition */
2473     grid = &nbs->grid[0];
2474
2475     fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2476             nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
2477     fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2478             nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
2479             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
2480             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
2481
2482     fprintf(fp,"nbl average j super cell list length %.1f\n",
2483             0.25*nbl->ncj4/(double)nbl->nsci);
2484     fprintf(fp,"nbl average i sub cell list length %.1f\n",
2485             nbl->nci_tot/(0.25*nbl->ncj4));
2486
2487     for(si=0; si<=GPU_NSUBCELL; si++)
2488     {
2489         c[si] = 0;
2490     }
2491     for(i=0; i<nbl->nsci; i++)
2492     {
2493         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2494         {
2495             for(j=0; j<4; j++)
2496             {
2497                 b = 0;
2498                 for(si=0; si<GPU_NSUBCELL; si++)
2499                 {
2500                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
2501                     {
2502                         b++;
2503                     }
2504                 }
2505                 c[b]++;
2506             }
2507         }
2508     }
2509     for(b=0; b<=GPU_NSUBCELL; b++)
2510     {
2511         fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
2512                 b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
2513     }
2514 }
2515
2516 /* Print the full pair list, used for debug output */
2517 static void print_supersub_nsp(const char *fn,
2518                                const nbnxn_pairlist_t *nbl,
2519                                int iloc)
2520 {
2521     char buf[STRLEN];
2522     FILE *fp;
2523     int i,nsp,j4,p;
2524
2525     sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
2526     fp = ffopen(buf,"w");
2527
2528     for(i=0; i<nbl->nci; i++)
2529     {
2530         nsp = 0;
2531         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2532         {
2533             for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
2534             {
2535                 nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
2536             }
2537         }
2538         fprintf(fp,"%4d %3d %3d\n",
2539                 i,
2540                 nsp,
2541                 nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
2542     }
2543
2544     fclose(fp);
2545 }
2546
2547 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
2548 static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
2549                                    int warp,nbnxn_excl_t **excl)
2550 {
2551     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2552     {
2553         /* No exclusions set, make a new list entry */
2554         nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
2555         nbl->nexcl++;
2556         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2557         set_no_excls(*excl);
2558     }
2559     else
2560     {
2561         /* We already have some exclusions, new ones can be added to the list */
2562         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2563     }
2564 }
2565
2566 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
2567  * allocates extra memory, if necessary.
2568  */
2569 static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
2570                                  int warp,nbnxn_excl_t **excl)
2571 {
2572     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2573     {
2574         /* We need to make a new list entry, check if we have space */
2575         check_excl_space(nbl,1);
2576     }
2577     low_get_nbl_exclusions(nbl,cj4,warp,excl);
2578 }
2579
2580 /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
2581  * allocates extra memory, if necessary.
2582  */
2583 static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
2584                                  nbnxn_excl_t **excl_w0,
2585                                  nbnxn_excl_t **excl_w1)
2586 {
2587     /* Check for space we might need */
2588     check_excl_space(nbl,2);
2589
2590     low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
2591     low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
2592 }
2593
2594 /* Sets the self exclusions i=j and pair exclusions i>j */
2595 static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
2596                                                int cj4_ind,int sj_offset,
2597                                                int si)
2598 {
2599     nbnxn_excl_t *excl[2];
2600     int  ei,ej,w;
2601
2602     /* Here we only set the set self and double pair exclusions */
2603
2604     get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
2605
2606     /* Only minor < major bits set */
2607     for(ej=0; ej<nbl->na_ci; ej++)
2608     {
2609         w = (ej>>2);
2610         for(ei=ej; ei<nbl->na_ci; ei++)
2611         {
2612             excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
2613                 ~(1U << (sj_offset*GPU_NSUBCELL+si));
2614         }
2615     }
2616 }
2617
2618 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
2619 static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
2620 {
2621     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2622 }
2623
2624 #ifdef NBNXN_SEARCH_SSE
2625 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
2626 static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
2627 {
2628 #ifndef GMX_DOUBLE /* cj-size = 4 */
2629     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2630 #else              /* cj-size = 2 */
2631     return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
2632             (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
2633              NBNXN_INT_MASK_ALL));
2634 #endif
2635 }
2636
2637 #ifdef GMX_X86_AVX_256
2638 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
2639 static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
2640 {
2641 #ifndef GMX_DOUBLE /* cj-size = 8 */
2642     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
2643             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
2644              NBNXN_INT_MASK_ALL));
2645 #else              /* cj-size = 2 */
2646     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2647 #endif
2648 }
2649 #endif
2650 #endif /* NBNXN_SEARCH_SSE */
2651
2652 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2653  * Checks bounding box distances and possibly atom pair distances.
2654  */
2655 static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2656                                      nbnxn_pairlist_t *nbl,
2657                                      int ci,int cjf,int cjl,
2658                                      gmx_bool remove_sub_diag,
2659                                      const real *x_j,
2660                                      real rl2,float rbb2,
2661                                      int *ndistc)
2662 {
2663     const nbnxn_list_work_t *work;
2664
2665     const float *bb_ci;
2666     const real  *x_ci;
2667
2668     gmx_bool   InRange;
2669     real       d2;
2670     int        cjf_gl,cjl_gl,cj;
2671
2672     work = nbl->work;
2673
2674     bb_ci = nbl->work->bb_ci;
2675     x_ci  = nbl->work->x_ci;
2676
2677     InRange = FALSE;
2678     while (!InRange && cjf <= cjl)
2679     {
2680         d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
2681         *ndistc += 2;
2682
2683         /* Check if the distance is within the distance where
2684          * we use only the bounding box distance rbb,
2685          * or within the cut-off and there is at least one atom pair
2686          * within the cut-off.
2687          */
2688         if (d2 < rbb2)
2689         {
2690             InRange = TRUE;
2691         }
2692         else if (d2 < rl2)
2693         {
2694             int i,j;
2695
2696             cjf_gl = gridj->cell0 + cjf;
2697             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2698             {
2699                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2700                 {
2701                     InRange = InRange ||
2702                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2703                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2704                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2705                 }
2706             }
2707             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2708         }
2709         if (!InRange)
2710         {
2711             cjf++;
2712         }
2713     }
2714     if (!InRange)
2715     {
2716         return;
2717     }
2718
2719     InRange = FALSE;
2720     while (!InRange && cjl > cjf)
2721     {
2722         d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
2723         *ndistc += 2;
2724
2725         /* Check if the distance is within the distance where
2726          * we use only the bounding box distance rbb,
2727          * or within the cut-off and there is at least one atom pair
2728          * within the cut-off.
2729          */
2730         if (d2 < rbb2)
2731         {
2732             InRange = TRUE;
2733         }
2734         else if (d2 < rl2)
2735         {
2736             int i,j;
2737
2738             cjl_gl = gridj->cell0 + cjl;
2739             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2740             {
2741                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2742                 {
2743                     InRange = InRange ||
2744                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2745                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2746                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2747                 }
2748             }
2749             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2750         }
2751         if (!InRange)
2752         {
2753             cjl--;
2754         }
2755     }
2756
2757     if (cjf <= cjl)
2758     {
2759         for(cj=cjf; cj<=cjl; cj++)
2760         {
2761             /* Store cj and the interaction mask */
2762             nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
2763             nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
2764             nbl->ncj++;
2765         }
2766         /* Increase the closing index in i super-cell list */
2767         nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
2768     }
2769 }
2770
2771 #ifdef NBNXN_SEARCH_SSE
2772 /* Include make_cluster_list_x86_simd128/256 */
2773 #define GMX_MM128_HERE
2774 #include "gmx_x86_simd_macros.h"
2775 #define STRIDE_S  PACK_X4
2776 #include "nbnxn_search_x86_simd.h"
2777 #undef STRIDE_S
2778 #undef GMX_MM128_HERE
2779 #ifdef GMX_X86_AVX_256
2780 /* Include make_cluster_list_x86_simd128/256 */
2781 #define GMX_MM256_HERE
2782 #include "gmx_x86_simd_macros.h"
2783 #define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
2784 #include "nbnxn_search_x86_simd.h"
2785 #undef STRIDE_S
2786 #undef GMX_MM256_HERE
2787 #endif
2788 #endif
2789
2790 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
2791  * Checks bounding box distances and possibly atom pair distances.
2792  */
2793 static void make_cluster_list_supersub(const nbnxn_search_t nbs,
2794                                        const nbnxn_grid_t *gridi,
2795                                        const nbnxn_grid_t *gridj,
2796                                        nbnxn_pairlist_t *nbl,
2797                                        int sci,int scj,
2798                                        gmx_bool sci_equals_scj,
2799                                        int stride,const real *x,
2800                                        real rl2,float rbb2,
2801                                        int *ndistc)
2802 {
2803     int  na_c;
2804     int  npair;
2805     int  cjo,ci1,ci,cj,cj_gl;
2806     int  cj4_ind,cj_offset;
2807     unsigned imask;
2808     nbnxn_cj4_t *cj4;
2809     const float *bb_ci;
2810     const real *x_ci;
2811     float *d2l,d2;
2812     int  w;
2813 #define PRUNE_LIST_CPU_ONE
2814 #ifdef PRUNE_LIST_CPU_ONE
2815     int  ci_last=-1;
2816 #endif
2817
2818     d2l = nbl->work->d2;
2819
2820     bb_ci = nbl->work->bb_ci;
2821     x_ci  = nbl->work->x_ci;
2822
2823     na_c = gridj->na_c;
2824
2825     for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
2826     {
2827         cj4_ind   = (nbl->work->cj_ind >> 2);
2828         cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
2829         cj4       = &nbl->cj4[cj4_ind];
2830
2831         cj = scj*GPU_NSUBCELL + cjo;
2832
2833         cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
2834
2835         /* Initialize this j-subcell i-subcell list */
2836         cj4->cj[cj_offset] = cj_gl;
2837         imask              = 0;
2838
2839         if (sci_equals_scj)
2840         {
2841             ci1 = cjo + 1;
2842         }
2843         else
2844         {
2845             ci1 = gridi->nsubc[sci];
2846         }
2847
2848 #ifdef NBNXN_BBXXXX
2849         /* Determine all ci1 bb distances in one call with SSE */
2850         subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
2851                                ci1,bb_ci,d2l);
2852         *ndistc += na_c*2;
2853 #endif
2854
2855         npair = 0;
2856         /* We use a fixed upper-bound instead of ci1 to help optimization */
2857         for(ci=0; ci<GPU_NSUBCELL; ci++)
2858         {
2859             if (ci == ci1)
2860             {
2861                 break;
2862             }
2863
2864 #ifndef NBNXN_BBXXXX
2865             /* Determine the bb distance between ci and cj */
2866             d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
2867             *ndistc += 2;
2868 #endif
2869             d2 = d2l[ci];
2870
2871 #ifdef PRUNE_LIST_CPU_ALL
2872             /* Check if the distance is within the distance where
2873              * we use only the bounding box distance rbb,
2874              * or within the cut-off and there is at least one atom pair
2875              * within the cut-off. This check is very costly.
2876              */
2877             *ndistc += na_c*na_c;
2878             if (d2 < rbb2 ||
2879                 (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
2880 #else
2881             /* Check if the distance between the two bounding boxes
2882              * in within the pair-list cut-off.
2883              */
2884             if (d2 < rl2)
2885 #endif
2886             {
2887                 /* Flag this i-subcell to be taken into account */
2888                 imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
2889
2890 #ifdef PRUNE_LIST_CPU_ONE
2891                 ci_last = ci;
2892 #endif
2893
2894                 npair++;
2895             }
2896         }
2897
2898 #ifdef PRUNE_LIST_CPU_ONE
2899         /* If we only found 1 pair, check if any atoms are actually
2900          * within the cut-off, so we could get rid of it.
2901          */
2902         if (npair == 1 && d2l[ci_last] >= rbb2)
2903         {
2904             /* Avoid using function pointers here, as it's slower */
2905             if (
2906 #ifdef NBNXN_8BB_SSE
2907                 !subc_in_range_sse8
2908 #else
2909                 !subc_in_range_x
2910 #endif
2911                                 (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
2912             {
2913                 imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
2914                 npair--;
2915             }
2916         }
2917 #endif
2918
2919         if (npair > 0)
2920         {
2921             /* We have a useful sj entry, close it now */
2922
2923             /* Set the exclucions for the ci== sj entry.
2924              * Here we don't bother to check if this entry is actually flagged,
2925              * as it will nearly always be in the list.
2926              */
2927             if (sci_equals_scj)
2928             {
2929                 set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
2930             }
2931
2932             /* Copy the cluster interaction mask to the list */
2933             for(w=0; w<NWARP; w++)
2934             {
2935                 cj4->imei[w].imask |= imask;
2936             }
2937
2938             nbl->work->cj_ind++;
2939
2940             /* Keep the count */
2941             nbl->nci_tot += npair;
2942
2943             /* Increase the closing index in i super-cell list */
2944             nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
2945         }
2946     }
2947 }
2948
2949 /* Set all atom-pair exclusions from the topology stored in excl
2950  * as masks in the pair-list for simple list i-entry nbl_ci
2951  */
2952 static void set_ci_top_excls(const nbnxn_search_t nbs,
2953                              nbnxn_pairlist_t *nbl,
2954                              gmx_bool diagRemoved,
2955                              int na_ci_2log,
2956                              int na_cj_2log,
2957                              const nbnxn_ci_t *nbl_ci,
2958                              const t_blocka *excl)
2959 {
2960     const int *cell;
2961     int ci;
2962     int cj_ind_first,cj_ind_last;
2963     int cj_first,cj_last;
2964     int ndirect;
2965     int i,ai,aj,si,eind,ge,se;
2966     int found,cj_ind_0,cj_ind_1,cj_ind_m;
2967     int cj_m;
2968     gmx_bool Found_si;
2969     int si_ind;
2970     nbnxn_excl_t *nbl_excl;
2971     int inner_i,inner_e;
2972
2973     cell = nbs->cell;
2974
2975     if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
2976     {
2977         /* Empty list */
2978         return;
2979     }
2980
2981     ci = nbl_ci->ci;
2982
2983     cj_ind_first = nbl_ci->cj_ind_start;
2984     cj_ind_last  = nbl->ncj - 1;
2985
2986     cj_first = nbl->cj[cj_ind_first].cj;
2987     cj_last  = nbl->cj[cj_ind_last].cj;
2988
2989     /* Determine how many contiguous j-cells we have starting
2990      * from the first i-cell. This number can be used to directly
2991      * calculate j-cell indices for excluded atoms.
2992      */
2993     ndirect = 0;
2994     if (na_ci_2log == na_cj_2log)
2995     {
2996         while (cj_ind_first + ndirect <= cj_ind_last &&
2997                nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
2998         {
2999             ndirect++;
3000         }
3001     }
3002 #ifdef NBNXN_SEARCH_SSE
3003     else
3004     {
3005         while (cj_ind_first + ndirect <= cj_ind_last &&
3006                nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
3007         {
3008             ndirect++;
3009         }
3010     }
3011 #endif
3012
3013     /* Loop over the atoms in the i super-cell */
3014     for(i=0; i<nbl->na_sc; i++)
3015     {
3016         ai = nbs->a[ci*nbl->na_sc+i];
3017         if (ai >= 0)
3018         {
3019             si  = (i>>na_ci_2log);
3020
3021             /* Loop over the topology-based exclusions for this i-atom */
3022             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3023             {
3024                 aj = excl->a[eind];
3025
3026                 if (aj == ai)
3027                 {
3028                     /* The self exclusion are already set, save some time */
3029                     continue;
3030                 }
3031
3032                 ge = cell[aj];
3033
3034                 /* Without shifts we only calculate interactions j>i
3035                  * for one-way pair-lists.
3036                  */
3037                 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3038                 {
3039                     continue;
3040                 }
3041
3042                 se = (ge >> na_cj_2log);
3043
3044                 /* Could the cluster se be in our list? */
3045                 if (se >= cj_first && se <= cj_last)
3046                 {
3047                     if (se < cj_first + ndirect)
3048                     {
3049                         /* We can calculate cj_ind directly from se */
3050                         found = cj_ind_first + se - cj_first;
3051                     }
3052                     else
3053                     {
3054                         /* Search for se using bisection */
3055                         found = -1;
3056                         cj_ind_0 = cj_ind_first + ndirect;
3057                         cj_ind_1 = cj_ind_last + 1;
3058                         while (found == -1 && cj_ind_0 < cj_ind_1)
3059                         {
3060                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3061
3062                             cj_m = nbl->cj[cj_ind_m].cj;
3063
3064                             if (se == cj_m)
3065                             {
3066                                 found = cj_ind_m;
3067                             }
3068                             else if (se < cj_m)
3069                             {
3070                                 cj_ind_1 = cj_ind_m;
3071                             }
3072                             else
3073                             {
3074                                 cj_ind_0 = cj_ind_m + 1;
3075                             }
3076                         }
3077                     }
3078
3079                     if (found >= 0)
3080                     {
3081                         inner_i = i  - (si << na_ci_2log);
3082                         inner_e = ge - (se << na_cj_2log);
3083
3084                         nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
3085                     }
3086                 }
3087             }
3088         }
3089     }
3090 }
3091
3092 /* Set all atom-pair exclusions from the topology stored in excl
3093  * as masks in the pair-list for i-super-cell entry nbl_sci
3094  */
3095 static void set_sci_top_excls(const nbnxn_search_t nbs,
3096                               nbnxn_pairlist_t *nbl,
3097                               gmx_bool diagRemoved,
3098                               int na_c_2log,
3099                               const nbnxn_sci_t *nbl_sci,
3100                               const t_blocka *excl)
3101 {
3102     const int *cell;
3103     int na_c;
3104     int sci;
3105     int cj_ind_first,cj_ind_last;
3106     int cj_first,cj_last;
3107     int ndirect;
3108     int i,ai,aj,si,eind,ge,se;
3109     int found,cj_ind_0,cj_ind_1,cj_ind_m;
3110     int cj_m;
3111     gmx_bool Found_si;
3112     int si_ind;
3113     nbnxn_excl_t *nbl_excl;
3114     int inner_i,inner_e,w;
3115
3116     cell = nbs->cell;
3117
3118     na_c = nbl->na_ci;
3119
3120     if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3121     {
3122         /* Empty list */
3123         return;
3124     }
3125
3126     sci = nbl_sci->sci;
3127
3128     cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3129     cj_ind_last  = nbl->work->cj_ind - 1;
3130
3131     cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3132     cj_last  = nbl_cj(nbl,cj_ind_last);
3133
3134     /* Determine how many contiguous j-clusters we have starting
3135      * from the first i-cluster. This number can be used to directly
3136      * calculate j-cluster indices for excluded atoms.
3137      */
3138     ndirect = 0;
3139     while (cj_ind_first + ndirect <= cj_ind_last &&
3140            nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3141     {
3142         ndirect++;
3143     }
3144
3145     /* Loop over the atoms in the i super-cell */
3146     for(i=0; i<nbl->na_sc; i++)
3147     {
3148         ai = nbs->a[sci*nbl->na_sc+i];
3149         if (ai >= 0)
3150         {
3151             si  = (i>>na_c_2log);
3152
3153             /* Loop over the topology-based exclusions for this i-atom */
3154             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3155             {
3156                 aj = excl->a[eind];
3157
3158                 if (aj == ai)
3159                 {
3160                     /* The self exclusion are already set, save some time */
3161                     continue;
3162                 }
3163
3164                 ge = cell[aj];
3165
3166                 /* Without shifts we only calculate interactions j>i
3167                  * for one-way pair-lists.
3168                  */
3169                 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3170                 {
3171                     continue;
3172                 }
3173
3174                 se = ge>>na_c_2log;
3175                 /* Could the cluster se be in our list? */
3176                 if (se >= cj_first && se <= cj_last)
3177                 {
3178                     if (se < cj_first + ndirect)
3179                     {
3180                         /* We can calculate cj_ind directly from se */
3181                         found = cj_ind_first + se - cj_first;
3182                     }
3183                     else
3184                     {
3185                         /* Search for se using bisection */
3186                         found = -1;
3187                         cj_ind_0 = cj_ind_first + ndirect;
3188                         cj_ind_1 = cj_ind_last + 1;
3189                         while (found == -1 && cj_ind_0 < cj_ind_1)
3190                         {
3191                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3192
3193                             cj_m = nbl_cj(nbl,cj_ind_m);
3194
3195                             if (se == cj_m)
3196                             {
3197                                 found = cj_ind_m;
3198                             }
3199                             else if (se < cj_m)
3200                             {
3201                                 cj_ind_1 = cj_ind_m;
3202                             }
3203                             else
3204                             {
3205                                 cj_ind_0 = cj_ind_m + 1;
3206                             }
3207                         }
3208                     }
3209
3210                     if (found >= 0)
3211                     {
3212                         inner_i = i  - si*na_c;
3213                         inner_e = ge - se*na_c;
3214
3215 /* Macro for getting the index of atom a within a cluster */
3216 #define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3217 /* Macro for converting an atom number to a cluster number */
3218 #define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3219
3220                         if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
3221                         {
3222                             w       = (inner_e >> 2);
3223
3224                             get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
3225
3226                             nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
3227                                 ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
3228                         }
3229
3230 #undef AMODI
3231 #undef A2CI
3232                     }
3233                 }
3234             }
3235         }
3236     }
3237 }
3238
3239 /* Reallocate the simple ci list for at least n entries */
3240 static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
3241 {
3242     nbl->ci_nalloc = over_alloc_small(n);
3243     nbnxn_realloc_void((void **)&nbl->ci,
3244                        nbl->nci*sizeof(*nbl->ci),
3245                        nbl->ci_nalloc*sizeof(*nbl->ci),
3246                        nbl->alloc,nbl->free);
3247 }
3248
3249 /* Reallocate the super-cell sci list for at least n entries */
3250 static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
3251 {
3252     nbl->sci_nalloc = over_alloc_small(n);
3253     nbnxn_realloc_void((void **)&nbl->sci,
3254                        nbl->nsci*sizeof(*nbl->sci),
3255                        nbl->sci_nalloc*sizeof(*nbl->sci),
3256                        nbl->alloc,nbl->free);
3257 }
3258
3259 /* Make a new ci entry at index nbl->nci */
3260 static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
3261                          nbnxn_list_work_t *work)
3262 {
3263     if (nbl->nci + 1 > nbl->ci_nalloc)
3264     {
3265         nb_realloc_ci(nbl,nbl->nci+1);
3266     }
3267     nbl->ci[nbl->nci].ci            = ci;
3268     nbl->ci[nbl->nci].shift         = shift;
3269     /* Store the interaction flags along with the shift */
3270     nbl->ci[nbl->nci].shift        |= flags;
3271     nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
3272     nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
3273 }
3274
3275 /* Make a new sci entry at index nbl->nsci */
3276 static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
3277                           nbnxn_list_work_t *work)
3278 {
3279     if (nbl->nsci + 1 > nbl->sci_nalloc)
3280     {
3281         nb_realloc_sci(nbl,nbl->nsci+1);
3282     }
3283     nbl->sci[nbl->nsci].sci           = sci;
3284     nbl->sci[nbl->nsci].shift         = shift;
3285     nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3286     nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
3287 }
3288
3289 /* Sort the simple j-list cj on exclusions.
3290  * Entries with exclusions will all be sorted to the beginning of the list.
3291  */
3292 static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
3293                          nbnxn_list_work_t *work)
3294 {
3295     int jnew,j;
3296
3297     if (ncj > work->cj_nalloc)
3298     {
3299         work->cj_nalloc = over_alloc_large(ncj);
3300         srenew(work->cj,work->cj_nalloc);
3301     }
3302
3303     /* Make a list of the j-cells involving exclusions */
3304     jnew = 0;
3305     for(j=0; j<ncj; j++)
3306     {
3307         if (cj[j].excl != NBNXN_INT_MASK_ALL)
3308         {
3309             work->cj[jnew++] = cj[j];
3310         }
3311     }
3312     /* Check if there are exclusions at all or not just the first entry */
3313     if (!((jnew == 0) ||
3314           (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
3315     {
3316         for(j=0; j<ncj; j++)
3317         {
3318             if (cj[j].excl == NBNXN_INT_MASK_ALL)
3319             {
3320                 work->cj[jnew++] = cj[j];
3321             }
3322         }
3323         for(j=0; j<ncj; j++)
3324         {
3325             cj[j] = work->cj[j];
3326         }
3327     }
3328 }
3329
3330 /* Close this simple list i entry */
3331 static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3332 {
3333     int jlen;
3334
3335     /* All content of the new ci entry have already been filled correctly,
3336      * we only need to increase the count here (for non empty lists).
3337      */
3338     jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3339     if (jlen > 0)
3340     {
3341         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
3342
3343         if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
3344         {
3345             nbl->work->ncj_hlj += jlen;
3346         }
3347         else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3348         {
3349             nbl->work->ncj_noq += jlen;
3350         }
3351
3352         nbl->nci++;
3353     }
3354 }
3355
3356 /* Split sci entry for load balancing on the GPU.
3357  * As we only now the current count on our own thread,
3358  * we will need to estimate the current total amount of i-entries.
3359  * As the lists get concatenated later, this estimate depends
3360  * both on nthread and our own thread index thread.
3361  */
3362 static void split_sci_entry(nbnxn_pairlist_t *nbl,
3363                             int nsp_max_av,gmx_bool progBal,int nc_bal,
3364                             int thread,int nthread)
3365 {
3366     int nsci_est;
3367     int nsp_max;
3368     int cj4_start,cj4_end,j4len,cj4;
3369     int sci;
3370     int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
3371     int p;
3372
3373     /* Estimate the total numbers of ci's of the nblist combined
3374      * over all threads using the target number of ci's.
3375      */
3376     nsci_est = nc_bal*thread/nthread + nbl->nsci;
3377     if (progBal)
3378     {
3379         /* The first ci blocks should be larger, to avoid overhead.
3380          * The last ci blocks should be smaller, to improve load balancing.
3381          */
3382         nsp_max = max(1,
3383                       nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3384     }
3385     else
3386     {
3387         nsp_max = nsp_max_av;
3388     }
3389
3390     cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3391     cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
3392     j4len = cj4_end - cj4_start;
3393
3394     if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3395     {
3396         /* Remove the last ci entry and process the cj4's again */
3397         nbl->nsci -= 1;
3398
3399         sci        = nbl->nsci;
3400         cj4        = cj4_start;
3401         nsp        = 0;
3402         nsp_sci    = 0;
3403         nsp_cj4_e  = 0;
3404         nsp_cj4    = 0;
3405         while (cj4 < cj4_end)
3406         {
3407             nsp_cj4_p = nsp_cj4;
3408             nsp_cj4   = 0;
3409             for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3410             {
3411                 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3412             }
3413             nsp += nsp_cj4;
3414
3415             if (nsp > nsp_max && nsp > nsp_cj4)
3416             {
3417                 nbl->sci[sci].cj4_ind_end = cj4;
3418                 sci++;
3419                 nbl->nsci++;
3420                 if (nbl->nsci+1 > nbl->sci_nalloc)
3421                 {
3422                     nb_realloc_sci(nbl,nbl->nsci+1);
3423                 }
3424                 nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
3425                 nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
3426                 nbl->sci[sci].cj4_ind_start = cj4;
3427                 nsp_sci   = nsp - nsp_cj4;
3428                 nsp_cj4_e = nsp_cj4_p;
3429                 nsp       = nsp_cj4;
3430             }
3431
3432             cj4++;
3433         }
3434
3435         /* Put the remaining cj4's in a new ci entry */
3436         nbl->sci[sci].cj4_ind_end = cj4_end;
3437
3438         /* Possibly balance out the last two ci's
3439          * by moving the last cj4 of the second last ci.
3440          */
3441         if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3442         {
3443             nbl->sci[sci-1].cj4_ind_end--;
3444             nbl->sci[sci].cj4_ind_start--;
3445         }
3446
3447         sci++;
3448         nbl->nsci++;
3449     }
3450 }
3451
3452 /* Clost this super/sub list i entry */
3453 static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
3454                                     int nsp_max_av,
3455                                     gmx_bool progBal,int nc_bal,
3456                                     int thread,int nthread)
3457 {
3458     int j4len,tlen;
3459     int nb,b;
3460
3461     /* All content of the new ci entry have already been filled correctly,
3462      * we only need to increase the count here (for non empty lists).
3463      */
3464     j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
3465     if (j4len > 0)
3466     {
3467         /* We can only have complete blocks of 4 j-entries in a list,
3468          * so round the count up before closing.
3469          */
3470         nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
3471         nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3472
3473         nbl->nsci++;
3474
3475         if (nsp_max_av > 0)
3476         {
3477             split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
3478         }
3479     }
3480 }
3481
3482 /* Syncs the working array before adding another grid pair to the list */
3483 static void sync_work(nbnxn_pairlist_t *nbl)
3484 {
3485     if (!nbl->bSimple)
3486     {
3487         nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3488         nbl->work->cj4_init = nbl->ncj4;
3489     }
3490 }
3491
3492 /* Clears an nbnxn_pairlist_t data structure */
3493 static void clear_pairlist(nbnxn_pairlist_t *nbl)
3494 {
3495     nbl->nci           = 0;
3496     nbl->nsci          = 0;
3497     nbl->ncj           = 0;
3498     nbl->ncj4          = 0;
3499     nbl->nci_tot       = 0;
3500     nbl->nexcl         = 1;
3501
3502     nbl->work->ncj_noq = 0;
3503     nbl->work->ncj_hlj = 0;
3504 }
3505
3506 /* Sets a simple list i-cell bounding box, including PBC shift */
3507 static void set_icell_bb_simple(const float *bb,int ci,
3508                                 real shx,real shy,real shz,
3509                                 float *bb_ci)
3510 {
3511     int ia;
3512
3513     ia = ci*NNBSBB_B;
3514     bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
3515     bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
3516     bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
3517     bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
3518     bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
3519     bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
3520 }
3521
3522 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
3523 static void set_icell_bb_supersub(const float *bb,int ci,
3524                                   real shx,real shy,real shz,
3525                                   float *bb_ci)
3526 {
3527     int ia,m,i;
3528
3529 #ifdef NBNXN_BBXXXX
3530     ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
3531     for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
3532     {
3533         for(i=0; i<STRIDE_8BB; i++)
3534         {
3535             bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
3536             bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
3537             bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
3538             bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
3539             bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
3540             bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
3541         }
3542     }
3543 #else
3544     ia = ci*GPU_NSUBCELL*NNBSBB_B;
3545     for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
3546     {
3547         bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
3548         bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
3549         bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
3550         bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
3551         bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
3552         bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
3553     }
3554 #endif
3555 }
3556
3557 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
3558 static void icell_set_x_simple(int ci,
3559                                real shx,real shy,real shz,
3560                                int na_c,
3561                                int stride,const real *x,
3562                                nbnxn_list_work_t *work)
3563 {
3564     int  ia,i;
3565
3566     ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
3567
3568     for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
3569     {
3570         work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
3571         work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
3572         work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
3573     }
3574 }
3575
3576 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
3577 static void icell_set_x_supersub(int ci,
3578                                  real shx,real shy,real shz,
3579                                  int na_c,
3580                                  int stride,const real *x,
3581                                  nbnxn_list_work_t *work)
3582 {
3583     int  ia,i;
3584     real *x_ci;
3585
3586     x_ci = work->x_ci;
3587
3588     ia = ci*GPU_NSUBCELL*na_c;
3589     for(i=0; i<GPU_NSUBCELL*na_c; i++)
3590     {
3591         x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
3592         x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
3593         x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
3594     }
3595 }
3596
3597 #ifdef NBNXN_SEARCH_SSE
3598 /* Copies PBC shifted super-cell packed atom coordinates to working array */
3599 static void icell_set_x_supersub_sse8(int ci,
3600                                       real shx,real shy,real shz,
3601                                       int na_c,
3602                                       int stride,const real *x,
3603                                       nbnxn_list_work_t *work)
3604 {
3605     int  si,io,ia,i,j;
3606     real *x_ci;
3607
3608     x_ci = work->x_ci;
3609
3610     for(si=0; si<GPU_NSUBCELL; si++)
3611     {
3612         for(i=0; i<na_c; i+=STRIDE_8BB)
3613         {
3614             io = si*na_c + i;
3615             ia = ci*GPU_NSUBCELL*na_c + io;
3616             for(j=0; j<STRIDE_8BB; j++)
3617             {
3618                 x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
3619                 x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
3620                 x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
3621             }
3622         }
3623     }
3624 }
3625 #endif
3626
3627 static real nbnxn_rlist_inc_nonloc_fac = 0.6;
3628
3629 /* Due to the cluster size the effective pair-list is longer than
3630  * that of a simple atom pair-list. This function gives the extra distance.
3631  */
3632 real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
3633 {
3634     return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
3635 }
3636
3637 /* Estimates the interaction volume^2 for non-local interactions */
3638 static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
3639 {
3640     int  z,d;
3641     real cl,ca,za;
3642     real vold_est;
3643     real vol2_est_tot;
3644
3645     vol2_est_tot = 0;
3646
3647     /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
3648      * not home interaction volume^2. As these volumes are not additive,
3649      * this is an overestimate, but it would only be significant in the limit
3650      * of small cells, where we anyhow need to split the lists into
3651      * as small parts as possible.
3652      */
3653
3654     for(z=0; z<zones->n; z++)
3655     {
3656         if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
3657         {
3658             cl = 0;
3659             ca = 1;
3660             za = 1;
3661             for(d=0; d<DIM; d++)
3662             {
3663                 if (zones->shift[z][d] == 0)
3664                 {
3665                     cl += 0.5*ls[d];
3666                     ca *= ls[d];
3667                     za *= zones->size[z].x1[d] - zones->size[z].x0[d];
3668                 }
3669             }
3670
3671             /* 4 octants of a sphere */
3672             vold_est  = 0.25*M_PI*r*r*r*r;
3673             /* 4 quarter pie slices on the edges */
3674             vold_est += 4*cl*M_PI/6.0*r*r*r;
3675             /* One rectangular volume on a face */
3676             vold_est += ca*0.5*r*r;
3677
3678             vol2_est_tot += vold_est*za;
3679         }
3680     }
3681
3682     return vol2_est_tot;
3683 }
3684
3685 /* Estimates the average size of a full j-list for super/sub setup */
3686 static int get_nsubpair_max(const nbnxn_search_t nbs,
3687                             int iloc,
3688                             real rlist,
3689                             int min_ci_balanced)
3690 {
3691     const nbnxn_grid_t *grid;
3692     rvec ls;
3693     real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
3694     int  nsubpair_max;
3695
3696     grid = &nbs->grid[0];
3697
3698     ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
3699     ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
3700     ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
3701
3702     /* The average squared length of the diagonal of a sub cell */
3703     xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
3704
3705     /* The formulas below are a heuristic estimate of the average nsj per si*/
3706     r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
3707
3708     if (!nbs->DomDec || nbs->zones->n == 1)
3709     {
3710         nsp_est_nl = 0;
3711     }
3712     else
3713     {
3714         nsp_est_nl =
3715             sqr(grid->atom_density/grid->na_c)*
3716             nonlocal_vol2(nbs->zones,ls,r_eff_sup);
3717     }
3718
3719     if (LOCAL_I(iloc))
3720     {
3721         /* Sub-cell interacts with itself */
3722         vol_est  = ls[XX]*ls[YY]*ls[ZZ];
3723         /* 6/2 rectangular volume on the faces */
3724         vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
3725         /* 12/2 quarter pie slices on the edges */
3726         vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
3727         /* 4 octants of a sphere */
3728         vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
3729
3730         nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
3731
3732         /* Subtract the non-local pair count */
3733         nsp_est -= nsp_est_nl;
3734
3735         if (debug)
3736         {
3737             fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
3738                     nsp_est,nsp_est_nl);
3739         }
3740     }
3741     else
3742     {
3743         nsp_est = nsp_est_nl;
3744     }
3745
3746     if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
3747     {
3748         /* We don't need to worry */
3749         nsubpair_max = -1;
3750     }
3751     else
3752     {
3753         /* Thus the (average) maximum j-list size should be as follows */
3754         nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
3755
3756         /* Since the target value is a maximum (this avoid high outliers,
3757          * which lead to load imbalance), not average, we get more lists
3758          * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
3759          * But more importantly, the optimal GPU performance moves
3760          * to lower number of block for very small blocks.
3761          * To compensate we add the maximum pair count per cj4.
3762          */
3763         nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
3764     }
3765
3766     if (debug)
3767     {
3768         fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
3769                 nsp_est,nsubpair_max);
3770     }
3771
3772     return nsubpair_max;
3773 }
3774
3775 /* Debug list print function */
3776 static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3777 {
3778     int i,j;
3779
3780     for(i=0; i<nbl->nci; i++)
3781     {
3782         fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
3783                 nbl->ci[i].ci,nbl->ci[i].shift,
3784                 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
3785
3786         for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
3787         {
3788             fprintf(fp,"  cj %5d  imask %x\n",
3789                     nbl->cj[j].cj,
3790                     nbl->cj[j].excl);
3791         }
3792     }
3793 }
3794
3795 /* Debug list print function */
3796 static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3797 {
3798     int i,j4,j;
3799
3800     for(i=0; i<nbl->nsci; i++)
3801     {
3802         fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
3803                 nbl->sci[i].sci,nbl->sci[i].shift,
3804                 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
3805
3806         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
3807         {
3808             for(j=0; j<4; j++)
3809             {
3810                 fprintf(fp,"  sj %5d  imask %x\n",
3811                         nbl->cj4[j4].cj[j],
3812                         nbl->cj4[j4].imei[0].imask);
3813             }
3814         }
3815     }
3816 }
3817
3818 /* Combine pair lists *nbl generated on multiple threads nblc */
3819 static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
3820                             nbnxn_pairlist_t *nblc)
3821 {
3822     int nsci,ncj4,nexcl;
3823     int n,i;
3824
3825     if (nblc->bSimple)
3826     {
3827         gmx_incons("combine_nblists does not support simple lists");
3828     }
3829
3830     nsci  = nblc->nsci;
3831     ncj4  = nblc->ncj4;
3832     nexcl = nblc->nexcl;
3833     for(i=0; i<nnbl; i++)
3834     {
3835         nsci  += nbl[i]->nsci;
3836         ncj4  += nbl[i]->ncj4;
3837         nexcl += nbl[i]->nexcl;
3838     }
3839
3840     if (nsci > nblc->sci_nalloc)
3841     {
3842         nb_realloc_sci(nblc,nsci);
3843     }
3844     if (ncj4 > nblc->cj4_nalloc)
3845     {
3846         nblc->cj4_nalloc = over_alloc_small(ncj4);
3847         nbnxn_realloc_void((void **)&nblc->cj4,
3848                            nblc->ncj4*sizeof(*nblc->cj4),
3849                            nblc->cj4_nalloc*sizeof(*nblc->cj4),
3850                            nblc->alloc,nblc->free);
3851     }
3852     if (nexcl > nblc->excl_nalloc)
3853     {
3854         nblc->excl_nalloc = over_alloc_small(nexcl);
3855         nbnxn_realloc_void((void **)&nblc->excl,
3856                            nblc->nexcl*sizeof(*nblc->excl),
3857                            nblc->excl_nalloc*sizeof(*nblc->excl),
3858                            nblc->alloc,nblc->free);
3859     }
3860
3861     /* Each thread should copy its own data to the combined arrays,
3862      * as otherwise data will go back and forth between different caches.
3863      */
3864 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
3865     for(n=0; n<nnbl; n++)
3866     {
3867         int sci_offset;
3868         int cj4_offset;
3869         int ci_offset;
3870         int excl_offset;
3871         int i,j4;
3872         const nbnxn_pairlist_t *nbli;
3873
3874         /* Determine the offset in the combined data for our thread */
3875         sci_offset  = nblc->nsci;
3876         cj4_offset  = nblc->ncj4;
3877         ci_offset   = nblc->nci_tot;
3878         excl_offset = nblc->nexcl;
3879
3880         for(i=0; i<n; i++)
3881         {
3882             sci_offset  += nbl[i]->nsci;
3883             cj4_offset  += nbl[i]->ncj4;
3884             ci_offset   += nbl[i]->nci_tot;
3885             excl_offset += nbl[i]->nexcl;
3886         }
3887
3888         nbli = nbl[n];
3889
3890         for(i=0; i<nbli->nsci; i++)
3891         {
3892             nblc->sci[sci_offset+i]                = nbli->sci[i];
3893             nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
3894             nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
3895         }
3896
3897         for(j4=0; j4<nbli->ncj4; j4++)
3898         {
3899             nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
3900             nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
3901             nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
3902         }
3903
3904         for(j4=0; j4<nbli->nexcl; j4++)
3905         {
3906             nblc->excl[excl_offset+j4] = nbli->excl[j4];
3907         }
3908     }
3909
3910     for(n=0; n<nnbl; n++)
3911     {
3912         nblc->nsci    += nbl[n]->nsci;
3913         nblc->ncj4    += nbl[n]->ncj4;
3914         nblc->nci_tot += nbl[n]->nci_tot;
3915         nblc->nexcl   += nbl[n]->nexcl;
3916     }
3917 }
3918
3919 /* Returns the next ci to be processes by our thread */
3920 static gmx_bool next_ci(const nbnxn_grid_t *grid,
3921                         int conv,
3922                         int nth,int ci_block,
3923                         int *ci_x,int *ci_y,
3924                         int *ci_b,int *ci)
3925 {
3926     (*ci_b)++;
3927     (*ci)++;
3928
3929     if (*ci_b == ci_block)
3930     {
3931         /* Jump to the next block assigned to this task */
3932         *ci   += (nth - 1)*ci_block;
3933         *ci_b  = 0;
3934     }
3935
3936     if (*ci >= grid->nc*conv)
3937     {
3938         return FALSE;
3939     }
3940
3941     while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
3942     {
3943         *ci_y += 1;
3944         if (*ci_y == grid->ncy)
3945         {
3946             *ci_x += 1;
3947             *ci_y  = 0;
3948         }
3949     }
3950
3951     return TRUE;
3952 }
3953
3954 /* Returns the distance^2 for which we put cell pairs in the list
3955  * without checking atom pair distances. This is usually < rlist^2.
3956  */
3957 static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
3958                                         const nbnxn_grid_t *gridj,
3959                                         real rlist,
3960                                         gmx_bool simple)
3961 {
3962     /* If the distance between two sub-cell bounding boxes is less
3963      * than this distance, do not check the distance between
3964      * all particle pairs in the sub-cell, since then it is likely
3965      * that the box pair has atom pairs within the cut-off.
3966      * We use the nblist cut-off minus 0.5 times the average x/y diagonal
3967      * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
3968      * Using more than 0.5 gains at most 0.5%.
3969      * If forces are calculated more than twice, the performance gain
3970      * in the force calculation outweighs the cost of checking.
3971      * Note that with subcell lists, the atom-pair distance check
3972      * is only performed when only 1 out of 8 sub-cells in within range,
3973      * this is because the GPU is much faster than the cpu.
3974      */
3975     real bbx,bby;
3976     real rbb2;
3977
3978     bbx = 0.5*(gridi->sx + gridj->sx);
3979     bby = 0.5*(gridi->sy + gridj->sy);
3980     if (!simple)
3981     {
3982         bbx /= GPU_NSUBCELL_X;
3983         bby /= GPU_NSUBCELL_Y;
3984     }
3985
3986     rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
3987
3988 #ifndef GMX_DOUBLE
3989     return rbb2;
3990 #else
3991     return (float)((1+GMX_FLOAT_EPS)*rbb2);
3992 #endif
3993 }
3994
3995 static int get_ci_block_size(const nbnxn_grid_t *gridi,
3996                              gmx_bool bDomDec, int nth,
3997                              gmx_bool *bFBufferFlag)
3998 {
3999     const int ci_block_enum = 5;
4000     const int ci_block_denom = 11;
4001     const int ci_block_min_atoms = 16;
4002     int ci_block;
4003
4004     /* Here we decide how to distribute the blocks over the threads.
4005      * We use prime numbers to try to avoid that the grid size becomes
4006      * a multiple of the number of threads, which would lead to some
4007      * threads getting "inner" pairs and others getting boundary pairs,
4008      * which in turns will lead to load imbalance between threads.
4009      * Set the block size as 5/11/ntask times the average number of cells
4010      * in a y,z slab. This should ensure a quite uniform distribution
4011      * of the grid parts of the different thread along all three grid
4012      * zone boundaries with 3D domain decomposition. At the same time
4013      * the blocks will not become too small.
4014      */
4015     ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
4016
4017     /* Ensure the blocks are not too small: avoids cache invalidation */
4018     if (ci_block*gridi->na_sc < ci_block_min_atoms)
4019     {
4020         ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
4021     }
4022     
4023     /* Without domain decomposition
4024      * or with less than 3 blocks per task, divide in nth blocks.
4025      */
4026     if (!bDomDec || ci_block*3*nth > gridi->nc)
4027     {
4028         ci_block = (gridi->nc + nth - 1)/nth;
4029         /* With non-interleaved blocks it makes sense to flag which
4030          * part of the force output thread buffer we access.
4031          * We use bit flags, so we have to check if it fits.
4032          */
4033         *bFBufferFlag = (nth > 1 && nth <= sizeof(unsigned int)*8);
4034     }
4035     else
4036     {
4037         *bFBufferFlag = FALSE;
4038     }
4039
4040     return ci_block;
4041 }
4042
4043 /* Generates the part of pair-list nbl assigned to our thread */
4044 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
4045                                      const nbnxn_grid_t *gridi,
4046                                      const nbnxn_grid_t *gridj,
4047                                      nbnxn_search_work_t *work,
4048                                      const nbnxn_atomdata_t *nbat,
4049                                      const t_blocka *excl,
4050                                      real rlist,
4051                                      int nb_kernel_type,
4052                                      int ci_block,
4053                                      gmx_bool bFBufferFlag,
4054                                      int nsubpair_max,
4055                                      gmx_bool progBal,
4056                                      int min_ci_balanced,
4057                                      int th,int nth,
4058                                      nbnxn_pairlist_t *nbl)
4059 {
4060     int  na_cj_2log;
4061     matrix box;
4062     real rl2;
4063     float rbb2;
4064     int  d;
4065     int  ci_b,ci,ci_x,ci_y,ci_xy,cj;
4066     ivec shp;
4067     int  tx,ty,tz;
4068     int  shift;
4069     gmx_bool bMakeList;
4070     real shx,shy,shz;
4071     int  conv_i,cell0_i;
4072     const float *bb_i,*bbcz_i,*bbcz_j;
4073     const int *flags_i;
4074     real bx0,bx1,by0,by1,bz0,bz1;
4075     real bz1_frac;
4076     real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
4077     int  cxf,cxl,cyf,cyf_x,cyl;
4078     int  cx,cy;
4079     int  c0,c1,cs,cf,cl;
4080     int  ndistc;
4081     int  ncpcheck;
4082     int  gridj_flag_shift=0,cj_offset=0;
4083     unsigned *gridj_flag=NULL;
4084     int  ncj_old_i,ncj_old_j;
4085
4086     nbs_cycle_start(&work->cc[enbsCCsearch]);
4087
4088     if (gridj->bSimple != nbl->bSimple)
4089     {
4090         gmx_incons("Grid incompatible with pair-list");
4091     }
4092
4093     sync_work(nbl);
4094     nbl->na_sc = gridj->na_sc;
4095     nbl->na_ci = gridj->na_c;
4096     nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4097     na_cj_2log = get_2log(nbl->na_cj);
4098
4099     nbl->rlist  = rlist;
4100
4101     if (bFBufferFlag)
4102     {
4103         init_grid_flags(&work->gridi_flags,gridi);
4104         init_grid_flags(&work->gridj_flags,gridj);
4105
4106         /* To flag j-blocks for gridj, we need to convert j-clusters to flag blocks */
4107         gridj_flag_shift = 0;
4108         while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_CELLBLOCK_SIZE*nbl->na_ci)
4109         {
4110             gridj_flag_shift++;
4111         }
4112         /* We will subtract the cell offset, which is not a multiple of the block size */
4113         cj_offset = ci_to_cj(get_2log(nbl->na_cj),gridj->cell0);
4114
4115         gridj_flag = work->gridj_flags.flag;
4116     }
4117
4118     copy_mat(nbs->box,box);
4119
4120     rl2 = nbl->rlist*nbl->rlist;
4121
4122     rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
4123
4124     if (debug)
4125     {
4126         fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
4127     }
4128
4129     /* Set the shift range */
4130     for(d=0; d<DIM; d++)
4131     {
4132         /* Check if we need periodicity shifts.
4133          * Without PBC or with domain decomposition we don't need them.
4134          */
4135         if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
4136         {
4137             shp[d] = 0;
4138         }
4139         else
4140         {
4141             if (d == XX &&
4142                 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4143             {
4144                 shp[d] = 2;
4145             }
4146             else
4147             {
4148                 shp[d] = 1;
4149             }
4150         }
4151     }
4152
4153     if (nbl->bSimple && !gridi->bSimple)
4154     {
4155         conv_i  = gridi->na_sc/gridj->na_sc;
4156         bb_i    = gridi->bb_simple;
4157         bbcz_i  = gridi->bbcz_simple;
4158         flags_i = gridi->flags_simple;
4159     }
4160     else
4161     {
4162         conv_i  = 1;
4163         bb_i    = gridi->bb;
4164         bbcz_i  = gridi->bbcz;
4165         flags_i = gridi->flags;
4166     }
4167     cell0_i = gridi->cell0*conv_i;
4168
4169     bbcz_j = gridj->bbcz;
4170
4171     if (conv_i != 1)
4172     {
4173         /* Blocks of the conversion factor - 1 give a large repeat count
4174          * combined with a small block size. This should result in good
4175          * load balancing for both small and large domains.
4176          */
4177         ci_block = conv_i - 1;
4178     }
4179     if (debug)
4180     {
4181         fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4182                 gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
4183     }
4184
4185     ndistc = 0;
4186     ncpcheck = 0;
4187
4188     /* Initially ci_b and ci to 1 before where we want them to start,
4189      * as they will both be incremented in next_ci.
4190      */
4191     ci_b = -1;
4192     ci   = th*ci_block - 1;
4193     ci_x = 0;
4194     ci_y = 0;
4195     while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
4196     {
4197         if (nbl->bSimple && flags_i[ci] == 0)
4198         {
4199             continue;
4200         }
4201
4202         ncj_old_i = nbl->ncj;
4203
4204         d2cx = 0;
4205         if (gridj != gridi && shp[XX] == 0)
4206         {
4207             if (nbl->bSimple)
4208             {
4209                 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
4210             }
4211             else
4212             {
4213                 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4214             }
4215             if (bx1 < gridj->c0[XX])
4216             {
4217                 d2cx = sqr(gridj->c0[XX] - bx1);
4218
4219                 if (d2cx >= rl2)
4220                 {
4221                     continue;
4222                 }
4223             }
4224         }
4225
4226         ci_xy = ci_x*gridi->ncy + ci_y;
4227
4228         /* Loop over shift vectors in three dimensions */
4229         for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
4230         {
4231             shz = tz*box[ZZ][ZZ];
4232
4233             bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
4234             bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4235
4236             if (tz == 0)
4237             {
4238                 d2z = 0;
4239             }
4240             else if (tz < 0)
4241             {
4242                 d2z = sqr(bz1);
4243             }
4244             else
4245             {
4246                 d2z = sqr(bz0 - box[ZZ][ZZ]);
4247             }
4248
4249             d2z_cx = d2z + d2cx;
4250
4251             if (d2z_cx >= rl2)
4252             {
4253                 continue;
4254             }
4255
4256             bz1_frac =
4257                 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
4258             if (bz1_frac < 0)
4259             {
4260                 bz1_frac = 0;
4261             }
4262             /* The check with bz1_frac close to or larger than 1 comes later */
4263
4264             for (ty=-shp[YY]; ty<=shp[YY]; ty++)
4265             {
4266                 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
4267
4268                 if (nbl->bSimple)
4269                 {
4270                     by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
4271                     by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
4272                 }
4273                 else
4274                 {
4275                     by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
4276                     by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
4277                 }
4278
4279                 get_cell_range(by0,by1,
4280                                gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
4281                                d2z_cx,rl2,
4282                                &cyf,&cyl);
4283
4284                 if (cyf > cyl)
4285                 {
4286                     continue;
4287                 }
4288
4289                 d2z_cy = d2z;
4290                 if (by1 < gridj->c0[YY])
4291                 {
4292                     d2z_cy += sqr(gridj->c0[YY] - by1);
4293                 }
4294                 else if (by0 > gridj->c1[YY])
4295                 {
4296                     d2z_cy += sqr(by0 - gridj->c1[YY]);
4297                 }
4298
4299                 for (tx=-shp[XX]; tx<=shp[XX]; tx++)
4300                 {
4301                     shift = XYZ2IS(tx,ty,tz);
4302
4303 #ifdef NBNXN_SHIFT_BACKWARD
4304                     if (gridi == gridj && shift > CENTRAL)
4305                     {
4306                         continue;
4307                     }
4308 #endif
4309
4310                     shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
4311
4312                     if (nbl->bSimple)
4313                     {
4314                         bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
4315                         bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
4316                     }
4317                     else
4318                     {
4319                         bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
4320                         bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
4321                     }
4322
4323                     get_cell_range(bx0,bx1,
4324                                    gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
4325                                    d2z_cy,rl2,
4326                                    &cxf,&cxl);
4327
4328                     if (cxf > cxl)
4329                     {
4330                         continue;
4331                     }
4332
4333                     if (nbl->bSimple)
4334                     {
4335                         new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4336                                      nbl->work);
4337                     }
4338                     else
4339                     {
4340                         new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4341                                       nbl->work);
4342                     }
4343
4344 #ifndef NBNXN_SHIFT_BACKWARD
4345                     if (cxf < ci_x)
4346 #else
4347                     if (shift == CENTRAL && gridi == gridj &&
4348                         cxf < ci_x)
4349 #endif
4350                     {
4351                         /* Leave the pairs with i > j.
4352                          * x is the major index, so skip half of it.
4353                          */
4354                         cxf = ci_x;
4355                     }
4356
4357                     if (nbl->bSimple)
4358                     {
4359                         set_icell_bb_simple(bb_i,ci,shx,shy,shz,
4360                                             nbl->work->bb_ci);
4361                     }
4362                     else
4363                     {
4364                         set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
4365                                               nbl->work->bb_ci);
4366                     }
4367
4368                     nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
4369                                      gridi->na_c,nbat->xstride,nbat->x,
4370                                      nbl->work);
4371
4372                     for(cx=cxf; cx<=cxl; cx++)
4373                     {
4374                         d2zx = d2z;
4375                         if (gridj->c0[XX] + cx*gridj->sx > bx1)
4376                         {
4377                             d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
4378                         }
4379                         else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
4380                         {
4381                             d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
4382                         }
4383
4384 #ifndef NBNXN_SHIFT_BACKWARD
4385                         if (gridi == gridj &&
4386                             cx == 0 && cyf < ci_y)
4387 #else
4388                         if (gridi == gridj &&
4389                             cx == 0 && shift == CENTRAL && cyf < ci_y)
4390 #endif
4391                         {
4392                             /* Leave the pairs with i > j.
4393                              * Skip half of y when i and j have the same x.
4394                              */
4395                             cyf_x = ci_y;
4396                         }
4397                         else
4398                         {
4399                             cyf_x = cyf;
4400                         }
4401
4402                         for(cy=cyf_x; cy<=cyl; cy++)
4403                         {
4404                             c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
4405                             c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
4406 #ifdef NBNXN_SHIFT_BACKWARD
4407                             if (gridi == gridj &&
4408                                 shift == CENTRAL && c0 < ci)
4409                             {
4410                                 c0 = ci;
4411                             }
4412 #endif
4413
4414                             d2zxy = d2zx;
4415                             if (gridj->c0[YY] + cy*gridj->sy > by1)
4416                             {
4417                                 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
4418                             }
4419                             else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
4420                             {
4421                                 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
4422                             }
4423                             if (c1 > c0 && d2zxy < rl2)
4424                             {
4425                                 cs = c0 + (int)(bz1_frac*(c1 - c0));
4426                                 if (cs >= c1)
4427                                 {
4428                                     cs = c1 - 1;
4429                                 }
4430
4431                                 d2xy = d2zxy - d2z;
4432
4433                                 /* Find the lowest cell that can possibly
4434                                  * be within range.
4435                                  */
4436                                 cf = cs;
4437                                 while(cf > c0 &&
4438                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
4439                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
4440                                 {
4441                                     cf--;
4442                                 }
4443
4444                                 /* Find the highest cell that can possibly
4445                                  * be within range.
4446                                  */
4447                                 cl = cs;
4448                                 while(cl < c1-1 &&
4449                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
4450                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
4451                                 {
4452                                     cl++;
4453                                 }
4454
4455 #ifdef NBNXN_REFCODE
4456                                 {
4457                                     /* Simple reference code, for debugging,
4458                                      * overrides the more complex code above.
4459                                      */
4460                                     int k;
4461                                     cf = c1;
4462                                     cl = -1;
4463                                     for(k=c0; k<c1; k++)
4464                                     {
4465                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4466                                                       bb+k*NNBSBB_B) < rl2 &&
4467                                             k < cf)
4468                                         {
4469                                             cf = k;
4470                                         }
4471                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4472                                                       bb+k*NNBSBB_B) < rl2 &&
4473                                             k > cl)
4474                                         {
4475                                             cl = k;
4476                                         }
4477                                     }
4478                                 }
4479 #endif
4480
4481                                 if (gridi == gridj)
4482                                 {
4483                                     /* We want each atom/cell pair only once,
4484                                      * only use cj >= ci.
4485                                      */
4486 #ifndef NBNXN_SHIFT_BACKWARD
4487                                     cf = max(cf,ci);
4488 #else
4489                                     if (shift == CENTRAL)
4490                                     {
4491                                         cf = max(cf,ci);
4492                                     }
4493 #endif
4494                                 }
4495
4496                                 if (cf <= cl)
4497                                 {
4498                                     /* For f buffer flags with simple lists */
4499                                     ncj_old_j = nbl->ncj;
4500
4501                                     switch (nb_kernel_type)
4502                                     {
4503                                     case nbk4x4_PlainC:
4504                                         check_subcell_list_space_simple(nbl,cl-cf+1);
4505
4506                                         make_cluster_list_simple(gridj,
4507                                                                  nbl,ci,cf,cl,
4508                                                                  (gridi == gridj && shift == CENTRAL),
4509                                                                  nbat->x,
4510                                                                  rl2,rbb2,
4511                                                                  &ndistc);
4512                                         break;
4513 #ifdef NBNXN_SEARCH_SSE
4514                                     case nbk4xN_X86_SIMD128:
4515                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4516                                         make_cluster_list_x86_simd128(gridj,
4517                                                                       nbl,ci,cf,cl,
4518                                                                       (gridi == gridj && shift == CENTRAL),
4519                                                                       nbat->x,
4520                                                                       rl2,rbb2,
4521                                                                       &ndistc);
4522                                         break;
4523 #ifdef GMX_X86_AVX_256
4524                                     case nbk4xN_X86_SIMD256:
4525                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4526                                         make_cluster_list_x86_simd256(gridj,
4527                                                                       nbl,ci,cf,cl,
4528                                                                       (gridi == gridj && shift == CENTRAL),
4529                                                                       nbat->x,
4530                                                                       rl2,rbb2,
4531                                                                       &ndistc);
4532                                         break;
4533 #endif
4534 #endif
4535                                     case nbk8x8x8_PlainC:
4536                                     case nbk8x8x8_CUDA:
4537                                         check_subcell_list_space_supersub(nbl,cl-cf+1);
4538                                         for(cj=cf; cj<=cl; cj++)
4539                                         {
4540                                             make_cluster_list_supersub(nbs,gridi,gridj,
4541                                                                        nbl,ci,cj,
4542                                                                        (gridi == gridj && shift == CENTRAL && ci == cj),
4543                                                                        nbat->xstride,nbat->x,
4544                                                                        rl2,rbb2,
4545                                                                        &ndistc);
4546                                         }
4547                                         break;
4548                                     }
4549                                     ncpcheck += cl - cf + 1;
4550
4551                                     if (bFBufferFlag && nbl->ncj > ncj_old_j)
4552                                     {
4553                                         int cbf,cbl,cb;
4554
4555                                         cbf = (nbl->cj[ncj_old_j].cj - cj_offset) >> gridj_flag_shift;
4556                                         cbl = (nbl->cj[nbl->ncj-1].cj - cj_offset) >> gridj_flag_shift;
4557                                         for(cb=cbf; cb<=cbl; cb++)
4558                                         {
4559                                             gridj_flag[cb] = 1U<<th;
4560                                         }
4561                                     }
4562                                 }
4563                             }
4564                         }
4565                     }
4566
4567                     /* Set the exclusions for this ci list */
4568                     if (nbl->bSimple)
4569                     {
4570                         set_ci_top_excls(nbs,
4571                                          nbl,
4572                                          shift == CENTRAL && gridi == gridj,
4573                                          gridj->na_c_2log,
4574                                          na_cj_2log,
4575                                          &(nbl->ci[nbl->nci]),
4576                                          excl);
4577                     }
4578                     else
4579                     {
4580                         set_sci_top_excls(nbs,
4581                                           nbl,
4582                                           shift == CENTRAL && gridi == gridj,
4583                                           gridj->na_c_2log,
4584                                           &(nbl->sci[nbl->nsci]),
4585                                           excl);
4586                     }
4587
4588                     /* Close this ci list */
4589                     if (nbl->bSimple)
4590                     {
4591                         close_ci_entry_simple(nbl);
4592                     }
4593                     else
4594                     {
4595                         close_ci_entry_supersub(nbl,
4596                                                 nsubpair_max,
4597                                                 progBal,min_ci_balanced,
4598                                                 th,nth);
4599                     }
4600                 }
4601             }
4602         }
4603
4604         if (bFBufferFlag && nbl->ncj > ncj_old_i)
4605         {
4606             work->gridi_flags.flag[ci>>NBNXN_CELLBLOCK_SIZE_2LOG] = 1U<<th;
4607         }
4608     }
4609
4610     work->ndistc = ndistc;
4611
4612     nbs_cycle_stop(&work->cc[enbsCCsearch]);
4613
4614     if (debug)
4615     {
4616         fprintf(debug,"number of distance checks %d\n",ndistc);
4617         fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
4618                 ncpcheck);
4619
4620         if (nbl->bSimple)
4621         {
4622             print_nblist_statistics_simple(debug,nbl,nbs,rlist);
4623         }
4624         else
4625         {
4626             print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
4627         }
4628
4629     }
4630 }
4631
4632 static void reduce_cellblock_flags(const nbnxn_search_t nbs,
4633                                    int nnbl,
4634                                    const nbnxn_grid_t *gridi,
4635                                    const nbnxn_grid_t *gridj)
4636 {
4637     int nbl,cb;
4638     const unsigned *flag;
4639
4640     if (gridi->cellblock_flags.bUse)
4641     {
4642         for(nbl=0; nbl<nnbl; nbl++)
4643         {
4644             flag = nbs->work[nbl].gridi_flags.flag;
4645             
4646             for(cb=0; cb<gridi->cellblock_flags.ncb; cb++)
4647             {
4648                 gridi->cellblock_flags.flag[cb] |= flag[cb];
4649             }
4650         }
4651     }
4652     if (gridj->cellblock_flags.bUse)
4653     {
4654         for(nbl=0; nbl<nnbl; nbl++)
4655         {
4656             flag = nbs->work[nbl].gridj_flags.flag;
4657
4658             for(cb=0; cb<gridj->cellblock_flags.ncb; cb++)
4659             {
4660                 gridj->cellblock_flags.flag[cb] |= flag[cb];
4661             }
4662         }
4663     }
4664 }
4665
4666 static void print_reduction_cost(const nbnxn_grid_t *grids,int ngrid,int nnbl)
4667 {
4668     int g,c0,c,cb,nbl;
4669     const nbnxn_grid_t *grid;
4670
4671     for(g=0; g<ngrid; g++)
4672     {
4673         grid = &grids[g];
4674
4675         c0 = 0;
4676         if (grid->cellblock_flags.bUse)
4677         {
4678             c  = 0;
4679             for(cb=0; cb<grid->cellblock_flags.ncb; cb++)
4680             {
4681                 for(nbl=0; nbl<nnbl; nbl++)
4682                 {
4683                     if (grid->cellblock_flags.flag[cb] == 1)
4684                     {
4685                         c0++;
4686                     }
4687                     else if (grid->cellblock_flags.flag[cb] & (1U<<nbl))
4688                     {
4689                         c++;
4690                     }
4691                 }
4692             }
4693         }
4694         else
4695         {
4696             c = nnbl*grid->cellblock_flags.ncb;
4697         }
4698         fprintf(debug,"nbnxn reduction buffers, grid %d: %d flag %d only buf. 0: %4.2f av. reduction: %4.2f\n",
4699                 g,nnbl,grid->cellblock_flags.bUse,
4700                 c0/(double)(grid->cellblock_flags.ncb),
4701                 c/(double)(grid->cellblock_flags.ncb));
4702     }
4703 }
4704
4705 /* Make a local or non-local pair-list, depending on iloc */
4706 void nbnxn_make_pairlist(const nbnxn_search_t nbs,
4707                          const nbnxn_atomdata_t *nbat,
4708                          const t_blocka *excl,
4709                          real rlist,
4710                          int min_ci_balanced,
4711                          nbnxn_pairlist_set_t *nbl_list,
4712                          int iloc,
4713                          int nb_kernel_type,
4714                          t_nrnb *nrnb)
4715 {
4716     nbnxn_grid_t *gridi,*gridj;
4717     int nzi,zi,zj0,zj1,zj;
4718     int nsubpair_max;
4719     int th;
4720     int nnbl;
4721     nbnxn_pairlist_t **nbl;
4722     int ci_block;
4723     gmx_bool CombineNBLists,bFBufferFlag;
4724     int np_tot,np_noq,np_hlj,nap;
4725
4726     nnbl            = nbl_list->nnbl;
4727     nbl             = nbl_list->nbl;
4728     CombineNBLists  = nbl_list->bCombined;
4729
4730     if (debug)
4731     {
4732         fprintf(debug,"ns making %d nblists\n", nnbl);
4733     }
4734
4735     if (nbl_list->bSimple)
4736     {
4737         switch (nb_kernel_type)
4738         {
4739 #ifdef NBNXN_SEARCH_SSE
4740         case nbk4xN_X86_SIMD128:
4741             nbs->icell_set_x = icell_set_x_x86_simd128;
4742             break;
4743 #ifdef GMX_X86_AVX_256
4744         case nbk4xN_X86_SIMD256:
4745             nbs->icell_set_x = icell_set_x_x86_simd256;
4746             break;
4747 #endif
4748 #endif
4749         default:
4750             nbs->icell_set_x = icell_set_x_simple;
4751             break;
4752         }
4753     }
4754     else
4755     {
4756 #ifdef NBNXN_SEARCH_SSE
4757         nbs->icell_set_x = icell_set_x_supersub_sse8;
4758 #else
4759         nbs->icell_set_x = icell_set_x_supersub;
4760 #endif
4761     }
4762
4763     if (LOCAL_I(iloc))
4764     {
4765         /* Only zone (grid) 0 vs 0 */
4766         nzi = 1;
4767         zj0 = 0;
4768         zj1 = 1;
4769     }
4770     else
4771     {
4772         nzi = nbs->zones->nizone;
4773     }
4774
4775     if (!nbl_list->bSimple && min_ci_balanced > 0)
4776     {
4777         nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
4778     }
4779     else
4780     {
4781         nsubpair_max = 0;
4782     }
4783
4784     /* Clear all pair-lists */
4785     for(th=0; th<nnbl; th++)
4786     {
4787         clear_pairlist(nbl[th]);
4788     }
4789
4790     for(zi=0; zi<nzi; zi++)
4791     {
4792         gridi = &nbs->grid[zi];
4793
4794         if (NONLOCAL_I(iloc))
4795         {
4796             zj0 = nbs->zones->izone[zi].j0;
4797             zj1 = nbs->zones->izone[zi].j1;
4798             if (zi == 0)
4799             {
4800                 zj0++;
4801             }
4802         }
4803         for(zj=zj0; zj<zj1; zj++)
4804         {
4805             gridj = &nbs->grid[zj];
4806
4807             if (debug)
4808             {
4809                 fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
4810             }
4811
4812             nbs_cycle_start(&nbs->cc[enbsCCsearch]);
4813
4814             if (nbl[0]->bSimple && !gridi->bSimple)
4815             {
4816                 /* Hybrid list, determine blocking later */
4817                 ci_block = 0;
4818                 bFBufferFlag = FALSE;
4819             }
4820             else
4821             {
4822                 ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl,
4823                                              &bFBufferFlag);
4824                 if (CombineNBLists)
4825                 {
4826                     bFBufferFlag = FALSE;
4827                 }
4828             }
4829             if (debug != NULL)
4830             {
4831                 fprintf(debug,"grid %d %d F buffer flags %d\n",
4832                         zi,zj,bFBufferFlag);
4833             }
4834
4835 #pragma omp parallel for num_threads(nnbl) schedule(static)
4836             for(th=0; th<nnbl; th++)
4837             {
4838                 if (CombineNBLists && th > 0)
4839                 {
4840                     clear_pairlist(nbl[th]);
4841                 }
4842
4843                 /* Divide the i super cell equally over the nblists */
4844                 nbnxn_make_pairlist_part(nbs,gridi,gridj,
4845                                          &nbs->work[th],nbat,excl,
4846                                          rlist,
4847                                          nb_kernel_type,
4848                                          ci_block,
4849                                          bFBufferFlag,
4850                                          nsubpair_max,
4851                                          (LOCAL_I(iloc) || nbs->zones->n <= 2),
4852                                          min_ci_balanced,
4853                                          th,nnbl,
4854                                          nbl[th]);
4855             }
4856             nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
4857
4858             np_tot = 0;
4859             np_noq = 0;
4860             np_hlj = 0;
4861             for(th=0; th<nnbl; th++)
4862             {
4863                 inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
4864
4865                 if (nbl_list->bSimple)
4866                 {
4867                     np_tot += nbl[th]->ncj;
4868                     np_noq += nbl[th]->work->ncj_noq;
4869                     np_hlj += nbl[th]->work->ncj_hlj;
4870                 }
4871                 else
4872                 {
4873                     /* This count ignores potential subsequent pair pruning */
4874                     np_tot += nbl[th]->nci_tot;
4875                 }
4876             }
4877             nap = nbl[0]->na_ci*nbl[0]->na_cj;
4878             nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
4879             nbl_list->natpair_lj  = np_noq*nap;
4880             nbl_list->natpair_q   = np_hlj*nap/2;
4881
4882             if (CombineNBLists && nnbl > 1)
4883             {
4884                 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
4885
4886                 combine_nblists(nnbl-1,nbl+1,nbl[0]);
4887
4888                 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
4889             }
4890
4891             if (bFBufferFlag)
4892             {
4893                 reduce_cellblock_flags(nbs,nnbl,gridi,gridj);
4894             }
4895             else
4896             {
4897                 gridi->cellblock_flags.bUse = FALSE;
4898                 gridj->cellblock_flags.bUse = FALSE;
4899             }
4900         }
4901     }
4902
4903     /*
4904     print_supersub_nsp("nsubpair",nbl[0],iloc);
4905     */
4906
4907     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4908     if (LOCAL_I(iloc))
4909     {
4910         nbs->search_count++;
4911     }
4912     if (nbs->print_cycles &&
4913         (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
4914         nbs->search_count % 100 == 0)
4915     {
4916         nbs_cycle_print(stderr,nbs);
4917     }
4918
4919     if (debug && (CombineNBLists && nnbl > 1))
4920     {
4921         if (nbl[0]->bSimple)
4922         {
4923             print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
4924         }
4925         else
4926         {
4927             print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
4928         }
4929     }
4930
4931     if (gmx_debug_at)
4932     {
4933         if (nbl[0]->bSimple)
4934         {
4935             print_nblist_ci_cj(debug,nbl[0]);
4936         }
4937         else
4938         {
4939             print_nblist_sci_cj(debug,nbl[0]);
4940         }
4941
4942         print_reduction_cost(nbs->grid,nbs->ngrid,nnbl);
4943     }
4944 }