Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_search.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  *          GROningen MAchine for Chemical Simulations
9  *
10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12  * Copyright (c) 2001-2012, The GROMACS development team,
13  * check out http://www.gromacs.org for more information.
14
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  *
20  * If you want to redistribute modifications, please consider that
21  * scientific software is very special. Version control is crucial -
22  * bugs must be traceable. We will be happy to consider code for
23  * inclusion in the official distribution, but derived work must not
24  * be called official GROMACS. Details are found in the README & COPYING
25  * files - if they are missing, get the official version at www.gromacs.org.
26  *
27  * To help us fund GROMACS development, we humbly ask that you cite
28  * the papers on the package - you can find them in the top README file.
29  *
30  * For more info, check our website at http://www.gromacs.org
31  */
32
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif
36
37 #include <math.h>
38 #include <string.h>
39 #include "sysstuff.h"
40 #include "smalloc.h"
41 #include "macros.h"
42 #include "maths.h"
43 #include "vec.h"
44 #include "pbc.h"
45 #include "nbnxn_consts.h"
46 #include "nbnxn_internal.h"
47 #include "nbnxn_atomdata.h"
48 #include "nbnxn_search.h"
49 #include "gmx_cyclecounter.h"
50 #include "gmxfio.h"
51 #include "gmx_omp_nthreads.h"
52 #include "nrnb.h"
53
54
55 /* Pair search box lower and upper corner in x,y,z.
56  * Store this in 4 iso 3 reals, which is useful with SSE.
57  * To avoid complicating the code we also use 4 without SSE.
58  */
59 #define NNBSBB_C         4
60 #define NNBSBB_B         (2*NNBSBB_C)
61 /* Pair search box lower and upper bound in z only. */
62 #define NNBSBB_D         2
63 /* Pair search box lower and upper corner x,y,z indices */
64 #define BBL_X  0
65 #define BBL_Y  1
66 #define BBL_Z  2
67 #define BBU_X  4
68 #define BBU_Y  5
69 #define BBU_Z  6
70
71
72 #ifdef NBNXN_SEARCH_SSE
73
74 #ifndef GMX_DOUBLE
75 #define NBNXN_SEARCH_SSE_SINGLE
76 #include "gmx_x86_simd_single.h"
77 #else
78 #include "gmx_x86_simd_double.h"
79 #endif
80
81 #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
82 #define NBNXN_8BB_SSE
83 #endif
84
85 /* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
86  * Here AVX-256 turns out to be slightly slower than AVX-128.
87  */
88 #define STRIDE_8BB        4
89 #define STRIDE_8BB_2LOG   2
90
91
92 /* The functions below are macros as they are performance sensitive */
93
94 /* 4x4 list, pack=4: no complex conversion required */
95 /* i-cluster to j-cluster conversion */
96 #define CI_TO_CJ_J4(ci)   (ci)
97 /* cluster index to coordinate array index conversion */
98 #define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
99 #define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
100
101 /* 4x2 list, pack=4: j-cluster size is half the packing width */
102 /* i-cluster to j-cluster conversion */
103 #define CI_TO_CJ_J2(ci)  ((ci)<<1)
104 /* cluster index to coordinate array index conversion */
105 #define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
106 #define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
107
108 /* 4x8 list, pack=8: i-cluster size is half the packing width */
109 /* i-cluster to j-cluster conversion */
110 #define CI_TO_CJ_J8(ci)  ((ci)>>1)
111 /* cluster index to coordinate array index conversion */
112 #define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
113 #define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
114
115 /* The j-cluster size is matched to the SIMD width */
116 #ifndef GMX_DOUBLE
117 /* 128 bits can hold 4 floats */
118 #define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
119 #define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
120 #define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
121 /* 256 bits can hold 8 floats */
122 #define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
123 #define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
124 #define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
125 #else
126 /* 128 bits can hold 2 doubles */
127 #define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
128 #define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
129 #define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
130 /* 256 bits can hold 4 doubles */
131 #define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
132 #define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
133 #define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
134 #endif
135
136 #endif /* NBNXN_SEARCH_SSE */
137
138
139 /* Interaction masks for 4xN atom interactions.
140  * Bit i*CJ_SIZE + j tells if atom i and j interact.
141  */
142 /* All interaction mask is the same for all kernels */
143 #define NBNXN_INT_MASK_ALL        0xffffffff
144 /* 4x4 kernel diagonal mask */
145 #define NBNXN_INT_MASK_DIAG       0x08ce
146 /* 4x2 kernel diagonal masks */
147 #define NBNXN_INT_MASK_DIAG_J2_0  0x0002
148 #define NBNXN_INT_MASK_DIAG_J2_1  0x002F
149 /* 4x8 kernel diagonal masks */
150 #define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
151 #define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
152
153
154 #ifdef NBNXN_SEARCH_SSE
155 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
156 #define NBNXN_BBXXXX
157 /* Size of bounding box corners quadruplet */
158 #define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
159 #endif
160
161 /* We shift the i-particles backward for PBC.
162  * This leads to more conditionals than shifting forward.
163  * We do this to get more balanced pair lists.
164  */
165 #define NBNXN_SHIFT_BACKWARD
166
167
168 /* This define is a lazy way to avoid interdependence of the grid
169  * and searching data structures.
170  */
171 #define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
172
173
174 static void nbs_cycle_clear(nbnxn_cycle_t *cc)
175 {
176     int i;
177
178     for(i=0; i<enbsCCnr; i++)
179     {
180         cc[i].count = 0;
181         cc[i].c     = 0;
182     }
183 }
184
185 static double Mcyc_av(const nbnxn_cycle_t *cc)
186 {
187     return (double)cc->c*1e-6/cc->count;
188 }
189
190 static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
191 {
192     int n;
193     int t;
194
195     fprintf(fp,"\n");
196     fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
197             nbs->cc[enbsCCgrid].count,
198             Mcyc_av(&nbs->cc[enbsCCgrid]),
199             Mcyc_av(&nbs->cc[enbsCCsearch]),
200             Mcyc_av(&nbs->cc[enbsCCreducef]));
201
202     if (nbs->nthread_max > 1)
203     {
204         if (nbs->cc[enbsCCcombine].count > 0)
205         {
206             fprintf(fp," comb %5.2f",
207                     Mcyc_av(&nbs->cc[enbsCCcombine]));
208         }
209         fprintf(fp," s. th");
210         for(t=0; t<nbs->nthread_max; t++)
211         {
212             fprintf(fp," %4.1f",
213                     Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
214         }
215     }
216     fprintf(fp,"\n");
217 }
218
219 static void nbnxn_grid_init(nbnxn_grid_t * grid)
220 {
221     grid->cxy_na      = NULL;
222     grid->cxy_ind     = NULL;
223     grid->cxy_nalloc  = 0;
224     grid->bb          = NULL;
225     grid->bbj         = NULL;
226     grid->nc_nalloc   = 0;
227 }
228
229 static int get_2log(int n)
230 {
231     int log2;
232
233     log2 = 0;
234     while ((1<<log2) < n)
235     {
236         log2++;
237     }
238     if ((1<<log2) != n)
239     {
240         gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
241     }
242
243     return log2;
244 }
245
246 static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
247 {
248     switch (nb_kernel_type)
249     {
250     case nbk4x4_PlainC:
251     case nbk4xN_X86_SIMD128:
252     case nbk4xN_X86_SIMD256:
253         return NBNXN_CPU_CLUSTER_I_SIZE;
254     case nbk8x8x8_CUDA:
255     case nbk8x8x8_PlainC:
256         /* The cluster size for super/sub lists is only set here.
257          * Any value should work for the pair-search and atomdata code.
258          * The kernels, of course, might require a particular value.
259          */
260         return NBNXN_GPU_CLUSTER_SIZE;
261     default:
262         gmx_incons("unknown kernel type");
263     }
264
265     return 0;
266 }
267
268 int nbnxn_kernel_to_cj_size(int nb_kernel_type)
269 {
270     switch (nb_kernel_type)
271     {
272     case nbk4x4_PlainC:
273         return NBNXN_CPU_CLUSTER_I_SIZE;
274     case nbk4xN_X86_SIMD128:
275         /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
276         return 16/sizeof(real);
277     case nbk4xN_X86_SIMD256:
278         /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
279         return 32/sizeof(real);
280     case nbk8x8x8_CUDA:
281     case nbk8x8x8_PlainC:
282         return nbnxn_kernel_to_ci_size(nb_kernel_type);
283     default:
284         gmx_incons("unknown kernel type");
285     }
286
287     return 0;
288 }
289
290 static int ci_to_cj(int na_cj_2log,int ci)
291 {
292     switch (na_cj_2log)
293     {
294     case 2: return  ci;     break;
295     case 1: return (ci<<1); break;
296     case 3: return (ci>>1); break;
297     }
298
299     return 0;
300 }
301
302 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
303 {
304     if (nb_kernel_type == nbkNotSet)
305     {
306         gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
307     }
308
309     switch (nb_kernel_type)
310     {
311     case nbk8x8x8_CUDA:
312     case nbk8x8x8_PlainC:
313         return FALSE;
314
315     case nbk4x4_PlainC:
316     case nbk4xN_X86_SIMD128:
317     case nbk4xN_X86_SIMD256:
318         return TRUE;
319
320     default:
321         gmx_incons("Invalid nonbonded kernel type passed!");
322         return FALSE;
323     }
324 }
325
326 void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
327                        ivec *n_dd_cells,
328                        gmx_domdec_zones_t *zones,
329                        int nthread_max)
330 {
331     nbnxn_search_t nbs;
332     int d,g,t;
333
334     snew(nbs,1);
335     *nbs_ptr = nbs;
336
337     nbs->DomDec = (n_dd_cells != NULL);
338
339     clear_ivec(nbs->dd_dim);
340     nbs->ngrid = 1;
341     if (nbs->DomDec)
342     {
343         nbs->zones = zones;
344
345         for(d=0; d<DIM; d++)
346         {
347             if ((*n_dd_cells)[d] > 1)
348             {
349                 nbs->dd_dim[d] = 1;
350                 /* Each grid matches a DD zone */
351                 nbs->ngrid *= 2;
352             }
353         }
354     }
355
356     snew(nbs->grid,nbs->ngrid);
357     for(g=0; g<nbs->ngrid; g++)
358     {
359         nbnxn_grid_init(&nbs->grid[g]);
360     }
361     nbs->cell        = NULL;
362     nbs->cell_nalloc = 0;
363     nbs->a           = NULL;
364     nbs->a_nalloc    = 0;
365
366     nbs->nthread_max = nthread_max;
367
368     /* Initialize the work data structures for each thread */
369     snew(nbs->work,nbs->nthread_max);
370     for(t=0; t<nbs->nthread_max; t++)
371     {
372         nbs->work[t].cxy_na           = NULL;
373         nbs->work[t].cxy_na_nalloc    = 0;
374         nbs->work[t].sort_work        = NULL;
375         nbs->work[t].sort_work_nalloc = 0;
376     }
377
378     /* Initialize detailed nbsearch cycle counting */
379     nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
380     nbs->search_count = 0;
381     nbs_cycle_clear(nbs->cc);
382     for(t=0; t<nbs->nthread_max; t++)
383     {
384         nbs_cycle_clear(nbs->work[t].cc);
385     }
386 }
387
388 static real grid_atom_density(int n,rvec corner0,rvec corner1)
389 {
390     rvec size;
391
392     rvec_sub(corner1,corner0,size);
393
394     return n/(size[XX]*size[YY]*size[ZZ]);
395 }
396
397 static int set_grid_size_xy(const nbnxn_search_t nbs,
398                             nbnxn_grid_t *grid,
399                             int n,rvec corner0,rvec corner1,
400                             real atom_density,
401                             int XFormat)
402 {
403     rvec size;
404     int  na_c;
405     real adens,tlen,tlen_x,tlen_y,nc_max;
406     int  t;
407
408     rvec_sub(corner1,corner0,size);
409
410     if (n > grid->na_sc)
411     {
412         /* target cell length */
413         if (grid->bSimple)
414         {
415             /* To minimize the zero interactions, we should make
416              * the largest of the i/j cell cubic.
417              */
418             na_c = max(grid->na_c,grid->na_cj);
419
420             /* Approximately cubic cells */
421             tlen   = pow(na_c/atom_density,1.0/3.0);
422             tlen_x = tlen;
423             tlen_y = tlen;
424         }
425         else
426         {
427             /* Approximately cubic sub cells */
428             tlen   = pow(grid->na_c/atom_density,1.0/3.0);
429             tlen_x = tlen*GPU_NSUBCELL_X;
430             tlen_y = tlen*GPU_NSUBCELL_Y;
431         }
432         /* We round ncx and ncy down, because we get less cell pairs
433          * in the nbsist when the fixed cell dimensions (x,y) are
434          * larger than the variable one (z) than the other way around.
435          */
436         grid->ncx = max(1,(int)(size[XX]/tlen_x));
437         grid->ncy = max(1,(int)(size[YY]/tlen_y));
438     }
439     else
440     {
441         grid->ncx = 1;
442         grid->ncy = 1;
443     }
444
445     /* We need one additional cell entry for particles moved by DD */
446     if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
447     {
448         grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
449         srenew(grid->cxy_na,grid->cxy_nalloc);
450         srenew(grid->cxy_ind,grid->cxy_nalloc+1);
451     }
452     for(t=0; t<nbs->nthread_max; t++)
453     {
454         if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
455         {
456             nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
457             srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
458         }
459     }
460
461     /* Worst case scenario of 1 atom in each last cell */
462     if (grid->na_cj <= grid->na_c)
463     {
464         nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
465     }
466     else
467     {
468         nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
469     }
470
471     if (nc_max > grid->nc_nalloc)
472     {
473         int bb_nalloc;
474
475         grid->nc_nalloc = over_alloc_large(nc_max);
476         srenew(grid->nsubc,grid->nc_nalloc);
477         srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
478 #ifdef NBNXN_8BB_SSE
479         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
480 #else
481         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
482 #endif
483         sfree_aligned(grid->bb);
484         /* This snew also zeros the contents, this avoid possible
485          * floating exceptions in SSE with the unused bb elements.
486          */
487         snew_aligned(grid->bb,bb_nalloc,16);
488
489         if (grid->bSimple)
490         {
491             if (grid->na_cj == grid->na_c)
492             {
493                 grid->bbj = grid->bb;
494             }
495             else
496             {
497                 sfree_aligned(grid->bbj);
498                 snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
499             }
500         }
501
502         srenew(grid->flags,grid->nc_nalloc);
503     }
504
505     copy_rvec(corner0,grid->c0);
506     copy_rvec(corner1,grid->c1);
507     grid->sx = size[XX]/grid->ncx;
508     grid->sy = size[YY]/grid->ncy;
509     grid->inv_sx = 1/grid->sx;
510     grid->inv_sy = 1/grid->sy;
511
512     return nc_max;
513 }
514
515 #define SORT_GRID_OVERSIZE 2
516 #define SGSF (SORT_GRID_OVERSIZE + 1)
517
518 static void sort_atoms(int dim,gmx_bool Backwards,
519                        int *a,int n,rvec *x,
520                        real h0,real invh,int nsort,int *sort)
521 {
522     int i,c;
523     int zi,zim;
524     int cp,tmp;
525
526     if (n <= 1)
527     {
528         /* Nothing to do */
529         return;
530     }
531
532     /* For small oversize factors clearing the whole area is fastest.
533      * For large oversize we should clear the used elements after use.
534      */
535     for(i=0; i<nsort; i++)
536     {
537         sort[i] = -1;
538     }
539     /* Sort the particles using a simple index sort */
540     for(i=0; i<n; i++)
541     {
542         /* The cast takes care of float-point rounding effects below zero.
543          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
544          * times the box height out of the box.
545          */
546         zi = (int)((x[a[i]][dim] - h0)*invh);
547
548 #ifdef DEBUG_NBNXN_GRIDDING
549         if (zi < 0 || zi >= nsort)
550         {
551             gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
552                       a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
553         }
554 #endif
555
556         /* Ideally this particle should go in sort cell zi,
557          * but that might already be in use,
558          * in that case find the first empty cell higher up
559          */
560         if (sort[zi] < 0)
561         {
562             sort[zi] = a[i];
563         }
564         else
565         {
566             /* We have multiple atoms in the same sorting slot.
567              * Sort on real z for minimal bounding box size.
568              * There is an extra check for identical z to ensure
569              * well-defined output order, independent of input order
570              * to ensure binary reproducibility after restarts.
571              */
572             while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
573                                     (x[a[i]][dim] == x[sort[zi]][dim] &&
574                                      a[i] > sort[zi])))
575             {
576                 zi++;
577             }
578
579             if (sort[zi] >= 0)
580             {
581                 /* Shift all elements by one slot until we find an empty slot */
582                 cp = sort[zi];
583                 zim = zi + 1;
584                 while (sort[zim] >= 0)
585                 {
586                     tmp = sort[zim];
587                     sort[zim] = cp;
588                     cp  = tmp;
589                     zim++;
590                 }
591                 sort[zim] = cp;
592             }
593             sort[zi] = a[i];
594         }
595     }
596
597     c = 0;
598     if (!Backwards)
599     {
600         for(zi=0; zi<nsort; zi++)
601         {
602             if (sort[zi] >= 0)
603             {
604                 a[c++] = sort[zi];
605             }
606         }
607     }
608     else
609     {
610         for(zi=nsort-1; zi>=0; zi--)
611         {
612             if (sort[zi] >= 0)
613             {
614                 a[c++] = sort[zi];
615             }
616         }
617     }
618     if (c < n)
619     {
620         gmx_incons("Lost particles while sorting");
621     }
622 }
623
624 #ifdef GMX_DOUBLE
625 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
626 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
627 #else
628 #define R2F_D(x) (x)
629 #define R2F_U(x) (x)
630 #endif
631
632 /* Coordinate order x,y,z, bb order xyz0 */
633 static void calc_bounding_box(int na,int stride,const real *x,float *bb)
634 {
635     int  i,j;
636     real xl,xh,yl,yh,zl,zh;
637
638     i = 0;
639     xl = x[i+XX];
640     xh = x[i+XX];
641     yl = x[i+YY];
642     yh = x[i+YY];
643     zl = x[i+ZZ];
644     zh = x[i+ZZ];
645     i += stride;
646     for(j=1; j<na; j++)
647     {
648         xl = min(xl,x[i+XX]);
649         xh = max(xh,x[i+XX]);
650         yl = min(yl,x[i+YY]);
651         yh = max(yh,x[i+YY]);
652         zl = min(zl,x[i+ZZ]);
653         zh = max(zh,x[i+ZZ]);
654         i += stride;
655     }
656     /* Note: possible double to float conversion here */
657     bb[BBL_X] = R2F_D(xl);
658     bb[BBL_Y] = R2F_D(yl);
659     bb[BBL_Z] = R2F_D(zl);
660     bb[BBU_X] = R2F_U(xh);
661     bb[BBU_Y] = R2F_U(yh);
662     bb[BBU_Z] = R2F_U(zh);
663 }
664
665 /* Packed coordinates, bb order xyz0 */
666 static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
667 {
668     int  j;
669     real xl,xh,yl,yh,zl,zh;
670
671     xl = x[XX*PACK_X4];
672     xh = x[XX*PACK_X4];
673     yl = x[YY*PACK_X4];
674     yh = x[YY*PACK_X4];
675     zl = x[ZZ*PACK_X4];
676     zh = x[ZZ*PACK_X4];
677     for(j=1; j<na; j++)
678     {
679         xl = min(xl,x[j+XX*PACK_X4]);
680         xh = max(xh,x[j+XX*PACK_X4]);
681         yl = min(yl,x[j+YY*PACK_X4]);
682         yh = max(yh,x[j+YY*PACK_X4]);
683         zl = min(zl,x[j+ZZ*PACK_X4]);
684         zh = max(zh,x[j+ZZ*PACK_X4]);
685     }
686     /* Note: possible double to float conversion here */
687     bb[BBL_X] = R2F_D(xl);
688     bb[BBL_Y] = R2F_D(yl);
689     bb[BBL_Z] = R2F_D(zl);
690     bb[BBU_X] = R2F_U(xh);
691     bb[BBU_Y] = R2F_U(yh);
692     bb[BBU_Z] = R2F_U(zh);
693 }
694
695 /* Packed coordinates, bb order xyz0 */
696 static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
697 {
698     int  j;
699     real xl,xh,yl,yh,zl,zh;
700
701     xl = x[XX*PACK_X8];
702     xh = x[XX*PACK_X8];
703     yl = x[YY*PACK_X8];
704     yh = x[YY*PACK_X8];
705     zl = x[ZZ*PACK_X8];
706     zh = x[ZZ*PACK_X8];
707     for(j=1; j<na; j++)
708     {
709         xl = min(xl,x[j+XX*PACK_X8]);
710         xh = max(xh,x[j+XX*PACK_X8]);
711         yl = min(yl,x[j+YY*PACK_X8]);
712         yh = max(yh,x[j+YY*PACK_X8]);
713         zl = min(zl,x[j+ZZ*PACK_X8]);
714         zh = max(zh,x[j+ZZ*PACK_X8]);
715     }
716     /* Note: possible double to float conversion here */
717     bb[BBL_X] = R2F_D(xl);
718     bb[BBL_Y] = R2F_D(yl);
719     bb[BBL_Z] = R2F_D(zl);
720     bb[BBU_X] = R2F_U(xh);
721     bb[BBU_Y] = R2F_U(yh);
722     bb[BBU_Z] = R2F_U(zh);
723 }
724
725 #ifdef NBNXN_SEARCH_SSE
726
727 /* Packed coordinates, bb order xyz0 */
728 static void calc_bounding_box_x_x4_halves(int na,const real *x,
729                                           float *bb,float *bbj)
730 {
731     calc_bounding_box_x_x4(min(na,2),x,bbj);
732
733     if (na > 2)
734     {
735         calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
736     }
737     else
738     {
739         /* Set the "empty" bounding box to the same as the first one,
740          * so we don't need to treat special cases in the rest of the code.
741          */
742         _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
743         _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
744     }
745
746     _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
747                                         _mm_load_ps(bbj+NNBSBB_B)));
748     _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
749                                         _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
750 }
751
752 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
753 static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
754 {
755     int  i,j;
756     real xl,xh,yl,yh,zl,zh;
757
758     i = 0;
759     xl = x[i+XX];
760     xh = x[i+XX];
761     yl = x[i+YY];
762     yh = x[i+YY];
763     zl = x[i+ZZ];
764     zh = x[i+ZZ];
765     i += stride;
766     for(j=1; j<na; j++)
767     {
768         xl = min(xl,x[i+XX]);
769         xh = max(xh,x[i+XX]);
770         yl = min(yl,x[i+YY]);
771         yh = max(yh,x[i+YY]);
772         zl = min(zl,x[i+ZZ]);
773         zh = max(zh,x[i+ZZ]);
774         i += stride;
775     }
776     /* Note: possible double to float conversion here */
777     bb[0*STRIDE_8BB] = R2F_D(xl);
778     bb[1*STRIDE_8BB] = R2F_D(yl);
779     bb[2*STRIDE_8BB] = R2F_D(zl);
780     bb[3*STRIDE_8BB] = R2F_U(xh);
781     bb[4*STRIDE_8BB] = R2F_U(yh);
782     bb[5*STRIDE_8BB] = R2F_U(zh);
783 }
784
785 #endif /* NBNXN_SEARCH_SSE */
786
787 #ifdef NBNXN_SEARCH_SSE_SINGLE
788
789 /* Coordinate order xyz?, bb order xyz0 */
790 static void calc_bounding_box_sse(int na,const float *x,float *bb)
791 {
792     __m128 bb_0_SSE,bb_1_SSE;
793     __m128 x_SSE;
794
795     int  i;
796
797     bb_0_SSE = _mm_load_ps(x);
798     bb_1_SSE = bb_0_SSE;
799
800     for(i=1; i<na; i++)
801     {
802         x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
803         bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
804         bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
805     }
806
807     _mm_store_ps(bb  ,bb_0_SSE);
808     _mm_store_ps(bb+4,bb_1_SSE);
809 }
810
811 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
812 static void calc_bounding_box_xxxx_sse(int na,const float *x,
813                                        float *bb_work,
814                                        real *bb)
815 {
816     calc_bounding_box_sse(na,x,bb_work);
817
818     bb[0*STRIDE_8BB] = bb_work[BBL_X];
819     bb[1*STRIDE_8BB] = bb_work[BBL_Y];
820     bb[2*STRIDE_8BB] = bb_work[BBL_Z];
821     bb[3*STRIDE_8BB] = bb_work[BBU_X];
822     bb[4*STRIDE_8BB] = bb_work[BBU_Y];
823     bb[5*STRIDE_8BB] = bb_work[BBU_Z];
824 }
825
826 #endif /* NBNXN_SEARCH_SSE_SINGLE */
827
828 #ifdef NBNXN_SEARCH_SSE
829
830 /* Combines pairs of consecutive bounding boxes */
831 static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
832 {
833     int    i,j,sc2,nc2,c2;
834     __m128 min_SSE,max_SSE;
835
836     for(i=0; i<grid->ncx*grid->ncy; i++)
837     {
838         /* Starting bb in a column is expected to be 2-aligned */
839         sc2 = grid->cxy_ind[i]>>1;
840         /* For odd numbers skip the last bb here */
841         nc2 = (grid->cxy_na[i]+3)>>(2+1);
842         for(c2=sc2; c2<sc2+nc2; c2++)
843         {
844             min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
845                                  _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
846             max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
847                                  _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
848             _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
849             _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
850         }
851         if (((grid->cxy_na[i]+3)>>2) & 1)
852         {
853             /* Copy the last bb for odd bb count in this column */
854             for(j=0; j<NNBSBB_C; j++)
855             {
856                 grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
857                 grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
858             }
859         }
860     }
861 }
862
863 #endif
864
865
866 /* Prints the average bb size, used for debug output */
867 static void print_bbsizes_simple(FILE *fp,
868                                  const nbnxn_search_t nbs,
869                                  const nbnxn_grid_t *grid)
870 {
871     int  c,d;
872     dvec ba;
873
874     clear_dvec(ba);
875     for(c=0; c<grid->nc; c++)
876     {
877         for(d=0; d<DIM; d++)
878         {
879             ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
880         }
881     }
882     dsvmul(1.0/grid->nc,ba,ba);
883
884     fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
885             nbs->box[XX][XX]/grid->ncx,
886             nbs->box[YY][YY]/grid->ncy,
887             nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
888             ba[XX],ba[YY],ba[ZZ],
889             ba[XX]*grid->ncx/nbs->box[XX][XX],
890             ba[YY]*grid->ncy/nbs->box[YY][YY],
891             ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
892 }
893
894 /* Prints the average bb size, used for debug output */
895 static void print_bbsizes_supersub(FILE *fp,
896                                    const nbnxn_search_t nbs,
897                                    const nbnxn_grid_t *grid)
898 {
899     int  ns,c,s;
900     dvec ba;
901
902     clear_dvec(ba);
903     ns = 0;
904     for(c=0; c<grid->nc; c++)
905     {
906 #ifdef NBNXN_BBXXXX
907         for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
908         {
909             int cs_w,i,d;
910
911             cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
912             for(i=0; i<STRIDE_8BB; i++)
913             {
914                 for(d=0; d<DIM; d++)
915                 {
916                     ba[d] +=
917                         grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
918                         grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
919                 }
920             }
921         }
922 #else
923         for(s=0; s<grid->nsubc[c]; s++)
924         {
925             int cs,d;
926
927             cs = c*GPU_NSUBCELL + s;
928             for(d=0; d<DIM; d++)
929             {
930                 ba[d] +=
931                     grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
932                     grid->bb[cs*NNBSBB_B         +d];
933             }
934         }
935 #endif
936         ns += grid->nsubc[c];
937     }
938     dsvmul(1.0/ns,ba,ba);
939
940     fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
941             nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
942             nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
943             nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
944             ba[XX],ba[YY],ba[ZZ],
945             ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
946             ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
947             ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
948 }
949
950 /* Potentially sorts atoms on LJ coefficients !=0 and ==0.
951  * Also sets interaction flags.
952  */
953 void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
954                 int a0,int a1,const int *atinfo,
955                 int *order,
956                 int *flags)
957 {
958     int subc,s,a,n1,n2,a_lj_max,i,j;
959     int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
960     int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
961     gmx_bool haveQ;
962
963     *flags = 0;
964
965     subc = 0;
966     for(s=a0; s<a1; s+=na_c)
967     {
968         /* Make lists for this (sub-)cell on atoms with and without LJ */
969         n1 = 0;
970         n2 = 0;
971         haveQ = FALSE;
972         a_lj_max = -1;
973         for(a=s; a<min(s+na_c,a1); a++)
974         {
975             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
976
977             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
978             {
979                 sort1[n1++] = order[a];
980                 a_lj_max = a;
981             }
982             else
983             {
984                 sort2[n2++] = order[a];
985             }
986         }
987
988         /* If we don't have atom with LJ, there's nothing to sort */
989         if (n1 > 0)
990         {
991             *flags |= NBNXN_CI_DO_LJ(subc);
992
993             if (2*n1 <= na_c)
994             {
995                 /* Only sort when strictly necessary. Ordering particles
996                  * Ordering particles can lead to less accurate summation
997                  * due to rounding, both for LJ and Coulomb interactions.
998                  */
999                 if (2*(a_lj_max - s) >= na_c)
1000                 {
1001                     for(i=0; i<n1; i++)
1002                     {
1003                         order[a0+i] = sort1[i];
1004                     }
1005                     for(j=0; j<n2; j++)
1006                     {
1007                         order[a0+n1+j] = sort2[j];
1008                     }
1009                 }
1010
1011                 *flags |= NBNXN_CI_HALF_LJ(subc);
1012             }
1013         }
1014         if (haveQ)
1015         {
1016             *flags |= NBNXN_CI_DO_COUL(subc);
1017         }
1018         subc++;
1019     }
1020 }
1021
1022 /* Fill a pair search cell with atoms.
1023  * Potentially sorts atoms and sets the interaction flags.
1024  */
1025 void fill_cell(const nbnxn_search_t nbs,
1026                nbnxn_grid_t *grid,
1027                nbnxn_atomdata_t *nbat,
1028                int a0,int a1,
1029                const int *atinfo,
1030                rvec *x,
1031                int sx,int sy, int sz,
1032                float *bb_work)
1033 {
1034     int    na,a;
1035     size_t offset;
1036     float  *bb_ptr;
1037
1038     na = a1 - a0;
1039
1040     if (grid->bSimple)
1041     {
1042         sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
1043                    grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
1044     }
1045
1046     /* Now we have sorted the atoms, set the cell indices */
1047     for(a=a0; a<a1; a++)
1048     {
1049         nbs->cell[nbs->a[a]] = a;
1050     }
1051
1052     copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
1053                            nbat->XFormat,nbat->x,a0,
1054                            sx,sy,sz);
1055
1056     if (nbat->XFormat == nbatX4)
1057     {
1058         /* Store the bounding boxes as xyz.xyz. */
1059         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1060         bb_ptr = grid->bb + offset;
1061
1062 #if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
1063         if (2*grid->na_cj == grid->na_c)
1064         {
1065             calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
1066                                           grid->bbj+offset*2);
1067         }
1068         else
1069 #endif
1070         {
1071             calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
1072         }
1073     }
1074     else if (nbat->XFormat == nbatX8)
1075     {
1076         /* Store the bounding boxes as xyz.xyz. */
1077         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1078         bb_ptr = grid->bb + offset;
1079
1080         calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
1081     }
1082 #ifdef NBNXN_BBXXXX
1083     else if (!grid->bSimple)
1084     {
1085         /* Store the bounding boxes in a format convenient
1086          * for SSE calculations: xxxxyyyyzzzz...
1087                              */
1088         bb_ptr =
1089             grid->bb +
1090             ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
1091             (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
1092
1093 #ifdef NBNXN_SEARCH_SSE_SINGLE
1094         if (nbat->XFormat == nbatXYZQ)
1095         {
1096             calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
1097                                        bb_work,bb_ptr);
1098         }
1099         else
1100 #endif
1101         {
1102             calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
1103                                    bb_ptr);
1104         }
1105         if (gmx_debug_at)
1106         {
1107             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1108                     sx,sy,sz,
1109                     bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
1110                     bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
1111                     bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
1112         }
1113     }
1114 #endif
1115     else
1116     {
1117         /* Store the bounding boxes as xyz.xyz. */
1118         bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1119
1120         calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
1121                           bb_ptr);
1122
1123         if (gmx_debug_at)
1124         {
1125             int bbo;
1126             bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
1127             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1128                     sx,sy,sz,
1129                     (grid->bb+bbo*NNBSBB_B)[BBL_X],
1130                     (grid->bb+bbo*NNBSBB_B)[BBU_X],
1131                     (grid->bb+bbo*NNBSBB_B)[BBL_Y],
1132                     (grid->bb+bbo*NNBSBB_B)[BBU_Y],
1133                     (grid->bb+bbo*NNBSBB_B)[BBL_Z],
1134                     (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
1135         }
1136     }
1137 }
1138
1139 /* Spatially sort the atoms within one grid column */
1140 static void sort_columns_simple(const nbnxn_search_t nbs,
1141                                 int dd_zone,
1142                                 nbnxn_grid_t *grid,
1143                                 int a0,int a1,
1144                                 const int *atinfo,
1145                                 rvec *x,
1146                                 nbnxn_atomdata_t *nbat,
1147                                 int cxy_start,int cxy_end,
1148                                 int *sort_work)
1149 {
1150     int  cxy;
1151     int  cx,cy,cz,ncz,cfilled,c;
1152     int  na,ash,ind,a;
1153     int  na_c,ash_c;
1154
1155     if (debug)
1156     {
1157         fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1158                 grid->cell0,cxy_start,cxy_end,a0,a1);
1159     }
1160
1161     /* Sort the atoms within each x,y column in 3 dimensions */
1162     for(cxy=cxy_start; cxy<cxy_end; cxy++)
1163     {
1164         cx = cxy/grid->ncy;
1165         cy = cxy - cx*grid->ncy;
1166
1167         na  = grid->cxy_na[cxy];
1168         ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1169         ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1170
1171         /* Sort the atoms within each x,y column on z coordinate */
1172         sort_atoms(ZZ,FALSE,
1173                    nbs->a+ash,na,x,
1174                    grid->c0[ZZ],
1175                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
1176                    ncz*grid->na_sc*SGSF,sort_work);
1177
1178         /* Fill the ncz cells in this column */
1179         cfilled = grid->cxy_ind[cxy];
1180         for(cz=0; cz<ncz; cz++)
1181         {
1182             c  = grid->cxy_ind[cxy] + cz ;
1183
1184             ash_c = ash + cz*grid->na_sc;
1185             na_c  = min(grid->na_sc,na-(ash_c-ash));
1186
1187             fill_cell(nbs,grid,nbat,
1188                       ash_c,ash_c+na_c,atinfo,x,
1189                       grid->na_sc*cx + (dd_zone >> 2),
1190                       grid->na_sc*cy + (dd_zone & 3),
1191                       grid->na_sc*cz,
1192                       NULL);
1193
1194             /* This copy to bbcz is not really necessary.
1195              * But it allows to use the same grid search code
1196              * for the simple and supersub cell setups.
1197              */
1198             if (na_c > 0)
1199             {
1200                 cfilled = c;
1201             }
1202             grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
1203             grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
1204         }
1205
1206         /* Set the unused atom indices to -1 */
1207         for(ind=na; ind<ncz*grid->na_sc; ind++)
1208         {
1209             nbs->a[ash+ind] = -1;
1210         }
1211     }
1212 }
1213
1214 /* Spatially sort the atoms within one grid column */
1215 static void sort_columns_supersub(const nbnxn_search_t nbs,
1216                                   int dd_zone,
1217                                   nbnxn_grid_t *grid,
1218                                   int a0,int a1,
1219                                   const int *atinfo,
1220                                   rvec *x,
1221                                   nbnxn_atomdata_t *nbat,
1222                                   int cxy_start,int cxy_end,
1223                                   int *sort_work)
1224 {
1225     int  cxy;
1226     int  cx,cy,cz=-1,c=-1,ncz;
1227     int  na,ash,na_c,ind,a;
1228     int  subdiv_z,sub_z,na_z,ash_z;
1229     int  subdiv_y,sub_y,na_y,ash_y;
1230     int  subdiv_x,sub_x,na_x,ash_x;
1231
1232     /* cppcheck-suppress unassignedVariable */
1233     float bb_work_array[NNBSBB_B+3],*bb_work_align;
1234
1235     bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
1236
1237     if (debug)
1238     {
1239         fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1240                 grid->cell0,cxy_start,cxy_end,a0,a1);
1241     }
1242
1243     subdiv_x = grid->na_c;
1244     subdiv_y = GPU_NSUBCELL_X*subdiv_x;
1245     subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
1246
1247     /* Sort the atoms within each x,y column in 3 dimensions */
1248     for(cxy=cxy_start; cxy<cxy_end; cxy++)
1249     {
1250         cx = cxy/grid->ncy;
1251         cy = cxy - cx*grid->ncy;
1252
1253         na  = grid->cxy_na[cxy];
1254         ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1255         ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1256
1257         /* Sort the atoms within each x,y column on z coordinate */
1258         sort_atoms(ZZ,FALSE,
1259                    nbs->a+ash,na,x,
1260                    grid->c0[ZZ],
1261                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
1262                    ncz*grid->na_sc*SGSF,sort_work);
1263
1264         /* This loop goes over the supercells and subcells along z at once */
1265         for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
1266         {
1267             ash_z = ash + sub_z*subdiv_z;
1268             na_z  = min(subdiv_z,na-(ash_z-ash));
1269
1270             /* We have already sorted on z */
1271
1272             if (sub_z % GPU_NSUBCELL_Z == 0)
1273             {
1274                 cz = sub_z/GPU_NSUBCELL_Z;
1275                 c  = grid->cxy_ind[cxy] + cz ;
1276
1277                 /* The number of atoms in this supercell */
1278                 na_c = min(grid->na_sc,na-(ash_z-ash));
1279
1280                 grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);
1281
1282                 /* Store the z-boundaries of the super cell */
1283                 grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1284                 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
1285             }
1286
1287 #if GPU_NSUBCELL_Y > 1
1288             /* Sort the atoms along y */
1289             sort_atoms(YY,(sub_z & 1),
1290                        nbs->a+ash_z,na_z,x,
1291                        grid->c0[YY]+cy*grid->sy,grid->inv_sy,
1292                        subdiv_y*SGSF,sort_work);
1293 #endif
1294
1295             for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
1296             {
1297                 ash_y = ash_z + sub_y*subdiv_y;
1298                 na_y  = min(subdiv_y,na-(ash_y-ash));
1299
1300 #if GPU_NSUBCELL_X > 1
1301                 /* Sort the atoms along x */
1302                 sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
1303                            nbs->a+ash_y,na_y,x,
1304                            grid->c0[XX]+cx*grid->sx,grid->inv_sx,
1305                            subdiv_x*SGSF,sort_work);
1306 #endif
1307
1308                 for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
1309                 {
1310                     ash_x = ash_y + sub_x*subdiv_x;
1311                     na_x  = min(subdiv_x,na-(ash_x-ash));
1312
1313                     fill_cell(nbs,grid,nbat,
1314                               ash_x,ash_x+na_x,atinfo,x,
1315                               grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
1316                               grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
1317                               grid->na_c*sub_z,
1318                               bb_work_align);
1319                 }
1320             }
1321         }
1322
1323         /* Set the unused atom indices to -1 */
1324         for(ind=na; ind<ncz*grid->na_sc; ind++)
1325         {
1326             nbs->a[ash+ind] = -1;
1327         }
1328     }
1329 }
1330
1331 /* Determine in which grid column atoms should go */
1332 static void calc_column_indices(nbnxn_grid_t *grid,
1333                                 int a0,int a1,
1334                                 rvec *x,const int *move,
1335                                 int thread,int nthread,
1336                                 int *cell,
1337                                 int *cxy_na)
1338 {
1339     int  n0,n1,i;
1340     int  cx,cy;
1341
1342     /* We add one extra cell for particles which moved during DD */
1343     for(i=0; i<grid->ncx*grid->ncy+1; i++)
1344     {
1345         cxy_na[i] = 0;
1346     }
1347
1348     n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
1349     n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
1350     for(i=n0; i<n1; i++)
1351     {
1352         if (move == NULL || move[i] >= 0)
1353         {
1354             /* We need to be careful with rounding,
1355              * particles might be a few bits outside the local box.
1356              * The int cast takes care of the lower bound,
1357              * we need to explicitly take care of the upper bound.
1358              */
1359             cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1360             if (cx == grid->ncx)
1361             {
1362                 cx = grid->ncx - 1;
1363             }
1364             cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1365             if (cy == grid->ncy)
1366             {
1367                 cy = grid->ncy - 1;
1368             }
1369             /* For the moment cell contains only the, grid local,
1370              * x and y indices, not z.
1371              */
1372             cell[i] = cx*grid->ncy + cy;
1373
1374 #ifdef DEBUG_NBNXN_GRIDDING
1375             if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
1376             {
1377                 gmx_fatal(FARGS,
1378                           "grid cell cx %d cy %d out of range (max %d %d)\n"
1379                           "atom %f %f %f, grid->c0 %f %f",
1380                           cx,cy,grid->ncx,grid->ncy,
1381                           x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
1382             }
1383 #endif
1384         }
1385         else
1386         {
1387             /* Put this moved particle after the end of the grid,
1388              * so we can process it later without using conditionals.
1389              */
1390             cell[i] = grid->ncx*grid->ncy;
1391         }
1392
1393         cxy_na[cell[i]]++;
1394     }
1395 }
1396
1397 /* Determine in which grid cells the atoms should go */
1398 static void calc_cell_indices(const nbnxn_search_t nbs,
1399                               int dd_zone,
1400                               nbnxn_grid_t *grid,
1401                               int a0,int a1,
1402                               const int *atinfo,
1403                               rvec *x,
1404                               const int *move,
1405                               nbnxn_atomdata_t *nbat)
1406 {
1407     int  n0,n1,i;
1408     int  cx,cy,cxy,ncz_max,ncz;
1409     int  nthread,thread;
1410     int  *cxy_na,cxy_na_i;
1411
1412     nthread = gmx_omp_nthreads_get(emntPairsearch);
1413
1414 #pragma omp parallel for num_threads(nthread) schedule(static)
1415     for(thread=0; thread<nthread; thread++)
1416     {
1417         calc_column_indices(grid,a0,a1,x,move,thread,nthread,
1418                             nbs->cell,nbs->work[thread].cxy_na);
1419     }
1420
1421     /* Make the cell index as a function of x and y */
1422     ncz_max = 0;
1423     ncz = 0;
1424     grid->cxy_ind[0] = 0;
1425     for(i=0; i<grid->ncx*grid->ncy+1; i++)
1426     {
1427         /* We set ncz_max at the beginning of the loop iso at the end
1428          * to skip i=grid->ncx*grid->ncy which are moved particles
1429          * that do not need to be ordered on the grid.
1430          */
1431         if (ncz > ncz_max)
1432         {
1433             ncz_max = ncz;
1434         }
1435         cxy_na_i = nbs->work[0].cxy_na[i];
1436         for(thread=1; thread<nthread; thread++)
1437         {
1438             cxy_na_i += nbs->work[thread].cxy_na[i];
1439         }
1440         ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1441         if (nbat->XFormat == nbatX8)
1442         {
1443             /* Make the number of cell a multiple of 2 */
1444             ncz = (ncz + 1) & ~1;
1445         }
1446         grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1447         /* Clear cxy_na, so we can reuse the array below */
1448         grid->cxy_na[i] = 0;
1449     }
1450     grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1451
1452     nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1453
1454     if (debug)
1455     {
1456         fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1457                 grid->na_sc,grid->na_c,grid->nc,
1458                 grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
1459                 ncz_max);
1460         if (gmx_debug_at)
1461         {
1462             i = 0;
1463             for(cy=0; cy<grid->ncy; cy++)
1464             {
1465                 for(cx=0; cx<grid->ncx; cx++)
1466                 {
1467                     fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1468                     i++;
1469                 }
1470                 fprintf(debug,"\n");
1471             }
1472         }
1473     }
1474
1475     /* Make sure the work array for sorting is large enough */
1476     if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1477     {
1478         for(thread=0; thread<nbs->nthread_max; thread++)
1479         {
1480             nbs->work[thread].sort_work_nalloc =
1481                 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1482             srenew(nbs->work[thread].sort_work,
1483                    nbs->work[thread].sort_work_nalloc);
1484         }
1485     }
1486
1487     /* Now we know the dimensions we can fill the grid.
1488      * This is the first, unsorted fill. We sort the columns after this.
1489      */
1490     for(i=a0; i<a1; i++)
1491     {
1492         /* At this point nbs->cell contains the local grid x,y indices */
1493         cxy = nbs->cell[i];
1494         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1495     }
1496
1497     /* Set the cell indices for the moved particles */
1498     n0 = grid->nc*grid->na_sc;
1499     n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1500     for(i=n0; i<n1; i++)
1501     {
1502         nbs->cell[nbs->a[i]] = i;
1503     }
1504
1505     /* Sort the super-cell columns along z into the sub-cells. */
1506 #pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
1507     for(thread=0; thread<nbs->nthread_max; thread++)
1508     {
1509         if (grid->bSimple)
1510         {
1511             sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
1512                                 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1513                                 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1514                                 nbs->work[thread].sort_work);
1515         }
1516         else
1517         {
1518             sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
1519                                   ((thread+0)*grid->ncx*grid->ncy)/nthread,
1520                                   ((thread+1)*grid->ncx*grid->ncy)/nthread,
1521                                   nbs->work[thread].sort_work);
1522         }
1523     }
1524
1525 #ifdef NBNXN_SEARCH_SSE
1526     if (grid->bSimple && nbat->XFormat == nbatX8)
1527     {
1528         combine_bounding_box_pairs(grid,grid->bb);
1529     }
1530 #endif
1531
1532     if (!grid->bSimple)
1533     {
1534         grid->nsubc_tot = 0;
1535         for(i=0; i<grid->nc; i++)
1536         {
1537             grid->nsubc_tot += grid->nsubc[i];
1538         }
1539     }
1540
1541     if (debug)
1542     {
1543         if (grid->bSimple)
1544         {
1545             print_bbsizes_simple(debug,nbs,grid);
1546         }
1547         else
1548         {
1549             fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
1550                     grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);
1551
1552             print_bbsizes_supersub(debug,nbs,grid);
1553         }
1554     }
1555 }
1556
1557 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
1558                               int natoms)
1559 {
1560     int b;
1561
1562     flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
1563     if (flags->nflag > flags->flag_nalloc)
1564     {
1565         flags->flag_nalloc = over_alloc_large(flags->nflag);
1566         srenew(flags->flag,flags->flag_nalloc);
1567     }
1568     for(b=0; b<flags->nflag; b++)
1569     {
1570         flags->flag[b] = 0;
1571     }
1572 }
1573
1574 /* Sets up a grid and puts the atoms on the grid.
1575  * This function only operates on one domain of the domain decompostion.
1576  * Note that without domain decomposition there is only one domain.
1577  */
1578 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1579                        int ePBC,matrix box,
1580                        int dd_zone,
1581                        rvec corner0,rvec corner1,
1582                        int a0,int a1,
1583                        real atom_density,
1584                        const int *atinfo,
1585                        rvec *x,
1586                        int nmoved,int *move,
1587                        int nb_kernel_type,
1588                        nbnxn_atomdata_t *nbat)
1589 {
1590     nbnxn_grid_t *grid;
1591     int n;
1592     int nc_max_grid,nc_max;
1593
1594     grid = &nbs->grid[dd_zone];
1595
1596     nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1597
1598     grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1599
1600     grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
1601     grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
1602     grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
1603     grid->na_c_2log = get_2log(grid->na_c);
1604
1605     nbat->na_c = grid->na_c;
1606
1607     if (dd_zone == 0)
1608     {
1609         grid->cell0 = 0;
1610     }
1611     else
1612     {
1613         grid->cell0 =
1614             (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1615             nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1616     }
1617
1618     n = a1 - a0;
1619
1620     if (dd_zone == 0)
1621     {
1622         nbs->ePBC = ePBC;
1623         copy_mat(box,nbs->box);
1624
1625         if (atom_density >= 0)
1626         {
1627             grid->atom_density = atom_density;
1628         }
1629         else
1630         {
1631             grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
1632         }
1633
1634         grid->cell0 = 0;
1635
1636         nbs->natoms_local    = a1 - nmoved;
1637         /* We assume that nbnxn_put_on_grid is called first
1638          * for the local atoms (dd_zone=0).
1639          */
1640         nbs->natoms_nonlocal = a1 - nmoved;
1641     }
1642     else
1643     {
1644         nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
1645     }
1646
1647     nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
1648                                    nbs->grid[0].atom_density,
1649                                    nbat->XFormat);
1650
1651     nc_max = grid->cell0 + nc_max_grid;
1652
1653     if (a1 > nbs->cell_nalloc)
1654     {
1655         nbs->cell_nalloc = over_alloc_large(a1);
1656         srenew(nbs->cell,nbs->cell_nalloc);
1657     }
1658
1659     /* To avoid conditionals we store the moved particles at the end of a,
1660      * make sure we have enough space.
1661      */
1662     if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1663     {
1664         nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1665         srenew(nbs->a,nbs->a_nalloc);
1666     }
1667
1668     /* We need padding up to a multiple of the buffer flag size: simply add */
1669     if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1670     {
1671         nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
1672     }
1673
1674     calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);
1675
1676     if (dd_zone == 0)
1677     {
1678         nbat->natoms_local = nbat->natoms;
1679     }
1680
1681     nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1682 }
1683
1684 /* Calls nbnxn_put_on_grid for all non-local domains */
1685 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1686                                 const gmx_domdec_zones_t *zones,
1687                                 const int *atinfo,
1688                                 rvec *x,
1689                                 int nb_kernel_type,
1690                                 nbnxn_atomdata_t *nbat)
1691 {
1692     int  zone,d;
1693     rvec c0,c1;
1694
1695     for(zone=1; zone<zones->n; zone++)
1696     {
1697         for(d=0; d<DIM; d++)
1698         {
1699             c0[d] = zones->size[zone].bb_x0[d];
1700             c1[d] = zones->size[zone].bb_x1[d];
1701         }
1702
1703         nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
1704                           zone,c0,c1,
1705                           zones->cg_range[zone],
1706                           zones->cg_range[zone+1],
1707                           -1,
1708                           atinfo,
1709                           x,
1710                           0,NULL,
1711                           nb_kernel_type,
1712                           nbat);
1713     }
1714 }
1715
1716 /* Add simple grid type information to the local super/sub grid */
1717 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1718                            nbnxn_atomdata_t *nbat)
1719 {
1720     nbnxn_grid_t *grid;
1721     float *bbcz,*bb;
1722     int ncd,sc;
1723
1724     grid = &nbs->grid[0];
1725
1726     if (grid->bSimple)
1727     {
1728         gmx_incons("nbnxn_grid_simple called with a simple grid");
1729     }
1730
1731     ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1732
1733     if (grid->nc*ncd > grid->nc_nalloc_simple)
1734     {
1735         grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1736         srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
1737         srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
1738         srenew(grid->flags_simple,grid->nc_nalloc_simple);
1739         if (nbat->XFormat)
1740         {
1741             sfree_aligned(grid->bbj);
1742             snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
1743         }
1744     }
1745
1746     bbcz = grid->bbcz_simple;
1747     bb   = grid->bb_simple;
1748
1749 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
1750     for(sc=0; sc<grid->nc; sc++)
1751     {
1752         int c,tx,na;
1753
1754         for(c=0; c<ncd; c++)
1755         {
1756             tx = sc*ncd + c;
1757
1758             na = NBNXN_CPU_CLUSTER_I_SIZE;
1759             while (na > 0 &&
1760                    nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1761             {
1762                 na--;
1763             }
1764
1765             if (na > 0)
1766             {
1767                 switch (nbat->XFormat)
1768                 {
1769                 case nbatX4:
1770                     /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1771                     calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
1772                                            bb+tx*NNBSBB_B);
1773                     break;
1774                 case nbatX8:
1775                     /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1776                     calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1777                                            bb+tx*NNBSBB_B);
1778                     break;
1779                 default:
1780                     calc_bounding_box(na,nbat->xstride,
1781                                       nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1782                                       bb+tx*NNBSBB_B);
1783                     break;
1784                 }
1785                 bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
1786                 bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
1787
1788                 /* No interaction optimization yet here */
1789                 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1790             }
1791             else
1792             {
1793                 grid->flags_simple[tx] = 0;
1794             }
1795         }
1796     }
1797
1798 #ifdef NBNXN_SEARCH_SSE
1799     if (grid->bSimple && nbat->XFormat == nbatX8)
1800     {
1801         combine_bounding_box_pairs(grid,grid->bb_simple);
1802     }
1803 #endif
1804 }
1805
1806 void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
1807 {
1808     *ncx = nbs->grid[0].ncx;
1809     *ncy = nbs->grid[0].ncy;
1810 }
1811
1812 void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
1813 {
1814     const nbnxn_grid_t *grid;
1815
1816     grid = &nbs->grid[0];
1817
1818     /* Return the atom order for the home cell (index 0) */
1819     *a  = nbs->a;
1820
1821     *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1822 }
1823
1824 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1825 {
1826     nbnxn_grid_t *grid;
1827     int ao,cx,cy,cxy,cz,j;
1828
1829     /* Set the atom order for the home cell (index 0) */
1830     grid = &nbs->grid[0];
1831
1832     ao = 0;
1833     for(cx=0; cx<grid->ncx; cx++)
1834     {
1835         for(cy=0; cy<grid->ncy; cy++)
1836         {
1837             cxy = cx*grid->ncy + cy;
1838             j   = grid->cxy_ind[cxy]*grid->na_sc;
1839             for(cz=0; cz<grid->cxy_na[cxy]; cz++)
1840             {
1841                 nbs->a[j]     = ao;
1842                 nbs->cell[ao] = j;
1843                 ao++;
1844                 j++;
1845             }
1846         }
1847     }
1848 }
1849
1850 /* Determines the cell range along one dimension that
1851  * the bounding box b0 - b1 sees.
1852  */
1853 static void get_cell_range(real b0,real b1,
1854                            int nc,real c0,real s,real invs,
1855                            real d2,real r2,int *cf,int *cl)
1856 {
1857     *cf = max((int)((b0 - c0)*invs),0);
1858
1859     while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
1860     {
1861         (*cf)--;
1862     }
1863
1864     *cl = min((int)((b1 - c0)*invs),nc-1);
1865     while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
1866     {
1867         (*cl)++;
1868     }
1869 }
1870
1871 /* Reference code calculating the distance^2 between two bounding boxes */
1872 static float box_dist2(float bx0,float bx1,float by0,
1873                        float by1,float bz0,float bz1,
1874                        const float *bb)
1875 {
1876     float d2;
1877     float dl,dh,dm,dm0;
1878
1879     d2 = 0;
1880
1881     dl  = bx0 - bb[BBU_X];
1882     dh  = bb[BBL_X] - bx1;
1883     dm  = max(dl,dh);
1884     dm0 = max(dm,0);
1885     d2 += dm0*dm0;
1886
1887     dl  = by0 - bb[BBU_Y];
1888     dh  = bb[BBL_Y] - by1;
1889     dm  = max(dl,dh);
1890     dm0 = max(dm,0);
1891     d2 += dm0*dm0;
1892
1893     dl  = bz0 - bb[BBU_Z];
1894     dh  = bb[BBL_Z] - bz1;
1895     dm  = max(dl,dh);
1896     dm0 = max(dm,0);
1897     d2 += dm0*dm0;
1898
1899     return d2;
1900 }
1901
1902 /* Plain C code calculating the distance^2 between two bounding boxes */
1903 static float subc_bb_dist2(int si,const float *bb_i_ci,
1904                            int csj,const float *bb_j_all)
1905 {
1906     const float *bb_i,*bb_j;
1907     float d2;
1908     float dl,dh,dm,dm0;
1909
1910     bb_i = bb_i_ci  +  si*NNBSBB_B;
1911     bb_j = bb_j_all + csj*NNBSBB_B;
1912
1913     d2 = 0;
1914
1915     dl  = bb_i[BBL_X] - bb_j[BBU_X];
1916     dh  = bb_j[BBL_X] - bb_i[BBU_X];
1917     dm  = max(dl,dh);
1918     dm0 = max(dm,0);
1919     d2 += dm0*dm0;
1920
1921     dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
1922     dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
1923     dm  = max(dl,dh);
1924     dm0 = max(dm,0);
1925     d2 += dm0*dm0;
1926
1927     dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
1928     dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
1929     dm  = max(dl,dh);
1930     dm0 = max(dm,0);
1931     d2 += dm0*dm0;
1932
1933     return d2;
1934 }
1935
1936 #ifdef NBNXN_SEARCH_SSE
1937
1938 /* SSE code for bb distance for bb format xyz0 */
1939 static float subc_bb_dist2_sse(int na_c,
1940                               int si,const float *bb_i_ci,
1941                               int csj,const float *bb_j_all)
1942 {
1943     const float *bb_i,*bb_j;
1944
1945     __m128 bb_i_SSE0,bb_i_SSE1;
1946     __m128 bb_j_SSE0,bb_j_SSE1;
1947     __m128 dl_SSE;
1948     __m128 dh_SSE;
1949     __m128 dm_SSE;
1950     __m128 dm0_SSE;
1951     __m128 d2_SSE;
1952 #ifndef GMX_X86_SSE4_1
1953     float d2_array[7],*d2_align;
1954
1955     d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
1956 #else
1957     float d2;
1958 #endif
1959
1960     bb_i = bb_i_ci  +  si*NNBSBB_B;
1961     bb_j = bb_j_all + csj*NNBSBB_B;
1962
1963     bb_i_SSE0 = _mm_load_ps(bb_i);
1964     bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
1965     bb_j_SSE0 = _mm_load_ps(bb_j);
1966     bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
1967
1968     dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
1969     dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
1970
1971     dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
1972     dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
1973 #ifndef GMX_X86_SSE4_1
1974     d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
1975
1976     _mm_store_ps(d2_align,d2_SSE);
1977
1978     return d2_align[0] + d2_align[1] + d2_align[2];
1979 #else
1980     /* SSE4.1 dot product of components 0,1,2 */
1981     d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
1982
1983     _mm_store_ss(&d2,d2_SSE);
1984
1985     return d2;
1986 #endif
1987 }
1988
1989 /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
1990 #define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
1991 {                                                \
1992     int    shi;                                  \
1993                                                  \
1994     __m128 dx_0,dy_0,dz_0;                       \
1995     __m128 dx_1,dy_1,dz_1;                       \
1996                                                  \
1997     __m128 mx,my,mz;                             \
1998     __m128 m0x,m0y,m0z;                          \
1999                                                  \
2000     __m128 d2x,d2y,d2z;                          \
2001     __m128 d2s,d2t;                              \
2002                                                  \
2003     shi = si*NNBSBB_D*DIM;                       \
2004                                                  \
2005     xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
2006     yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
2007     zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
2008     xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
2009     yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
2010     zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
2011                                                  \
2012     dx_0 = _mm_sub_ps(xi_l,xj_h);                \
2013     dy_0 = _mm_sub_ps(yi_l,yj_h);                \
2014     dz_0 = _mm_sub_ps(zi_l,zj_h);                \
2015                                                  \
2016     dx_1 = _mm_sub_ps(xj_l,xi_h);                \
2017     dy_1 = _mm_sub_ps(yj_l,yi_h);                \
2018     dz_1 = _mm_sub_ps(zj_l,zi_h);                \
2019                                                  \
2020     mx   = _mm_max_ps(dx_0,dx_1);                \
2021     my   = _mm_max_ps(dy_0,dy_1);                \
2022     mz   = _mm_max_ps(dz_0,dz_1);                \
2023                                                  \
2024     m0x  = _mm_max_ps(mx,zero);                  \
2025     m0y  = _mm_max_ps(my,zero);                  \
2026     m0z  = _mm_max_ps(mz,zero);                  \
2027                                                  \
2028     d2x  = _mm_mul_ps(m0x,m0x);                  \
2029     d2y  = _mm_mul_ps(m0y,m0y);                  \
2030     d2z  = _mm_mul_ps(m0z,m0z);                  \
2031                                                  \
2032     d2s  = _mm_add_ps(d2x,d2y);                  \
2033     d2t  = _mm_add_ps(d2s,d2z);                  \
2034                                                  \
2035     _mm_store_ps(d2+si,d2t);                     \
2036 }
2037
2038 /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
2039 static void subc_bb_dist2_sse_xxxx(const float *bb_j,
2040                                    int nsi,const float *bb_i,
2041                                    float *d2)
2042 {
2043     __m128 xj_l,yj_l,zj_l;
2044     __m128 xj_h,yj_h,zj_h;
2045     __m128 xi_l,yi_l,zi_l;
2046     __m128 xi_h,yi_h,zi_h;
2047
2048     __m128 zero;
2049
2050     zero = _mm_setzero_ps();
2051
2052     xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
2053     yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
2054     zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
2055     xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
2056     yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
2057     zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
2058
2059     /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
2060      * But as we know the number of iterations is 1 or 2, we unroll manually.
2061      */
2062     SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
2063     if (STRIDE_8BB < nsi)
2064     {
2065         SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
2066     }
2067 }
2068
2069 #endif /* NBNXN_SEARCH_SSE */
2070
2071 /* Plain C function which determines if any atom pair between two cells
2072  * is within distance sqrt(rl2).
2073  */
2074 static gmx_bool subc_in_range_x(int na_c,
2075                                 int si,const real *x_i,
2076                                 int csj,int stride,const real *x_j,
2077                                 real rl2)
2078 {
2079     int  i,j,i0,j0;
2080     real d2;
2081
2082     for(i=0; i<na_c; i++)
2083     {
2084         i0 = (si*na_c + i)*DIM;
2085         for(j=0; j<na_c; j++)
2086         {
2087             j0 = (csj*na_c + j)*stride;
2088
2089             d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
2090                  sqr(x_i[i0+1] - x_j[j0+1]) +
2091                  sqr(x_i[i0+2] - x_j[j0+2]);
2092
2093             if (d2 < rl2)
2094             {
2095                 return TRUE;
2096             }
2097         }
2098     }
2099
2100     return FALSE;
2101 }
2102
2103 /* SSE function which determines if any atom pair between two cells,
2104  * both with 8 atoms, is within distance sqrt(rl2).
2105  */
2106 static gmx_bool subc_in_range_sse8(int na_c,
2107                                    int si,const real *x_i,
2108                                    int csj,int stride,const real *x_j,
2109                                    real rl2)
2110 {
2111 #ifdef NBNXN_SEARCH_SSE_SINGLE
2112     __m128 ix_SSE0,iy_SSE0,iz_SSE0;
2113     __m128 ix_SSE1,iy_SSE1,iz_SSE1;
2114
2115     __m128 rc2_SSE;
2116
2117     int na_c_sse;
2118     int j0,j1;
2119
2120     rc2_SSE   = _mm_set1_ps(rl2);
2121
2122     na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
2123     ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
2124     iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
2125     iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
2126     ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
2127     iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
2128     iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
2129
2130     /* We loop from the outer to the inner particles to maximize
2131      * the chance that we find a pair in range quickly and return.
2132      */
2133     j0 = csj*na_c;
2134     j1 = j0 + na_c - 1;
2135     while (j0 < j1)
2136     {
2137         __m128 jx0_SSE,jy0_SSE,jz0_SSE;
2138         __m128 jx1_SSE,jy1_SSE,jz1_SSE;
2139
2140         __m128 dx_SSE0,dy_SSE0,dz_SSE0;
2141         __m128 dx_SSE1,dy_SSE1,dz_SSE1;
2142         __m128 dx_SSE2,dy_SSE2,dz_SSE2;
2143         __m128 dx_SSE3,dy_SSE3,dz_SSE3;
2144
2145         __m128 rsq_SSE0;
2146         __m128 rsq_SSE1;
2147         __m128 rsq_SSE2;
2148         __m128 rsq_SSE3;
2149
2150         __m128 wco_SSE0;
2151         __m128 wco_SSE1;
2152         __m128 wco_SSE2;
2153         __m128 wco_SSE3;
2154         __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
2155
2156         jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
2157         jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
2158         jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
2159
2160         jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
2161         jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
2162         jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
2163
2164         /* Calculate distance */
2165         dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
2166         dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
2167         dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
2168         dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
2169         dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
2170         dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
2171         dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
2172         dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
2173         dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
2174         dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
2175         dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
2176         dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
2177
2178         /* rsq = dx*dx+dy*dy+dz*dz */
2179         rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2180         rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2181         rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2182         rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2183
2184         wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
2185         wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
2186         wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
2187         wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
2188
2189         wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
2190         wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
2191         wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
2192
2193         if (_mm_movemask_ps(wco_any_SSE))
2194         {
2195             return TRUE;
2196         }
2197
2198         j0++;
2199         j1--;
2200     }
2201     return FALSE;
2202
2203 #else
2204     /* No SSE */
2205     gmx_incons("SSE function called without SSE support");
2206
2207     return TRUE;
2208 #endif
2209 }
2210
2211 /* Returns the j sub-cell for index cj_ind */
2212 static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
2213 {
2214     return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
2215 }
2216
2217 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2218 static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
2219 {
2220     return nbl->cj4[cj_ind>>2].imei[0].imask;
2221 }
2222
2223 /* Ensures there is enough space for extra extra exclusion masks */
2224 static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
2225 {
2226     if (nbl->nexcl+extra > nbl->excl_nalloc)
2227     {
2228         nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
2229         nbnxn_realloc_void((void **)&nbl->excl,
2230                            nbl->nexcl*sizeof(*nbl->excl),
2231                            nbl->excl_nalloc*sizeof(*nbl->excl),
2232                            nbl->alloc,nbl->free);
2233     }
2234 }
2235
2236 /* Ensures there is enough space for ncell extra j-cells in the list */
2237 static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
2238                                             int ncell)
2239 {
2240     int cj_max;
2241
2242     cj_max = nbl->ncj + ncell;
2243
2244     if (cj_max > nbl->cj_nalloc)
2245     {
2246         nbl->cj_nalloc = over_alloc_small(cj_max);
2247         nbnxn_realloc_void((void **)&nbl->cj,
2248                            nbl->ncj*sizeof(*nbl->cj),
2249                            nbl->cj_nalloc*sizeof(*nbl->cj),
2250                            nbl->alloc,nbl->free);
2251     }
2252 }
2253
2254 /* Ensures there is enough space for ncell extra j-subcells in the list */
2255 static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
2256                                               int nsupercell)
2257 {
2258     int ncj4_max,j4,j,w,t;
2259
2260 #define NWARP       2
2261 #define WARP_SIZE  32
2262
2263     /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
2264     /* We can store 4 j-subcell - i-supercell pairs in one struct.
2265      * since we round down, we need one extra entry.
2266      */
2267     ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
2268
2269     if (ncj4_max > nbl->cj4_nalloc)
2270     {
2271         nbl->cj4_nalloc = over_alloc_small(ncj4_max);
2272         nbnxn_realloc_void((void **)&nbl->cj4,
2273                            nbl->work->cj4_init*sizeof(*nbl->cj4),
2274                            nbl->cj4_nalloc*sizeof(*nbl->cj4),
2275                            nbl->alloc,nbl->free);
2276     }
2277
2278     if (ncj4_max > nbl->work->cj4_init)
2279     {
2280         for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
2281         {
2282             /* No i-subcells and no excl's in the list initially */
2283             for(w=0; w<NWARP; w++)
2284             {
2285                 nbl->cj4[j4].imei[w].imask    = 0U;
2286                 nbl->cj4[j4].imei[w].excl_ind = 0;
2287
2288             }
2289         }
2290         nbl->work->cj4_init = ncj4_max;
2291     }
2292 }
2293
2294 /* Set all excl masks for one GPU warp no exclusions */
2295 static void set_no_excls(nbnxn_excl_t *excl)
2296 {
2297     int t;
2298
2299     for(t=0; t<WARP_SIZE; t++)
2300     {
2301         /* Turn all interaction bits on */
2302         excl->pair[t] = NBNXN_INT_MASK_ALL;
2303     }
2304 }
2305
2306 /* Initializes a single nbnxn_pairlist_t data structure */
2307 static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
2308                                 gmx_bool bSimple,
2309                                 nbnxn_alloc_t *alloc,
2310                                 nbnxn_free_t  *free)
2311 {
2312     if (alloc == NULL)
2313     {
2314         nbl->alloc = nbnxn_alloc_aligned;
2315     }
2316     else
2317     {
2318         nbl->alloc = alloc;
2319     }
2320     if (free == NULL)
2321     {
2322         nbl->free = nbnxn_free_aligned;
2323     }
2324     else
2325     {
2326         nbl->free = free;
2327     }
2328
2329     nbl->bSimple     = bSimple;
2330     nbl->na_sc       = 0;
2331     nbl->na_ci       = 0;
2332     nbl->na_cj       = 0;
2333     nbl->nci         = 0;
2334     nbl->ci          = NULL;
2335     nbl->ci_nalloc   = 0;
2336     nbl->ncj         = 0;
2337     nbl->cj          = NULL;
2338     nbl->cj_nalloc   = 0;
2339     nbl->ncj4        = 0;
2340     /* We need one element extra in sj, so alloc initially with 1 */
2341     nbl->cj4_nalloc  = 0;
2342     nbl->cj4         = NULL;
2343     nbl->nci_tot     = 0;
2344
2345     if (!nbl->bSimple)
2346     {
2347         nbl->excl        = NULL;
2348         nbl->excl_nalloc = 0;
2349         nbl->nexcl       = 0;
2350         check_excl_space(nbl,1);
2351         nbl->nexcl       = 1;
2352         set_no_excls(&nbl->excl[0]);
2353     }
2354
2355     snew(nbl->work,1);
2356 #ifdef NBNXN_BBXXXX
2357     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
2358 #else
2359     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
2360 #endif
2361     snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
2362 #ifdef NBNXN_SEARCH_SSE
2363     snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
2364 #ifdef GMX_X86_AVX_256
2365     snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
2366 #endif
2367 #endif
2368     snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
2369 }
2370
2371 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
2372                              gmx_bool bSimple, gmx_bool bCombined,
2373                              nbnxn_alloc_t *alloc,
2374                              nbnxn_free_t  *free)
2375 {
2376     int i;
2377
2378     nbl_list->bSimple   = bSimple;
2379     nbl_list->bCombined = bCombined;
2380
2381     nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
2382
2383     if (!nbl_list->bCombined &&
2384         nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
2385     {
2386         gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
2387                   nbl_list->nnbl,NBNXN_BUFFERFLAG_MAX_THREADS,NBNXN_BUFFERFLAG_MAX_THREADS);
2388     }
2389
2390     snew(nbl_list->nbl,nbl_list->nnbl);
2391     /* Execute in order to avoid memory interleaving between threads */
2392 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2393     for(i=0; i<nbl_list->nnbl; i++)
2394     {
2395         /* Allocate the nblist data structure locally on each thread
2396          * to optimize memory access for NUMA architectures.
2397          */
2398         snew(nbl_list->nbl[i],1);
2399
2400         /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2401         if (i == 0)
2402         {
2403             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
2404         }
2405         else
2406         {
2407             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
2408         }
2409     }
2410 }
2411
2412 /* Print statistics of a pair list, used for debug output */
2413 static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
2414                                            const nbnxn_search_t nbs,real rl)
2415 {
2416     const nbnxn_grid_t *grid;
2417     int cs[SHIFTS];
2418     int s,i,j;
2419     int npexcl;
2420
2421     /* This code only produces correct statistics with domain decomposition */
2422     grid = &nbs->grid[0];
2423
2424     fprintf(fp,"nbl nci %d ncj %d\n",
2425             nbl->nci,nbl->ncj);
2426     fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2427             nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
2428             nbl->ncj/(double)grid->nc*grid->na_sc,
2429             nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
2430
2431     fprintf(fp,"nbl average j cell list length %.1f\n",
2432             0.25*nbl->ncj/(double)nbl->nci);
2433
2434     for(s=0; s<SHIFTS; s++)
2435     {
2436         cs[s] = 0;
2437     }
2438     npexcl = 0;
2439     for(i=0; i<nbl->nci; i++)
2440     {
2441         cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
2442             nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
2443
2444         j = nbl->ci[i].cj_ind_start;
2445         while (j < nbl->ci[i].cj_ind_end &&
2446                nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
2447         {
2448             npexcl++;
2449             j++;
2450         }
2451     }
2452     fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
2453             nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
2454     for(s=0; s<SHIFTS; s++)
2455     {
2456         if (cs[s] > 0)
2457         {
2458             fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
2459         }
2460     }
2461 }
2462
2463 /* Print statistics of a pair lists, used for debug output */
2464 static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
2465                                              const nbnxn_search_t nbs,real rl)
2466 {
2467     const nbnxn_grid_t *grid;
2468     int i,j4,j,si,b;
2469     int c[GPU_NSUBCELL+1];
2470
2471     /* This code only produces correct statistics with domain decomposition */
2472     grid = &nbs->grid[0];
2473
2474     fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2475             nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
2476     fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2477             nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
2478             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
2479             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
2480
2481     fprintf(fp,"nbl average j super cell list length %.1f\n",
2482             0.25*nbl->ncj4/(double)nbl->nsci);
2483     fprintf(fp,"nbl average i sub cell list length %.1f\n",
2484             nbl->nci_tot/(0.25*nbl->ncj4));
2485
2486     for(si=0; si<=GPU_NSUBCELL; si++)
2487     {
2488         c[si] = 0;
2489     }
2490     for(i=0; i<nbl->nsci; i++)
2491     {
2492         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2493         {
2494             for(j=0; j<4; j++)
2495             {
2496                 b = 0;
2497                 for(si=0; si<GPU_NSUBCELL; si++)
2498                 {
2499                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
2500                     {
2501                         b++;
2502                     }
2503                 }
2504                 c[b]++;
2505             }
2506         }
2507     }
2508     for(b=0; b<=GPU_NSUBCELL; b++)
2509     {
2510         fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
2511                 b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
2512     }
2513 }
2514
2515 /* Print the full pair list, used for debug output */
2516 static void print_supersub_nsp(const char *fn,
2517                                const nbnxn_pairlist_t *nbl,
2518                                int iloc)
2519 {
2520     char buf[STRLEN];
2521     FILE *fp;
2522     int i,nsp,j4,p;
2523
2524     sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
2525     fp = ffopen(buf,"w");
2526
2527     for(i=0; i<nbl->nci; i++)
2528     {
2529         nsp = 0;
2530         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2531         {
2532             for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
2533             {
2534                 nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
2535             }
2536         }
2537         fprintf(fp,"%4d %3d %3d\n",
2538                 i,
2539                 nsp,
2540                 nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
2541     }
2542
2543     fclose(fp);
2544 }
2545
2546 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
2547 static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
2548                                    int warp,nbnxn_excl_t **excl)
2549 {
2550     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2551     {
2552         /* No exclusions set, make a new list entry */
2553         nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
2554         nbl->nexcl++;
2555         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2556         set_no_excls(*excl);
2557     }
2558     else
2559     {
2560         /* We already have some exclusions, new ones can be added to the list */
2561         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2562     }
2563 }
2564
2565 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
2566  * allocates extra memory, if necessary.
2567  */
2568 static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
2569                                  int warp,nbnxn_excl_t **excl)
2570 {
2571     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2572     {
2573         /* We need to make a new list entry, check if we have space */
2574         check_excl_space(nbl,1);
2575     }
2576     low_get_nbl_exclusions(nbl,cj4,warp,excl);
2577 }
2578
2579 /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
2580  * allocates extra memory, if necessary.
2581  */
2582 static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
2583                                  nbnxn_excl_t **excl_w0,
2584                                  nbnxn_excl_t **excl_w1)
2585 {
2586     /* Check for space we might need */
2587     check_excl_space(nbl,2);
2588
2589     low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
2590     low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
2591 }
2592
2593 /* Sets the self exclusions i=j and pair exclusions i>j */
2594 static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
2595                                                int cj4_ind,int sj_offset,
2596                                                int si)
2597 {
2598     nbnxn_excl_t *excl[2];
2599     int  ei,ej,w;
2600
2601     /* Here we only set the set self and double pair exclusions */
2602
2603     get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
2604
2605     /* Only minor < major bits set */
2606     for(ej=0; ej<nbl->na_ci; ej++)
2607     {
2608         w = (ej>>2);
2609         for(ei=ej; ei<nbl->na_ci; ei++)
2610         {
2611             excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
2612                 ~(1U << (sj_offset*GPU_NSUBCELL+si));
2613         }
2614     }
2615 }
2616
2617 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
2618 static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
2619 {
2620     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2621 }
2622
2623 #ifdef NBNXN_SEARCH_SSE
2624 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
2625 static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
2626 {
2627 #ifndef GMX_DOUBLE /* cj-size = 4 */
2628     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2629 #else              /* cj-size = 2 */
2630     return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
2631             (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
2632              NBNXN_INT_MASK_ALL));
2633 #endif
2634 }
2635
2636 #ifdef GMX_X86_AVX_256
2637 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
2638 static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
2639 {
2640 #ifndef GMX_DOUBLE /* cj-size = 8 */
2641     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
2642             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
2643              NBNXN_INT_MASK_ALL));
2644 #else              /* cj-size = 2 */
2645     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2646 #endif
2647 }
2648 #endif
2649 #endif /* NBNXN_SEARCH_SSE */
2650
2651 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2652  * Checks bounding box distances and possibly atom pair distances.
2653  */
2654 static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2655                                      nbnxn_pairlist_t *nbl,
2656                                      int ci,int cjf,int cjl,
2657                                      gmx_bool remove_sub_diag,
2658                                      const real *x_j,
2659                                      real rl2,float rbb2,
2660                                      int *ndistc)
2661 {
2662     const nbnxn_list_work_t *work;
2663
2664     const float *bb_ci;
2665     const real  *x_ci;
2666
2667     gmx_bool   InRange;
2668     real       d2;
2669     int        cjf_gl,cjl_gl,cj;
2670
2671     work = nbl->work;
2672
2673     bb_ci = nbl->work->bb_ci;
2674     x_ci  = nbl->work->x_ci;
2675
2676     InRange = FALSE;
2677     while (!InRange && cjf <= cjl)
2678     {
2679         d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
2680         *ndistc += 2;
2681
2682         /* Check if the distance is within the distance where
2683          * we use only the bounding box distance rbb,
2684          * or within the cut-off and there is at least one atom pair
2685          * within the cut-off.
2686          */
2687         if (d2 < rbb2)
2688         {
2689             InRange = TRUE;
2690         }
2691         else if (d2 < rl2)
2692         {
2693             int i,j;
2694
2695             cjf_gl = gridj->cell0 + cjf;
2696             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2697             {
2698                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2699                 {
2700                     InRange = InRange ||
2701                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2702                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2703                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2704                 }
2705             }
2706             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2707         }
2708         if (!InRange)
2709         {
2710             cjf++;
2711         }
2712     }
2713     if (!InRange)
2714     {
2715         return;
2716     }
2717
2718     InRange = FALSE;
2719     while (!InRange && cjl > cjf)
2720     {
2721         d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
2722         *ndistc += 2;
2723
2724         /* Check if the distance is within the distance where
2725          * we use only the bounding box distance rbb,
2726          * or within the cut-off and there is at least one atom pair
2727          * within the cut-off.
2728          */
2729         if (d2 < rbb2)
2730         {
2731             InRange = TRUE;
2732         }
2733         else if (d2 < rl2)
2734         {
2735             int i,j;
2736
2737             cjl_gl = gridj->cell0 + cjl;
2738             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2739             {
2740                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2741                 {
2742                     InRange = InRange ||
2743                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2744                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2745                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2746                 }
2747             }
2748             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2749         }
2750         if (!InRange)
2751         {
2752             cjl--;
2753         }
2754     }
2755
2756     if (cjf <= cjl)
2757     {
2758         for(cj=cjf; cj<=cjl; cj++)
2759         {
2760             /* Store cj and the interaction mask */
2761             nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
2762             nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
2763             nbl->ncj++;
2764         }
2765         /* Increase the closing index in i super-cell list */
2766         nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
2767     }
2768 }
2769
2770 #ifdef NBNXN_SEARCH_SSE
2771 /* Include make_cluster_list_x86_simd128/256 */
2772 #define GMX_MM128_HERE
2773 #include "gmx_x86_simd_macros.h"
2774 #define STRIDE_S  PACK_X4
2775 #include "nbnxn_search_x86_simd.h"
2776 #undef STRIDE_S
2777 #undef GMX_MM128_HERE
2778 #ifdef GMX_X86_AVX_256
2779 /* Include make_cluster_list_x86_simd128/256 */
2780 #define GMX_MM256_HERE
2781 #include "gmx_x86_simd_macros.h"
2782 #define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
2783 #include "nbnxn_search_x86_simd.h"
2784 #undef STRIDE_S
2785 #undef GMX_MM256_HERE
2786 #endif
2787 #endif
2788
2789 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
2790  * Checks bounding box distances and possibly atom pair distances.
2791  */
2792 static void make_cluster_list_supersub(const nbnxn_search_t nbs,
2793                                        const nbnxn_grid_t *gridi,
2794                                        const nbnxn_grid_t *gridj,
2795                                        nbnxn_pairlist_t *nbl,
2796                                        int sci,int scj,
2797                                        gmx_bool sci_equals_scj,
2798                                        int stride,const real *x,
2799                                        real rl2,float rbb2,
2800                                        int *ndistc)
2801 {
2802     int  na_c;
2803     int  npair;
2804     int  cjo,ci1,ci,cj,cj_gl;
2805     int  cj4_ind,cj_offset;
2806     unsigned imask;
2807     nbnxn_cj4_t *cj4;
2808     const float *bb_ci;
2809     const real *x_ci;
2810     float *d2l,d2;
2811     int  w;
2812 #define PRUNE_LIST_CPU_ONE
2813 #ifdef PRUNE_LIST_CPU_ONE
2814     int  ci_last=-1;
2815 #endif
2816
2817     d2l = nbl->work->d2;
2818
2819     bb_ci = nbl->work->bb_ci;
2820     x_ci  = nbl->work->x_ci;
2821
2822     na_c = gridj->na_c;
2823
2824     for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
2825     {
2826         cj4_ind   = (nbl->work->cj_ind >> 2);
2827         cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
2828         cj4       = &nbl->cj4[cj4_ind];
2829
2830         cj = scj*GPU_NSUBCELL + cjo;
2831
2832         cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
2833
2834         /* Initialize this j-subcell i-subcell list */
2835         cj4->cj[cj_offset] = cj_gl;
2836         imask              = 0;
2837
2838         if (sci_equals_scj)
2839         {
2840             ci1 = cjo + 1;
2841         }
2842         else
2843         {
2844             ci1 = gridi->nsubc[sci];
2845         }
2846
2847 #ifdef NBNXN_BBXXXX
2848         /* Determine all ci1 bb distances in one call with SSE */
2849         subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
2850                                ci1,bb_ci,d2l);
2851         *ndistc += na_c*2;
2852 #endif
2853
2854         npair = 0;
2855         /* We use a fixed upper-bound instead of ci1 to help optimization */
2856         for(ci=0; ci<GPU_NSUBCELL; ci++)
2857         {
2858             if (ci == ci1)
2859             {
2860                 break;
2861             }
2862
2863 #ifndef NBNXN_BBXXXX
2864             /* Determine the bb distance between ci and cj */
2865             d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
2866             *ndistc += 2;
2867 #endif
2868             d2 = d2l[ci];
2869
2870 #ifdef PRUNE_LIST_CPU_ALL
2871             /* Check if the distance is within the distance where
2872              * we use only the bounding box distance rbb,
2873              * or within the cut-off and there is at least one atom pair
2874              * within the cut-off. This check is very costly.
2875              */
2876             *ndistc += na_c*na_c;
2877             if (d2 < rbb2 ||
2878                 (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
2879 #else
2880             /* Check if the distance between the two bounding boxes
2881              * in within the pair-list cut-off.
2882              */
2883             if (d2 < rl2)
2884 #endif
2885             {
2886                 /* Flag this i-subcell to be taken into account */
2887                 imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
2888
2889 #ifdef PRUNE_LIST_CPU_ONE
2890                 ci_last = ci;
2891 #endif
2892
2893                 npair++;
2894             }
2895         }
2896
2897 #ifdef PRUNE_LIST_CPU_ONE
2898         /* If we only found 1 pair, check if any atoms are actually
2899          * within the cut-off, so we could get rid of it.
2900          */
2901         if (npair == 1 && d2l[ci_last] >= rbb2)
2902         {
2903             /* Avoid using function pointers here, as it's slower */
2904             if (
2905 #ifdef NBNXN_8BB_SSE
2906                 !subc_in_range_sse8
2907 #else
2908                 !subc_in_range_x
2909 #endif
2910                                 (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
2911             {
2912                 imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
2913                 npair--;
2914             }
2915         }
2916 #endif
2917
2918         if (npair > 0)
2919         {
2920             /* We have a useful sj entry, close it now */
2921
2922             /* Set the exclucions for the ci== sj entry.
2923              * Here we don't bother to check if this entry is actually flagged,
2924              * as it will nearly always be in the list.
2925              */
2926             if (sci_equals_scj)
2927             {
2928                 set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
2929             }
2930
2931             /* Copy the cluster interaction mask to the list */
2932             for(w=0; w<NWARP; w++)
2933             {
2934                 cj4->imei[w].imask |= imask;
2935             }
2936
2937             nbl->work->cj_ind++;
2938
2939             /* Keep the count */
2940             nbl->nci_tot += npair;
2941
2942             /* Increase the closing index in i super-cell list */
2943             nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
2944         }
2945     }
2946 }
2947
2948 /* Set all atom-pair exclusions from the topology stored in excl
2949  * as masks in the pair-list for simple list i-entry nbl_ci
2950  */
2951 static void set_ci_top_excls(const nbnxn_search_t nbs,
2952                              nbnxn_pairlist_t *nbl,
2953                              gmx_bool diagRemoved,
2954                              int na_ci_2log,
2955                              int na_cj_2log,
2956                              const nbnxn_ci_t *nbl_ci,
2957                              const t_blocka *excl)
2958 {
2959     const int *cell;
2960     int ci;
2961     int cj_ind_first,cj_ind_last;
2962     int cj_first,cj_last;
2963     int ndirect;
2964     int i,ai,aj,si,eind,ge,se;
2965     int found,cj_ind_0,cj_ind_1,cj_ind_m;
2966     int cj_m;
2967     gmx_bool Found_si;
2968     int si_ind;
2969     nbnxn_excl_t *nbl_excl;
2970     int inner_i,inner_e;
2971
2972     cell = nbs->cell;
2973
2974     if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
2975     {
2976         /* Empty list */
2977         return;
2978     }
2979
2980     ci = nbl_ci->ci;
2981
2982     cj_ind_first = nbl_ci->cj_ind_start;
2983     cj_ind_last  = nbl->ncj - 1;
2984
2985     cj_first = nbl->cj[cj_ind_first].cj;
2986     cj_last  = nbl->cj[cj_ind_last].cj;
2987
2988     /* Determine how many contiguous j-cells we have starting
2989      * from the first i-cell. This number can be used to directly
2990      * calculate j-cell indices for excluded atoms.
2991      */
2992     ndirect = 0;
2993     if (na_ci_2log == na_cj_2log)
2994     {
2995         while (cj_ind_first + ndirect <= cj_ind_last &&
2996                nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
2997         {
2998             ndirect++;
2999         }
3000     }
3001 #ifdef NBNXN_SEARCH_SSE
3002     else
3003     {
3004         while (cj_ind_first + ndirect <= cj_ind_last &&
3005                nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
3006         {
3007             ndirect++;
3008         }
3009     }
3010 #endif
3011
3012     /* Loop over the atoms in the i super-cell */
3013     for(i=0; i<nbl->na_sc; i++)
3014     {
3015         ai = nbs->a[ci*nbl->na_sc+i];
3016         if (ai >= 0)
3017         {
3018             si  = (i>>na_ci_2log);
3019
3020             /* Loop over the topology-based exclusions for this i-atom */
3021             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3022             {
3023                 aj = excl->a[eind];
3024
3025                 if (aj == ai)
3026                 {
3027                     /* The self exclusion are already set, save some time */
3028                     continue;
3029                 }
3030
3031                 ge = cell[aj];
3032
3033                 /* Without shifts we only calculate interactions j>i
3034                  * for one-way pair-lists.
3035                  */
3036                 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3037                 {
3038                     continue;
3039                 }
3040
3041                 se = (ge >> na_cj_2log);
3042
3043                 /* Could the cluster se be in our list? */
3044                 if (se >= cj_first && se <= cj_last)
3045                 {
3046                     if (se < cj_first + ndirect)
3047                     {
3048                         /* We can calculate cj_ind directly from se */
3049                         found = cj_ind_first + se - cj_first;
3050                     }
3051                     else
3052                     {
3053                         /* Search for se using bisection */
3054                         found = -1;
3055                         cj_ind_0 = cj_ind_first + ndirect;
3056                         cj_ind_1 = cj_ind_last + 1;
3057                         while (found == -1 && cj_ind_0 < cj_ind_1)
3058                         {
3059                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3060
3061                             cj_m = nbl->cj[cj_ind_m].cj;
3062
3063                             if (se == cj_m)
3064                             {
3065                                 found = cj_ind_m;
3066                             }
3067                             else if (se < cj_m)
3068                             {
3069                                 cj_ind_1 = cj_ind_m;
3070                             }
3071                             else
3072                             {
3073                                 cj_ind_0 = cj_ind_m + 1;
3074                             }
3075                         }
3076                     }
3077
3078                     if (found >= 0)
3079                     {
3080                         inner_i = i  - (si << na_ci_2log);
3081                         inner_e = ge - (se << na_cj_2log);
3082
3083                         nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
3084                     }
3085                 }
3086             }
3087         }
3088     }
3089 }
3090
3091 /* Set all atom-pair exclusions from the topology stored in excl
3092  * as masks in the pair-list for i-super-cell entry nbl_sci
3093  */
3094 static void set_sci_top_excls(const nbnxn_search_t nbs,
3095                               nbnxn_pairlist_t *nbl,
3096                               gmx_bool diagRemoved,
3097                               int na_c_2log,
3098                               const nbnxn_sci_t *nbl_sci,
3099                               const t_blocka *excl)
3100 {
3101     const int *cell;
3102     int na_c;
3103     int sci;
3104     int cj_ind_first,cj_ind_last;
3105     int cj_first,cj_last;
3106     int ndirect;
3107     int i,ai,aj,si,eind,ge,se;
3108     int found,cj_ind_0,cj_ind_1,cj_ind_m;
3109     int cj_m;
3110     gmx_bool Found_si;
3111     int si_ind;
3112     nbnxn_excl_t *nbl_excl;
3113     int inner_i,inner_e,w;
3114
3115     cell = nbs->cell;
3116
3117     na_c = nbl->na_ci;
3118
3119     if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3120     {
3121         /* Empty list */
3122         return;
3123     }
3124
3125     sci = nbl_sci->sci;
3126
3127     cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3128     cj_ind_last  = nbl->work->cj_ind - 1;
3129
3130     cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3131     cj_last  = nbl_cj(nbl,cj_ind_last);
3132
3133     /* Determine how many contiguous j-clusters we have starting
3134      * from the first i-cluster. This number can be used to directly
3135      * calculate j-cluster indices for excluded atoms.
3136      */
3137     ndirect = 0;
3138     while (cj_ind_first + ndirect <= cj_ind_last &&
3139            nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3140     {
3141         ndirect++;
3142     }
3143
3144     /* Loop over the atoms in the i super-cell */
3145     for(i=0; i<nbl->na_sc; i++)
3146     {
3147         ai = nbs->a[sci*nbl->na_sc+i];
3148         if (ai >= 0)
3149         {
3150             si  = (i>>na_c_2log);
3151
3152             /* Loop over the topology-based exclusions for this i-atom */
3153             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3154             {
3155                 aj = excl->a[eind];
3156
3157                 if (aj == ai)
3158                 {
3159                     /* The self exclusion are already set, save some time */
3160                     continue;
3161                 }
3162
3163                 ge = cell[aj];
3164
3165                 /* Without shifts we only calculate interactions j>i
3166                  * for one-way pair-lists.
3167                  */
3168                 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3169                 {
3170                     continue;
3171                 }
3172
3173                 se = ge>>na_c_2log;
3174                 /* Could the cluster se be in our list? */
3175                 if (se >= cj_first && se <= cj_last)
3176                 {
3177                     if (se < cj_first + ndirect)
3178                     {
3179                         /* We can calculate cj_ind directly from se */
3180                         found = cj_ind_first + se - cj_first;
3181                     }
3182                     else
3183                     {
3184                         /* Search for se using bisection */
3185                         found = -1;
3186                         cj_ind_0 = cj_ind_first + ndirect;
3187                         cj_ind_1 = cj_ind_last + 1;
3188                         while (found == -1 && cj_ind_0 < cj_ind_1)
3189                         {
3190                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3191
3192                             cj_m = nbl_cj(nbl,cj_ind_m);
3193
3194                             if (se == cj_m)
3195                             {
3196                                 found = cj_ind_m;
3197                             }
3198                             else if (se < cj_m)
3199                             {
3200                                 cj_ind_1 = cj_ind_m;
3201                             }
3202                             else
3203                             {
3204                                 cj_ind_0 = cj_ind_m + 1;
3205                             }
3206                         }
3207                     }
3208
3209                     if (found >= 0)
3210                     {
3211                         inner_i = i  - si*na_c;
3212                         inner_e = ge - se*na_c;
3213
3214 /* Macro for getting the index of atom a within a cluster */
3215 #define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3216 /* Macro for converting an atom number to a cluster number */
3217 #define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3218
3219                         if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
3220                         {
3221                             w       = (inner_e >> 2);
3222
3223                             get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
3224
3225                             nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
3226                                 ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
3227                         }
3228
3229 #undef AMODI
3230 #undef A2CI
3231                     }
3232                 }
3233             }
3234         }
3235     }
3236 }
3237
3238 /* Reallocate the simple ci list for at least n entries */
3239 static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
3240 {
3241     nbl->ci_nalloc = over_alloc_small(n);
3242     nbnxn_realloc_void((void **)&nbl->ci,
3243                        nbl->nci*sizeof(*nbl->ci),
3244                        nbl->ci_nalloc*sizeof(*nbl->ci),
3245                        nbl->alloc,nbl->free);
3246 }
3247
3248 /* Reallocate the super-cell sci list for at least n entries */
3249 static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
3250 {
3251     nbl->sci_nalloc = over_alloc_small(n);
3252     nbnxn_realloc_void((void **)&nbl->sci,
3253                        nbl->nsci*sizeof(*nbl->sci),
3254                        nbl->sci_nalloc*sizeof(*nbl->sci),
3255                        nbl->alloc,nbl->free);
3256 }
3257
3258 /* Make a new ci entry at index nbl->nci */
3259 static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
3260                          nbnxn_list_work_t *work)
3261 {
3262     if (nbl->nci + 1 > nbl->ci_nalloc)
3263     {
3264         nb_realloc_ci(nbl,nbl->nci+1);
3265     }
3266     nbl->ci[nbl->nci].ci            = ci;
3267     nbl->ci[nbl->nci].shift         = shift;
3268     /* Store the interaction flags along with the shift */
3269     nbl->ci[nbl->nci].shift        |= flags;
3270     nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
3271     nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
3272 }
3273
3274 /* Make a new sci entry at index nbl->nsci */
3275 static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
3276                           nbnxn_list_work_t *work)
3277 {
3278     if (nbl->nsci + 1 > nbl->sci_nalloc)
3279     {
3280         nb_realloc_sci(nbl,nbl->nsci+1);
3281     }
3282     nbl->sci[nbl->nsci].sci           = sci;
3283     nbl->sci[nbl->nsci].shift         = shift;
3284     nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3285     nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
3286 }
3287
3288 /* Sort the simple j-list cj on exclusions.
3289  * Entries with exclusions will all be sorted to the beginning of the list.
3290  */
3291 static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
3292                          nbnxn_list_work_t *work)
3293 {
3294     int jnew,j;
3295
3296     if (ncj > work->cj_nalloc)
3297     {
3298         work->cj_nalloc = over_alloc_large(ncj);
3299         srenew(work->cj,work->cj_nalloc);
3300     }
3301
3302     /* Make a list of the j-cells involving exclusions */
3303     jnew = 0;
3304     for(j=0; j<ncj; j++)
3305     {
3306         if (cj[j].excl != NBNXN_INT_MASK_ALL)
3307         {
3308             work->cj[jnew++] = cj[j];
3309         }
3310     }
3311     /* Check if there are exclusions at all or not just the first entry */
3312     if (!((jnew == 0) ||
3313           (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
3314     {
3315         for(j=0; j<ncj; j++)
3316         {
3317             if (cj[j].excl == NBNXN_INT_MASK_ALL)
3318             {
3319                 work->cj[jnew++] = cj[j];
3320             }
3321         }
3322         for(j=0; j<ncj; j++)
3323         {
3324             cj[j] = work->cj[j];
3325         }
3326     }
3327 }
3328
3329 /* Close this simple list i entry */
3330 static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3331 {
3332     int jlen;
3333
3334     /* All content of the new ci entry have already been filled correctly,
3335      * we only need to increase the count here (for non empty lists).
3336      */
3337     jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3338     if (jlen > 0)
3339     {
3340         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
3341
3342         if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
3343         {
3344             nbl->work->ncj_hlj += jlen;
3345         }
3346         else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3347         {
3348             nbl->work->ncj_noq += jlen;
3349         }
3350
3351         nbl->nci++;
3352     }
3353 }
3354
3355 /* Split sci entry for load balancing on the GPU.
3356  * As we only now the current count on our own thread,
3357  * we will need to estimate the current total amount of i-entries.
3358  * As the lists get concatenated later, this estimate depends
3359  * both on nthread and our own thread index thread.
3360  */
3361 static void split_sci_entry(nbnxn_pairlist_t *nbl,
3362                             int nsp_max_av,gmx_bool progBal,int nc_bal,
3363                             int thread,int nthread)
3364 {
3365     int nsci_est;
3366     int nsp_max;
3367     int cj4_start,cj4_end,j4len,cj4;
3368     int sci;
3369     int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
3370     int p;
3371
3372     /* Estimate the total numbers of ci's of the nblist combined
3373      * over all threads using the target number of ci's.
3374      */
3375     nsci_est = nc_bal*thread/nthread + nbl->nsci;
3376     if (progBal)
3377     {
3378         /* The first ci blocks should be larger, to avoid overhead.
3379          * The last ci blocks should be smaller, to improve load balancing.
3380          */
3381         nsp_max = max(1,
3382                       nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3383     }
3384     else
3385     {
3386         nsp_max = nsp_max_av;
3387     }
3388
3389     cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3390     cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
3391     j4len = cj4_end - cj4_start;
3392
3393     if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3394     {
3395         /* Remove the last ci entry and process the cj4's again */
3396         nbl->nsci -= 1;
3397
3398         sci        = nbl->nsci;
3399         cj4        = cj4_start;
3400         nsp        = 0;
3401         nsp_sci    = 0;
3402         nsp_cj4_e  = 0;
3403         nsp_cj4    = 0;
3404         while (cj4 < cj4_end)
3405         {
3406             nsp_cj4_p = nsp_cj4;
3407             nsp_cj4   = 0;
3408             for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3409             {
3410                 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3411             }
3412             nsp += nsp_cj4;
3413
3414             if (nsp > nsp_max && nsp > nsp_cj4)
3415             {
3416                 nbl->sci[sci].cj4_ind_end = cj4;
3417                 sci++;
3418                 nbl->nsci++;
3419                 if (nbl->nsci+1 > nbl->sci_nalloc)
3420                 {
3421                     nb_realloc_sci(nbl,nbl->nsci+1);
3422                 }
3423                 nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
3424                 nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
3425                 nbl->sci[sci].cj4_ind_start = cj4;
3426                 nsp_sci   = nsp - nsp_cj4;
3427                 nsp_cj4_e = nsp_cj4_p;
3428                 nsp       = nsp_cj4;
3429             }
3430
3431             cj4++;
3432         }
3433
3434         /* Put the remaining cj4's in a new ci entry */
3435         nbl->sci[sci].cj4_ind_end = cj4_end;
3436
3437         /* Possibly balance out the last two ci's
3438          * by moving the last cj4 of the second last ci.
3439          */
3440         if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3441         {
3442             nbl->sci[sci-1].cj4_ind_end--;
3443             nbl->sci[sci].cj4_ind_start--;
3444         }
3445
3446         sci++;
3447         nbl->nsci++;
3448     }
3449 }
3450
3451 /* Clost this super/sub list i entry */
3452 static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
3453                                     int nsp_max_av,
3454                                     gmx_bool progBal,int nc_bal,
3455                                     int thread,int nthread)
3456 {
3457     int j4len,tlen;
3458     int nb,b;
3459
3460     /* All content of the new ci entry have already been filled correctly,
3461      * we only need to increase the count here (for non empty lists).
3462      */
3463     j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
3464     if (j4len > 0)
3465     {
3466         /* We can only have complete blocks of 4 j-entries in a list,
3467          * so round the count up before closing.
3468          */
3469         nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
3470         nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3471
3472         nbl->nsci++;
3473
3474         if (nsp_max_av > 0)
3475         {
3476             split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
3477         }
3478     }
3479 }
3480
3481 /* Syncs the working array before adding another grid pair to the list */
3482 static void sync_work(nbnxn_pairlist_t *nbl)
3483 {
3484     if (!nbl->bSimple)
3485     {
3486         nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3487         nbl->work->cj4_init = nbl->ncj4;
3488     }
3489 }
3490
3491 /* Clears an nbnxn_pairlist_t data structure */
3492 static void clear_pairlist(nbnxn_pairlist_t *nbl)
3493 {
3494     nbl->nci           = 0;
3495     nbl->nsci          = 0;
3496     nbl->ncj           = 0;
3497     nbl->ncj4          = 0;
3498     nbl->nci_tot       = 0;
3499     nbl->nexcl         = 1;
3500
3501     nbl->work->ncj_noq = 0;
3502     nbl->work->ncj_hlj = 0;
3503 }
3504
3505 /* Sets a simple list i-cell bounding box, including PBC shift */
3506 static void set_icell_bb_simple(const float *bb,int ci,
3507                                 real shx,real shy,real shz,
3508                                 float *bb_ci)
3509 {
3510     int ia;
3511
3512     ia = ci*NNBSBB_B;
3513     bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
3514     bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
3515     bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
3516     bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
3517     bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
3518     bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
3519 }
3520
3521 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
3522 static void set_icell_bb_supersub(const float *bb,int ci,
3523                                   real shx,real shy,real shz,
3524                                   float *bb_ci)
3525 {
3526     int ia,m,i;
3527
3528 #ifdef NBNXN_BBXXXX
3529     ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
3530     for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
3531     {
3532         for(i=0; i<STRIDE_8BB; i++)
3533         {
3534             bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
3535             bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
3536             bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
3537             bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
3538             bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
3539             bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
3540         }
3541     }
3542 #else
3543     ia = ci*GPU_NSUBCELL*NNBSBB_B;
3544     for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
3545     {
3546         bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
3547         bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
3548         bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
3549         bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
3550         bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
3551         bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
3552     }
3553 #endif
3554 }
3555
3556 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
3557 static void icell_set_x_simple(int ci,
3558                                real shx,real shy,real shz,
3559                                int na_c,
3560                                int stride,const real *x,
3561                                nbnxn_list_work_t *work)
3562 {
3563     int  ia,i;
3564
3565     ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
3566
3567     for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
3568     {
3569         work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
3570         work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
3571         work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
3572     }
3573 }
3574
3575 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
3576 static void icell_set_x_supersub(int ci,
3577                                  real shx,real shy,real shz,
3578                                  int na_c,
3579                                  int stride,const real *x,
3580                                  nbnxn_list_work_t *work)
3581 {
3582     int  ia,i;
3583     real *x_ci;
3584
3585     x_ci = work->x_ci;
3586
3587     ia = ci*GPU_NSUBCELL*na_c;
3588     for(i=0; i<GPU_NSUBCELL*na_c; i++)
3589     {
3590         x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
3591         x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
3592         x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
3593     }
3594 }
3595
3596 #ifdef NBNXN_SEARCH_SSE
3597 /* Copies PBC shifted super-cell packed atom coordinates to working array */
3598 static void icell_set_x_supersub_sse8(int ci,
3599                                       real shx,real shy,real shz,
3600                                       int na_c,
3601                                       int stride,const real *x,
3602                                       nbnxn_list_work_t *work)
3603 {
3604     int  si,io,ia,i,j;
3605     real *x_ci;
3606
3607     x_ci = work->x_ci;
3608
3609     for(si=0; si<GPU_NSUBCELL; si++)
3610     {
3611         for(i=0; i<na_c; i+=STRIDE_8BB)
3612         {
3613             io = si*na_c + i;
3614             ia = ci*GPU_NSUBCELL*na_c + io;
3615             for(j=0; j<STRIDE_8BB; j++)
3616             {
3617                 x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
3618                 x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
3619                 x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
3620             }
3621         }
3622     }
3623 }
3624 #endif
3625
3626 static real nbnxn_rlist_inc_nonloc_fac = 0.6;
3627
3628 /* Due to the cluster size the effective pair-list is longer than
3629  * that of a simple atom pair-list. This function gives the extra distance.
3630  */
3631 real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
3632 {
3633     return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
3634 }
3635
3636 /* Estimates the interaction volume^2 for non-local interactions */
3637 static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
3638 {
3639     int  z,d;
3640     real cl,ca,za;
3641     real vold_est;
3642     real vol2_est_tot;
3643
3644     vol2_est_tot = 0;
3645
3646     /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
3647      * not home interaction volume^2. As these volumes are not additive,
3648      * this is an overestimate, but it would only be significant in the limit
3649      * of small cells, where we anyhow need to split the lists into
3650      * as small parts as possible.
3651      */
3652
3653     for(z=0; z<zones->n; z++)
3654     {
3655         if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
3656         {
3657             cl = 0;
3658             ca = 1;
3659             za = 1;
3660             for(d=0; d<DIM; d++)
3661             {
3662                 if (zones->shift[z][d] == 0)
3663                 {
3664                     cl += 0.5*ls[d];
3665                     ca *= ls[d];
3666                     za *= zones->size[z].x1[d] - zones->size[z].x0[d];
3667                 }
3668             }
3669
3670             /* 4 octants of a sphere */
3671             vold_est  = 0.25*M_PI*r*r*r*r;
3672             /* 4 quarter pie slices on the edges */
3673             vold_est += 4*cl*M_PI/6.0*r*r*r;
3674             /* One rectangular volume on a face */
3675             vold_est += ca*0.5*r*r;
3676
3677             vol2_est_tot += vold_est*za;
3678         }
3679     }
3680
3681     return vol2_est_tot;
3682 }
3683
3684 /* Estimates the average size of a full j-list for super/sub setup */
3685 static int get_nsubpair_max(const nbnxn_search_t nbs,
3686                             int iloc,
3687                             real rlist,
3688                             int min_ci_balanced)
3689 {
3690     const nbnxn_grid_t *grid;
3691     rvec ls;
3692     real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
3693     int  nsubpair_max;
3694
3695     grid = &nbs->grid[0];
3696
3697     ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
3698     ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
3699     ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
3700
3701     /* The average squared length of the diagonal of a sub cell */
3702     xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
3703
3704     /* The formulas below are a heuristic estimate of the average nsj per si*/
3705     r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
3706
3707     if (!nbs->DomDec || nbs->zones->n == 1)
3708     {
3709         nsp_est_nl = 0;
3710     }
3711     else
3712     {
3713         nsp_est_nl =
3714             sqr(grid->atom_density/grid->na_c)*
3715             nonlocal_vol2(nbs->zones,ls,r_eff_sup);
3716     }
3717
3718     if (LOCAL_I(iloc))
3719     {
3720         /* Sub-cell interacts with itself */
3721         vol_est  = ls[XX]*ls[YY]*ls[ZZ];
3722         /* 6/2 rectangular volume on the faces */
3723         vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
3724         /* 12/2 quarter pie slices on the edges */
3725         vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
3726         /* 4 octants of a sphere */
3727         vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
3728
3729         nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
3730
3731         /* Subtract the non-local pair count */
3732         nsp_est -= nsp_est_nl;
3733
3734         if (debug)
3735         {
3736             fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
3737                     nsp_est,nsp_est_nl);
3738         }
3739     }
3740     else
3741     {
3742         nsp_est = nsp_est_nl;
3743     }
3744
3745     if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
3746     {
3747         /* We don't need to worry */
3748         nsubpair_max = -1;
3749     }
3750     else
3751     {
3752         /* Thus the (average) maximum j-list size should be as follows */
3753         nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
3754
3755         /* Since the target value is a maximum (this avoid high outliers,
3756          * which lead to load imbalance), not average, we get more lists
3757          * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
3758          * But more importantly, the optimal GPU performance moves
3759          * to lower number of block for very small blocks.
3760          * To compensate we add the maximum pair count per cj4.
3761          */
3762         nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
3763     }
3764
3765     if (debug)
3766     {
3767         fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
3768                 nsp_est,nsubpair_max);
3769     }
3770
3771     return nsubpair_max;
3772 }
3773
3774 /* Debug list print function */
3775 static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3776 {
3777     int i,j;
3778
3779     for(i=0; i<nbl->nci; i++)
3780     {
3781         fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
3782                 nbl->ci[i].ci,nbl->ci[i].shift,
3783                 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
3784
3785         for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
3786         {
3787             fprintf(fp,"  cj %5d  imask %x\n",
3788                     nbl->cj[j].cj,
3789                     nbl->cj[j].excl);
3790         }
3791     }
3792 }
3793
3794 /* Debug list print function */
3795 static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3796 {
3797     int i,j4,j;
3798
3799     for(i=0; i<nbl->nsci; i++)
3800     {
3801         fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
3802                 nbl->sci[i].sci,nbl->sci[i].shift,
3803                 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
3804
3805         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
3806         {
3807             for(j=0; j<4; j++)
3808             {
3809                 fprintf(fp,"  sj %5d  imask %x\n",
3810                         nbl->cj4[j4].cj[j],
3811                         nbl->cj4[j4].imei[0].imask);
3812             }
3813         }
3814     }
3815 }
3816
3817 /* Combine pair lists *nbl generated on multiple threads nblc */
3818 static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
3819                             nbnxn_pairlist_t *nblc)
3820 {
3821     int nsci,ncj4,nexcl;
3822     int n,i;
3823
3824     if (nblc->bSimple)
3825     {
3826         gmx_incons("combine_nblists does not support simple lists");
3827     }
3828
3829     nsci  = nblc->nsci;
3830     ncj4  = nblc->ncj4;
3831     nexcl = nblc->nexcl;
3832     for(i=0; i<nnbl; i++)
3833     {
3834         nsci  += nbl[i]->nsci;
3835         ncj4  += nbl[i]->ncj4;
3836         nexcl += nbl[i]->nexcl;
3837     }
3838
3839     if (nsci > nblc->sci_nalloc)
3840     {
3841         nb_realloc_sci(nblc,nsci);
3842     }
3843     if (ncj4 > nblc->cj4_nalloc)
3844     {
3845         nblc->cj4_nalloc = over_alloc_small(ncj4);
3846         nbnxn_realloc_void((void **)&nblc->cj4,
3847                            nblc->ncj4*sizeof(*nblc->cj4),
3848                            nblc->cj4_nalloc*sizeof(*nblc->cj4),
3849                            nblc->alloc,nblc->free);
3850     }
3851     if (nexcl > nblc->excl_nalloc)
3852     {
3853         nblc->excl_nalloc = over_alloc_small(nexcl);
3854         nbnxn_realloc_void((void **)&nblc->excl,
3855                            nblc->nexcl*sizeof(*nblc->excl),
3856                            nblc->excl_nalloc*sizeof(*nblc->excl),
3857                            nblc->alloc,nblc->free);
3858     }
3859
3860     /* Each thread should copy its own data to the combined arrays,
3861      * as otherwise data will go back and forth between different caches.
3862      */
3863 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
3864     for(n=0; n<nnbl; n++)
3865     {
3866         int sci_offset;
3867         int cj4_offset;
3868         int ci_offset;
3869         int excl_offset;
3870         int i,j4;
3871         const nbnxn_pairlist_t *nbli;
3872
3873         /* Determine the offset in the combined data for our thread */
3874         sci_offset  = nblc->nsci;
3875         cj4_offset  = nblc->ncj4;
3876         ci_offset   = nblc->nci_tot;
3877         excl_offset = nblc->nexcl;
3878
3879         for(i=0; i<n; i++)
3880         {
3881             sci_offset  += nbl[i]->nsci;
3882             cj4_offset  += nbl[i]->ncj4;
3883             ci_offset   += nbl[i]->nci_tot;
3884             excl_offset += nbl[i]->nexcl;
3885         }
3886
3887         nbli = nbl[n];
3888
3889         for(i=0; i<nbli->nsci; i++)
3890         {
3891             nblc->sci[sci_offset+i]                = nbli->sci[i];
3892             nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
3893             nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
3894         }
3895
3896         for(j4=0; j4<nbli->ncj4; j4++)
3897         {
3898             nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
3899             nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
3900             nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
3901         }
3902
3903         for(j4=0; j4<nbli->nexcl; j4++)
3904         {
3905             nblc->excl[excl_offset+j4] = nbli->excl[j4];
3906         }
3907     }
3908
3909     for(n=0; n<nnbl; n++)
3910     {
3911         nblc->nsci    += nbl[n]->nsci;
3912         nblc->ncj4    += nbl[n]->ncj4;
3913         nblc->nci_tot += nbl[n]->nci_tot;
3914         nblc->nexcl   += nbl[n]->nexcl;
3915     }
3916 }
3917
3918 /* Returns the next ci to be processes by our thread */
3919 static gmx_bool next_ci(const nbnxn_grid_t *grid,
3920                         int conv,
3921                         int nth,int ci_block,
3922                         int *ci_x,int *ci_y,
3923                         int *ci_b,int *ci)
3924 {
3925     (*ci_b)++;
3926     (*ci)++;
3927
3928     if (*ci_b == ci_block)
3929     {
3930         /* Jump to the next block assigned to this task */
3931         *ci   += (nth - 1)*ci_block;
3932         *ci_b  = 0;
3933     }
3934
3935     if (*ci >= grid->nc*conv)
3936     {
3937         return FALSE;
3938     }
3939
3940     while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
3941     {
3942         *ci_y += 1;
3943         if (*ci_y == grid->ncy)
3944         {
3945             *ci_x += 1;
3946             *ci_y  = 0;
3947         }
3948     }
3949
3950     return TRUE;
3951 }
3952
3953 /* Returns the distance^2 for which we put cell pairs in the list
3954  * without checking atom pair distances. This is usually < rlist^2.
3955  */
3956 static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
3957                                         const nbnxn_grid_t *gridj,
3958                                         real rlist,
3959                                         gmx_bool simple)
3960 {
3961     /* If the distance between two sub-cell bounding boxes is less
3962      * than this distance, do not check the distance between
3963      * all particle pairs in the sub-cell, since then it is likely
3964      * that the box pair has atom pairs within the cut-off.
3965      * We use the nblist cut-off minus 0.5 times the average x/y diagonal
3966      * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
3967      * Using more than 0.5 gains at most 0.5%.
3968      * If forces are calculated more than twice, the performance gain
3969      * in the force calculation outweighs the cost of checking.
3970      * Note that with subcell lists, the atom-pair distance check
3971      * is only performed when only 1 out of 8 sub-cells in within range,
3972      * this is because the GPU is much faster than the cpu.
3973      */
3974     real bbx,bby;
3975     real rbb2;
3976
3977     bbx = 0.5*(gridi->sx + gridj->sx);
3978     bby = 0.5*(gridi->sy + gridj->sy);
3979     if (!simple)
3980     {
3981         bbx /= GPU_NSUBCELL_X;
3982         bby /= GPU_NSUBCELL_Y;
3983     }
3984
3985     rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
3986
3987 #ifndef GMX_DOUBLE
3988     return rbb2;
3989 #else
3990     return (float)((1+GMX_FLOAT_EPS)*rbb2);
3991 #endif
3992 }
3993
3994 static int get_ci_block_size(const nbnxn_grid_t *gridi,
3995                              gmx_bool bDomDec, int nth)
3996 {
3997     const int ci_block_enum = 5;
3998     const int ci_block_denom = 11;
3999     const int ci_block_min_atoms = 16;
4000     int ci_block;
4001
4002     /* Here we decide how to distribute the blocks over the threads.
4003      * We use prime numbers to try to avoid that the grid size becomes
4004      * a multiple of the number of threads, which would lead to some
4005      * threads getting "inner" pairs and others getting boundary pairs,
4006      * which in turns will lead to load imbalance between threads.
4007      * Set the block size as 5/11/ntask times the average number of cells
4008      * in a y,z slab. This should ensure a quite uniform distribution
4009      * of the grid parts of the different thread along all three grid
4010      * zone boundaries with 3D domain decomposition. At the same time
4011      * the blocks will not become too small.
4012      */
4013     ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
4014
4015     /* Ensure the blocks are not too small: avoids cache invalidation */
4016     if (ci_block*gridi->na_sc < ci_block_min_atoms)
4017     {
4018         ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
4019     }
4020     
4021     /* Without domain decomposition
4022      * or with less than 3 blocks per task, divide in nth blocks.
4023      */
4024     if (!bDomDec || ci_block*3*nth > gridi->nc)
4025     {
4026         ci_block = (gridi->nc + nth - 1)/nth;
4027     }
4028
4029     return ci_block;
4030 }
4031
4032 /* Generates the part of pair-list nbl assigned to our thread */
4033 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
4034                                      const nbnxn_grid_t *gridi,
4035                                      const nbnxn_grid_t *gridj,
4036                                      nbnxn_search_work_t *work,
4037                                      const nbnxn_atomdata_t *nbat,
4038                                      const t_blocka *excl,
4039                                      real rlist,
4040                                      int nb_kernel_type,
4041                                      int ci_block,
4042                                      gmx_bool bFBufferFlag,
4043                                      int nsubpair_max,
4044                                      gmx_bool progBal,
4045                                      int min_ci_balanced,
4046                                      int th,int nth,
4047                                      nbnxn_pairlist_t *nbl)
4048 {
4049     int  na_cj_2log;
4050     matrix box;
4051     real rl2;
4052     float rbb2;
4053     int  d;
4054     int  ci_b,ci,ci_x,ci_y,ci_xy,cj;
4055     ivec shp;
4056     int  tx,ty,tz;
4057     int  shift;
4058     gmx_bool bMakeList;
4059     real shx,shy,shz;
4060     int  conv_i,cell0_i;
4061     const float *bb_i,*bbcz_i,*bbcz_j;
4062     const int *flags_i;
4063     real bx0,bx1,by0,by1,bz0,bz1;
4064     real bz1_frac;
4065     real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
4066     int  cxf,cxl,cyf,cyf_x,cyl;
4067     int  cx,cy;
4068     int  c0,c1,cs,cf,cl;
4069     int  ndistc;
4070     int  ncpcheck;
4071     int  gridi_flag_shift=0,gridj_flag_shift=0;
4072     unsigned *gridj_flag=NULL;
4073     int  ncj_old_i,ncj_old_j;
4074
4075     nbs_cycle_start(&work->cc[enbsCCsearch]);
4076
4077     if (gridj->bSimple != nbl->bSimple)
4078     {
4079         gmx_incons("Grid incompatible with pair-list");
4080     }
4081
4082     sync_work(nbl);
4083     nbl->na_sc = gridj->na_sc;
4084     nbl->na_ci = gridj->na_c;
4085     nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4086     na_cj_2log = get_2log(nbl->na_cj);
4087
4088     nbl->rlist  = rlist;
4089
4090     if (bFBufferFlag)
4091     {
4092         /* Determine conversion of clusters to flag blocks */
4093         gridi_flag_shift = 0;
4094         while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4095         {
4096             gridi_flag_shift++;
4097         }
4098         gridj_flag_shift = 0;
4099         while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4100         {
4101             gridj_flag_shift++;
4102         }
4103
4104         gridj_flag = work->buffer_flags.flag;
4105     }
4106
4107     copy_mat(nbs->box,box);
4108
4109     rl2 = nbl->rlist*nbl->rlist;
4110
4111     rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
4112
4113     if (debug)
4114     {
4115         fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
4116     }
4117
4118     /* Set the shift range */
4119     for(d=0; d<DIM; d++)
4120     {
4121         /* Check if we need periodicity shifts.
4122          * Without PBC or with domain decomposition we don't need them.
4123          */
4124         if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
4125         {
4126             shp[d] = 0;
4127         }
4128         else
4129         {
4130             if (d == XX &&
4131                 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4132             {
4133                 shp[d] = 2;
4134             }
4135             else
4136             {
4137                 shp[d] = 1;
4138             }
4139         }
4140     }
4141
4142     if (nbl->bSimple && !gridi->bSimple)
4143     {
4144         conv_i  = gridi->na_sc/gridj->na_sc;
4145         bb_i    = gridi->bb_simple;
4146         bbcz_i  = gridi->bbcz_simple;
4147         flags_i = gridi->flags_simple;
4148     }
4149     else
4150     {
4151         conv_i  = 1;
4152         bb_i    = gridi->bb;
4153         bbcz_i  = gridi->bbcz;
4154         flags_i = gridi->flags;
4155     }
4156     cell0_i = gridi->cell0*conv_i;
4157
4158     bbcz_j = gridj->bbcz;
4159
4160     if (conv_i != 1)
4161     {
4162         /* Blocks of the conversion factor - 1 give a large repeat count
4163          * combined with a small block size. This should result in good
4164          * load balancing for both small and large domains.
4165          */
4166         ci_block = conv_i - 1;
4167     }
4168     if (debug)
4169     {
4170         fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4171                 gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
4172     }
4173
4174     ndistc = 0;
4175     ncpcheck = 0;
4176
4177     /* Initially ci_b and ci to 1 before where we want them to start,
4178      * as they will both be incremented in next_ci.
4179      */
4180     ci_b = -1;
4181     ci   = th*ci_block - 1;
4182     ci_x = 0;
4183     ci_y = 0;
4184     while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
4185     {
4186         if (nbl->bSimple && flags_i[ci] == 0)
4187         {
4188             continue;
4189         }
4190
4191         ncj_old_i = nbl->ncj;
4192
4193         d2cx = 0;
4194         if (gridj != gridi && shp[XX] == 0)
4195         {
4196             if (nbl->bSimple)
4197             {
4198                 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
4199             }
4200             else
4201             {
4202                 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4203             }
4204             if (bx1 < gridj->c0[XX])
4205             {
4206                 d2cx = sqr(gridj->c0[XX] - bx1);
4207
4208                 if (d2cx >= rl2)
4209                 {
4210                     continue;
4211                 }
4212             }
4213         }
4214
4215         ci_xy = ci_x*gridi->ncy + ci_y;
4216
4217         /* Loop over shift vectors in three dimensions */
4218         for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
4219         {
4220             shz = tz*box[ZZ][ZZ];
4221
4222             bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
4223             bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4224
4225             if (tz == 0)
4226             {
4227                 d2z = 0;
4228             }
4229             else if (tz < 0)
4230             {
4231                 d2z = sqr(bz1);
4232             }
4233             else
4234             {
4235                 d2z = sqr(bz0 - box[ZZ][ZZ]);
4236             }
4237
4238             d2z_cx = d2z + d2cx;
4239
4240             if (d2z_cx >= rl2)
4241             {
4242                 continue;
4243             }
4244
4245             bz1_frac =
4246                 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
4247             if (bz1_frac < 0)
4248             {
4249                 bz1_frac = 0;
4250             }
4251             /* The check with bz1_frac close to or larger than 1 comes later */
4252
4253             for (ty=-shp[YY]; ty<=shp[YY]; ty++)
4254             {
4255                 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
4256
4257                 if (nbl->bSimple)
4258                 {
4259                     by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
4260                     by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
4261                 }
4262                 else
4263                 {
4264                     by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
4265                     by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
4266                 }
4267
4268                 get_cell_range(by0,by1,
4269                                gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
4270                                d2z_cx,rl2,
4271                                &cyf,&cyl);
4272
4273                 if (cyf > cyl)
4274                 {
4275                     continue;
4276                 }
4277
4278                 d2z_cy = d2z;
4279                 if (by1 < gridj->c0[YY])
4280                 {
4281                     d2z_cy += sqr(gridj->c0[YY] - by1);
4282                 }
4283                 else if (by0 > gridj->c1[YY])
4284                 {
4285                     d2z_cy += sqr(by0 - gridj->c1[YY]);
4286                 }
4287
4288                 for (tx=-shp[XX]; tx<=shp[XX]; tx++)
4289                 {
4290                     shift = XYZ2IS(tx,ty,tz);
4291
4292 #ifdef NBNXN_SHIFT_BACKWARD
4293                     if (gridi == gridj && shift > CENTRAL)
4294                     {
4295                         continue;
4296                     }
4297 #endif
4298
4299                     shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
4300
4301                     if (nbl->bSimple)
4302                     {
4303                         bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
4304                         bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
4305                     }
4306                     else
4307                     {
4308                         bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
4309                         bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
4310                     }
4311
4312                     get_cell_range(bx0,bx1,
4313                                    gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
4314                                    d2z_cy,rl2,
4315                                    &cxf,&cxl);
4316
4317                     if (cxf > cxl)
4318                     {
4319                         continue;
4320                     }
4321
4322                     if (nbl->bSimple)
4323                     {
4324                         new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4325                                      nbl->work);
4326                     }
4327                     else
4328                     {
4329                         new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4330                                       nbl->work);
4331                     }
4332
4333 #ifndef NBNXN_SHIFT_BACKWARD
4334                     if (cxf < ci_x)
4335 #else
4336                     if (shift == CENTRAL && gridi == gridj &&
4337                         cxf < ci_x)
4338 #endif
4339                     {
4340                         /* Leave the pairs with i > j.
4341                          * x is the major index, so skip half of it.
4342                          */
4343                         cxf = ci_x;
4344                     }
4345
4346                     if (nbl->bSimple)
4347                     {
4348                         set_icell_bb_simple(bb_i,ci,shx,shy,shz,
4349                                             nbl->work->bb_ci);
4350                     }
4351                     else
4352                     {
4353                         set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
4354                                               nbl->work->bb_ci);
4355                     }
4356
4357                     nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
4358                                      gridi->na_c,nbat->xstride,nbat->x,
4359                                      nbl->work);
4360
4361                     for(cx=cxf; cx<=cxl; cx++)
4362                     {
4363                         d2zx = d2z;
4364                         if (gridj->c0[XX] + cx*gridj->sx > bx1)
4365                         {
4366                             d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
4367                         }
4368                         else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
4369                         {
4370                             d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
4371                         }
4372
4373 #ifndef NBNXN_SHIFT_BACKWARD
4374                         if (gridi == gridj &&
4375                             cx == 0 && cyf < ci_y)
4376 #else
4377                         if (gridi == gridj &&
4378                             cx == 0 && shift == CENTRAL && cyf < ci_y)
4379 #endif
4380                         {
4381                             /* Leave the pairs with i > j.
4382                              * Skip half of y when i and j have the same x.
4383                              */
4384                             cyf_x = ci_y;
4385                         }
4386                         else
4387                         {
4388                             cyf_x = cyf;
4389                         }
4390
4391                         for(cy=cyf_x; cy<=cyl; cy++)
4392                         {
4393                             c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
4394                             c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
4395 #ifdef NBNXN_SHIFT_BACKWARD
4396                             if (gridi == gridj &&
4397                                 shift == CENTRAL && c0 < ci)
4398                             {
4399                                 c0 = ci;
4400                             }
4401 #endif
4402
4403                             d2zxy = d2zx;
4404                             if (gridj->c0[YY] + cy*gridj->sy > by1)
4405                             {
4406                                 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
4407                             }
4408                             else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
4409                             {
4410                                 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
4411                             }
4412                             if (c1 > c0 && d2zxy < rl2)
4413                             {
4414                                 cs = c0 + (int)(bz1_frac*(c1 - c0));
4415                                 if (cs >= c1)
4416                                 {
4417                                     cs = c1 - 1;
4418                                 }
4419
4420                                 d2xy = d2zxy - d2z;
4421
4422                                 /* Find the lowest cell that can possibly
4423                                  * be within range.
4424                                  */
4425                                 cf = cs;
4426                                 while(cf > c0 &&
4427                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
4428                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
4429                                 {
4430                                     cf--;
4431                                 }
4432
4433                                 /* Find the highest cell that can possibly
4434                                  * be within range.
4435                                  */
4436                                 cl = cs;
4437                                 while(cl < c1-1 &&
4438                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
4439                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
4440                                 {
4441                                     cl++;
4442                                 }
4443
4444 #ifdef NBNXN_REFCODE
4445                                 {
4446                                     /* Simple reference code, for debugging,
4447                                      * overrides the more complex code above.
4448                                      */
4449                                     int k;
4450                                     cf = c1;
4451                                     cl = -1;
4452                                     for(k=c0; k<c1; k++)
4453                                     {
4454                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4455                                                       bb+k*NNBSBB_B) < rl2 &&
4456                                             k < cf)
4457                                         {
4458                                             cf = k;
4459                                         }
4460                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4461                                                       bb+k*NNBSBB_B) < rl2 &&
4462                                             k > cl)
4463                                         {
4464                                             cl = k;
4465                                         }
4466                                     }
4467                                 }
4468 #endif
4469
4470                                 if (gridi == gridj)
4471                                 {
4472                                     /* We want each atom/cell pair only once,
4473                                      * only use cj >= ci.
4474                                      */
4475 #ifndef NBNXN_SHIFT_BACKWARD
4476                                     cf = max(cf,ci);
4477 #else
4478                                     if (shift == CENTRAL)
4479                                     {
4480                                         cf = max(cf,ci);
4481                                     }
4482 #endif
4483                                 }
4484
4485                                 if (cf <= cl)
4486                                 {
4487                                     /* For f buffer flags with simple lists */
4488                                     ncj_old_j = nbl->ncj;
4489
4490                                     switch (nb_kernel_type)
4491                                     {
4492                                     case nbk4x4_PlainC:
4493                                         check_subcell_list_space_simple(nbl,cl-cf+1);
4494
4495                                         make_cluster_list_simple(gridj,
4496                                                                  nbl,ci,cf,cl,
4497                                                                  (gridi == gridj && shift == CENTRAL),
4498                                                                  nbat->x,
4499                                                                  rl2,rbb2,
4500                                                                  &ndistc);
4501                                         break;
4502 #ifdef NBNXN_SEARCH_SSE
4503                                     case nbk4xN_X86_SIMD128:
4504                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4505                                         make_cluster_list_x86_simd128(gridj,
4506                                                                       nbl,ci,cf,cl,
4507                                                                       (gridi == gridj && shift == CENTRAL),
4508                                                                       nbat->x,
4509                                                                       rl2,rbb2,
4510                                                                       &ndistc);
4511                                         break;
4512 #ifdef GMX_X86_AVX_256
4513                                     case nbk4xN_X86_SIMD256:
4514                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4515                                         make_cluster_list_x86_simd256(gridj,
4516                                                                       nbl,ci,cf,cl,
4517                                                                       (gridi == gridj && shift == CENTRAL),
4518                                                                       nbat->x,
4519                                                                       rl2,rbb2,
4520                                                                       &ndistc);
4521                                         break;
4522 #endif
4523 #endif
4524                                     case nbk8x8x8_PlainC:
4525                                     case nbk8x8x8_CUDA:
4526                                         check_subcell_list_space_supersub(nbl,cl-cf+1);
4527                                         for(cj=cf; cj<=cl; cj++)
4528                                         {
4529                                             make_cluster_list_supersub(nbs,gridi,gridj,
4530                                                                        nbl,ci,cj,
4531                                                                        (gridi == gridj && shift == CENTRAL && ci == cj),
4532                                                                        nbat->xstride,nbat->x,
4533                                                                        rl2,rbb2,
4534                                                                        &ndistc);
4535                                         }
4536                                         break;
4537                                     }
4538                                     ncpcheck += cl - cf + 1;
4539
4540                                     if (bFBufferFlag && nbl->ncj > ncj_old_j)
4541                                     {
4542                                         int cbf,cbl,cb;
4543
4544                                         cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
4545                                         cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
4546                                         for(cb=cbf; cb<=cbl; cb++)
4547                                         {
4548                                             gridj_flag[cb] = 1U<<th;
4549                                         }
4550                                     }
4551                                 }
4552                             }
4553                         }
4554                     }
4555
4556                     /* Set the exclusions for this ci list */
4557                     if (nbl->bSimple)
4558                     {
4559                         set_ci_top_excls(nbs,
4560                                          nbl,
4561                                          shift == CENTRAL && gridi == gridj,
4562                                          gridj->na_c_2log,
4563                                          na_cj_2log,
4564                                          &(nbl->ci[nbl->nci]),
4565                                          excl);
4566                     }
4567                     else
4568                     {
4569                         set_sci_top_excls(nbs,
4570                                           nbl,
4571                                           shift == CENTRAL && gridi == gridj,
4572                                           gridj->na_c_2log,
4573                                           &(nbl->sci[nbl->nsci]),
4574                                           excl);
4575                     }
4576
4577                     /* Close this ci list */
4578                     if (nbl->bSimple)
4579                     {
4580                         close_ci_entry_simple(nbl);
4581                     }
4582                     else
4583                     {
4584                         close_ci_entry_supersub(nbl,
4585                                                 nsubpair_max,
4586                                                 progBal,min_ci_balanced,
4587                                                 th,nth);
4588                     }
4589                 }
4590             }
4591         }
4592
4593         if (bFBufferFlag && nbl->ncj > ncj_old_i)
4594         {
4595             work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
4596         }
4597     }
4598
4599     work->ndistc = ndistc;
4600
4601     nbs_cycle_stop(&work->cc[enbsCCsearch]);
4602
4603     if (debug)
4604     {
4605         fprintf(debug,"number of distance checks %d\n",ndistc);
4606         fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
4607                 ncpcheck);
4608
4609         if (nbl->bSimple)
4610         {
4611             print_nblist_statistics_simple(debug,nbl,nbs,rlist);
4612         }
4613         else
4614         {
4615             print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
4616         }
4617
4618     }
4619 }
4620
4621 static void reduce_buffer_flags(const nbnxn_search_t nbs,
4622                                 int nsrc,
4623                                 const nbnxn_buffer_flags_t *dest)
4624 {
4625     int s,b;
4626     const unsigned *flag;
4627
4628     for(s=0; s<nsrc; s++)
4629     {
4630         flag = nbs->work[s].buffer_flags.flag;
4631
4632         for(b=0; b<dest->nflag; b++)
4633         {
4634             dest->flag[b] |= flag[b];
4635         }
4636     }
4637 }
4638
4639 static void print_reduction_cost(const nbnxn_buffer_flags_t *flags,int nout)
4640 {
4641     int nelem,nkeep,ncopy,nred,b,c,out;
4642
4643     nelem = 0;
4644     nkeep = 0;
4645     ncopy = 0;
4646     nred  = 0;
4647     for(b=0; b<flags->nflag; b++)
4648     {
4649         if (flags->flag[b] == 1)
4650         {
4651             /* Only flag 0 is set, no copy of reduction required */
4652             nelem++;
4653             nkeep++;
4654         }
4655         else if (flags->flag[b] > 0)
4656         {
4657             c = 0;
4658             for(out=0; out<nout; out++)
4659             {
4660                 if (flags->flag[b] & (1U<<out))
4661                 {
4662                     c++;
4663                 }
4664             }
4665             nelem += c;
4666             if (c == 1)
4667             {
4668                 ncopy++;
4669             }
4670             else
4671             {
4672                 nred += c;
4673             }
4674         }
4675     }
4676
4677     fprintf(debug,"nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
4678             flags->nflag,nout,
4679             nelem/(double)(flags->nflag),
4680             nkeep/(double)(flags->nflag),
4681             ncopy/(double)(flags->nflag),
4682             nred/(double)(flags->nflag));
4683 }
4684
4685 /* Make a local or non-local pair-list, depending on iloc */
4686 void nbnxn_make_pairlist(const nbnxn_search_t nbs,
4687                          nbnxn_atomdata_t *nbat,
4688                          const t_blocka *excl,
4689                          real rlist,
4690                          int min_ci_balanced,
4691                          nbnxn_pairlist_set_t *nbl_list,
4692                          int iloc,
4693                          int nb_kernel_type,
4694                          t_nrnb *nrnb)
4695 {
4696     nbnxn_grid_t *gridi,*gridj;
4697     int nzi,zi,zj0,zj1,zj;
4698     int nsubpair_max;
4699     int th;
4700     int nnbl;
4701     nbnxn_pairlist_t **nbl;
4702     int ci_block;
4703     gmx_bool CombineNBLists;
4704     int np_tot,np_noq,np_hlj,nap;
4705
4706     nnbl            = nbl_list->nnbl;
4707     nbl             = nbl_list->nbl;
4708     CombineNBLists  = nbl_list->bCombined;
4709
4710     if (debug)
4711     {
4712         fprintf(debug,"ns making %d nblists\n", nnbl);
4713     }
4714
4715     nbat->bUseBufferFlags = (nbat->nout > 1);
4716     if (nbat->bUseBufferFlags && LOCAL_I(iloc))
4717     {
4718         init_buffer_flags(&nbat->buffer_flags,nbat->natoms);
4719     }
4720
4721     if (nbl_list->bSimple)
4722     {
4723         switch (nb_kernel_type)
4724         {
4725 #ifdef NBNXN_SEARCH_SSE
4726         case nbk4xN_X86_SIMD128:
4727             nbs->icell_set_x = icell_set_x_x86_simd128;
4728             break;
4729 #ifdef GMX_X86_AVX_256
4730         case nbk4xN_X86_SIMD256:
4731             nbs->icell_set_x = icell_set_x_x86_simd256;
4732             break;
4733 #endif
4734 #endif
4735         default:
4736             nbs->icell_set_x = icell_set_x_simple;
4737             break;
4738         }
4739     }
4740     else
4741     {
4742 #ifdef NBNXN_SEARCH_SSE
4743         nbs->icell_set_x = icell_set_x_supersub_sse8;
4744 #else
4745         nbs->icell_set_x = icell_set_x_supersub;
4746 #endif
4747     }
4748
4749     if (LOCAL_I(iloc))
4750     {
4751         /* Only zone (grid) 0 vs 0 */
4752         nzi = 1;
4753         zj0 = 0;
4754         zj1 = 1;
4755     }
4756     else
4757     {
4758         nzi = nbs->zones->nizone;
4759     }
4760
4761     if (!nbl_list->bSimple && min_ci_balanced > 0)
4762     {
4763         nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
4764     }
4765     else
4766     {
4767         nsubpair_max = 0;
4768     }
4769
4770     /* Clear all pair-lists */
4771     for(th=0; th<nnbl; th++)
4772     {
4773         clear_pairlist(nbl[th]);
4774     }
4775
4776     for(zi=0; zi<nzi; zi++)
4777     {
4778         gridi = &nbs->grid[zi];
4779
4780         if (NONLOCAL_I(iloc))
4781         {
4782             zj0 = nbs->zones->izone[zi].j0;
4783             zj1 = nbs->zones->izone[zi].j1;
4784             if (zi == 0)
4785             {
4786                 zj0++;
4787             }
4788         }
4789         for(zj=zj0; zj<zj1; zj++)
4790         {
4791             gridj = &nbs->grid[zj];
4792
4793             if (debug)
4794             {
4795                 fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
4796             }
4797
4798             nbs_cycle_start(&nbs->cc[enbsCCsearch]);
4799
4800             if (nbl[0]->bSimple && !gridi->bSimple)
4801             {
4802                 /* Hybrid list, determine blocking later */
4803                 ci_block = 0;
4804             }
4805             else
4806             {
4807                 ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl);
4808             }
4809
4810 #pragma omp parallel for num_threads(nnbl) schedule(static)
4811             for(th=0; th<nnbl; th++)
4812             {
4813                 if (nbat->bUseBufferFlags && zi == 0 && zj == 0)
4814                 {
4815                     init_buffer_flags(&nbs->work[th].buffer_flags,nbat->natoms);
4816                 }
4817
4818                 if (CombineNBLists && th > 0)
4819                 {
4820                     clear_pairlist(nbl[th]);
4821                 }
4822
4823                 /* Divide the i super cell equally over the nblists */
4824                 nbnxn_make_pairlist_part(nbs,gridi,gridj,
4825                                          &nbs->work[th],nbat,excl,
4826                                          rlist,
4827                                          nb_kernel_type,
4828                                          ci_block,
4829                                          nbat->bUseBufferFlags,
4830                                          nsubpair_max,
4831                                          (LOCAL_I(iloc) || nbs->zones->n <= 2),
4832                                          min_ci_balanced,
4833                                          th,nnbl,
4834                                          nbl[th]);
4835             }
4836             nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
4837
4838             np_tot = 0;
4839             np_noq = 0;
4840             np_hlj = 0;
4841             for(th=0; th<nnbl; th++)
4842             {
4843                 inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
4844
4845                 if (nbl_list->bSimple)
4846                 {
4847                     np_tot += nbl[th]->ncj;
4848                     np_noq += nbl[th]->work->ncj_noq;
4849                     np_hlj += nbl[th]->work->ncj_hlj;
4850                 }
4851                 else
4852                 {
4853                     /* This count ignores potential subsequent pair pruning */
4854                     np_tot += nbl[th]->nci_tot;
4855                 }
4856             }
4857             nap = nbl[0]->na_ci*nbl[0]->na_cj;
4858             nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
4859             nbl_list->natpair_lj  = np_noq*nap;
4860             nbl_list->natpair_q   = np_hlj*nap/2;
4861
4862             if (CombineNBLists && nnbl > 1)
4863             {
4864                 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
4865
4866                 combine_nblists(nnbl-1,nbl+1,nbl[0]);
4867
4868                 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
4869             }
4870         }
4871     }
4872
4873     if (nbat->bUseBufferFlags)
4874     {
4875         reduce_buffer_flags(nbs,nnbl,&nbat->buffer_flags);
4876     }
4877
4878     /*
4879     print_supersub_nsp("nsubpair",nbl[0],iloc);
4880     */
4881
4882     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4883     if (LOCAL_I(iloc))
4884     {
4885         nbs->search_count++;
4886     }
4887     if (nbs->print_cycles &&
4888         (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
4889         nbs->search_count % 100 == 0)
4890     {
4891         nbs_cycle_print(stderr,nbs);
4892     }
4893
4894     if (debug && (CombineNBLists && nnbl > 1))
4895     {
4896         if (nbl[0]->bSimple)
4897         {
4898             print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
4899         }
4900         else
4901         {
4902             print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
4903         }
4904     }
4905
4906     if (debug)
4907     {
4908         if (gmx_debug_at)
4909         {
4910             if (nbl[0]->bSimple)
4911             {
4912                 print_nblist_ci_cj(debug,nbl[0]);
4913             }
4914             else
4915             {
4916                 print_nblist_sci_cj(debug,nbl[0]);
4917             }
4918         }
4919
4920         if (nbat->bUseBufferFlags)
4921         {
4922             print_reduction_cost(&nbat->buffer_flags,nnbl);
4923         }
4924     }
4925 }