/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>
#include "sysstuff.h"
#include "smalloc.h"
#include "macros.h"
#include "maths.h"
#include "vec.h"
#include "pbc.h"
#include "nbnxn_consts.h"
#include "nbnxn_internal.h"
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "gmx_cyclecounter.h"
#include "gmxfio.h"
#include "gmx_omp_nthreads.h"
#include "nrnb.h"


/* Pair search box lower and upper corner in x,y,z.
 * Store this in 4 instead of 3 reals, which is useful with SSE.
 * To avoid complicating the code we also use 4 without SSE.
 */
#define NNBSBB_C         4
#define NNBSBB_B         (2*NNBSBB_C)
/* Pair search box lower and upper bound in z only. */
#define NNBSBB_D         2
/* Pair search box lower and upper corner x,y,z indices */
#define BBL_X  0
#define BBL_Y  1
#define BBL_Z  2
#define BBU_X  4
#define BBU_Y  5
#define BBU_Z  6
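
/* Illustrative sketch (an addition for clarity, not part of the build):
 * with the layout above, each bounding box occupies NNBSBB_B = 8 floats,
 * the lower corner at offsets 0-2 and the upper corner at offsets 4-6;
 * offsets 3 and 7 are SSE padding. The function name here is hypothetical,
 * only meant to show the indexing.
 */
#if 0
#include <stdio.h>

static void print_bb_example(const float *bb,int c)
{
    const float *b = bb + c*NNBSBB_B;   /* box c: 8 floats per box */

    printf("lower %f %f %f upper %f %f %f\n",
           b[BBL_X],b[BBL_Y],b[BBL_Z],  /* offsets 0,1,2 */
           b[BBU_X],b[BBU_Y],b[BBU_Z]); /* offsets 4,5,6 */
}
#endif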


#ifdef NBNXN_SEARCH_SSE

#ifndef GMX_DOUBLE
#define NBNXN_SEARCH_SSE_SINGLE
#include "gmx_x86_simd_single.h"
#else
#include "gmx_x86_simd_double.h"
#endif

#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
#define NBNXN_8BB_SSE
#endif

/* The width of SSE/AVX-128 with single precision for bounding boxes with GPU.
 * Here AVX-256 turns out to be slightly slower than AVX-128.
 */
#define STRIDE_8BB        4
#define STRIDE_8BB_2LOG   2


/* The functions below are macros as they are performance sensitive */

/* 4x4 list, pack=4: no complex conversion required */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J4(ci)   (ci)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)

/* 4x2 list, pack=4: j-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J2(ci)  ((ci)<<1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))

/* 4x8 list, pack=8: i-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J8(ci)  ((ci)>>1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)

/* The j-cluster size is matched to the SIMD width */
#ifndef GMX_DOUBLE
/* 128 bits can hold 4 floats */
#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
#define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
#define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
/* 256 bits can hold 8 floats */
#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
#define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
#define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
#else
/* 128 bits can hold 2 doubles */
#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
#define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
#define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
/* 256 bits can hold 4 doubles */
#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
#define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
#define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
#endif

#endif /* NBNXN_SEARCH_SSE */
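
/* Illustrative sketch (an addition, not part of the build): a few worked
 * values for the index macros above, assuming PACK_X4 == 4 and
 * STRIDE_P4 == DIM*PACK_X4 as in the packed coordinate layout. With
 * j-clusters of size 2, i-cluster ci maps to j-clusters 2*ci and 2*ci+1,
 * and an odd j-cluster starts PACK_X4/2 reals into its pack.
 */
#if 0
#include <assert.h>

static void cluster_index_example(void)
{
    assert(CI_TO_CJ_J2(3) == 6);               /* i-cluster 3 -> first j-cluster 6 */
    assert(X_IND_CJ_J2(6) == 3*STRIDE_P4);     /* even j-cluster: start of pack 3 */
    assert(X_IND_CJ_J2(7) == 3*STRIDE_P4 + 2); /* odd j-cluster: half-pack offset */
    assert(CI_TO_CJ_J8(5) == 2);               /* two i-clusters share a j-cluster */
}
#endif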


/* Interaction masks for 4xN atom interactions.
 * Bit i*CJ_SIZE + j tells whether atoms i and j interact.
 */
/* The "all interactions" mask is the same for all kernels */
#define NBNXN_INT_MASK_ALL        0xffffffff
/* 4x4 kernel diagonal mask */
#define NBNXN_INT_MASK_DIAG       0x08ce
/* 4x2 kernel diagonal masks */
#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
/* 4x8 kernel diagonal masks */
#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
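
/* The diagonal masks can be derived rather than taken on faith. A sketch
 * (an addition, not part of the build) that reproduces the 4x4 mask: bit
 * i*4+j is set exactly when j > i, so only the pairs above the diagonal
 * of an i-cluster paired with itself are computed.
 */
#if 0
static unsigned int make_diag_mask_4x4(void)
{
    unsigned int mask = 0;
    int i,j;

    for(i=0; i<4; i++)
    {
        for(j=i+1; j<4; j++)
        {
            mask |= 1U << (i*4 + j);
        }
    }

    return mask;  /* == 0x08ce == NBNXN_INT_MASK_DIAG */
}
#endif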


#ifdef NBNXN_SEARCH_SSE
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
/* Size of bounding box corners quadruplet */
#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
#endif
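
/* A sketch of the packed layout (an addition, not part of the build):
 * the corners of STRIDE_8BB = 4 consecutive bounding boxes are stored as
 * xlxlxlxl ylylylyl zlzlzlzl xhxhxhxh yhyhyhyh zhzhzhzh, i.e.
 * NNBSBB_XXXX = 2*DIM*STRIDE_8BB = 24 floats per quadruplet, so one SSE
 * load fetches the same component of 4 boxes at once. The helper name is
 * hypothetical; the indexing matches calc_bounding_box_xxxx below.
 */
#if 0
static float bb_lower_x(const float *bb,int c)
{
    return bb[(c>>STRIDE_8BB_2LOG)*NNBSBB_XXXX +
              0*STRIDE_8BB +              /* 0: xl, 1: yl, 2: zl, 3: xh, ... */
              (c & (STRIDE_8BB-1))];
}
#endif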

/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
#define NBNXN_SHIFT_BACKWARD


/* This define is a lazy way to avoid interdependence of the grid
 * and searching data structures.
 */
#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)


static void nbs_cycle_clear(nbnxn_cycle_t *cc)
{
    int i;

    for(i=0; i<enbsCCnr; i++)
    {
        cc[i].count = 0;
        cc[i].c     = 0;
    }
}

static double Mcyc_av(const nbnxn_cycle_t *cc)
{
    return (double)cc->c*1e-6/cc->count;
}

static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
{
    int n;
    int t;

    fprintf(fp,"\n");
    fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
            nbs->cc[enbsCCgrid].count,
            Mcyc_av(&nbs->cc[enbsCCgrid]),
            Mcyc_av(&nbs->cc[enbsCCsearch]),
            Mcyc_av(&nbs->cc[enbsCCreducef]));

    if (nbs->nthread_max > 1)
    {
        if (nbs->cc[enbsCCcombine].count > 0)
        {
            fprintf(fp," comb %5.2f",
                    Mcyc_av(&nbs->cc[enbsCCcombine]));
        }
        fprintf(fp," s. th");
        for(t=0; t<nbs->nthread_max; t++)
        {
            fprintf(fp," %4.1f",
                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
        }
    }
    fprintf(fp,"\n");
}

static void nbnxn_grid_init(nbnxn_grid_t * grid)
{
    grid->cxy_na      = NULL;
    grid->cxy_ind     = NULL;
    grid->cxy_nalloc  = 0;
    grid->bb          = NULL;
    grid->bbj         = NULL;
    grid->nc_nalloc   = 0;
}

static int get_2log(int n)
{
    int log2;

    log2 = 0;
    while ((1<<log2) < n)
    {
        log2++;
    }
    if ((1<<log2) != n)
    {
        gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
    }

    return log2;
}

static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
{
    switch (nb_kernel_type)
    {
    case nbk4x4_PlainC:
    case nbk4xN_X86_SIMD128:
    case nbk4xN_X86_SIMD256:
        return NBNXN_CPU_CLUSTER_I_SIZE;
    case nbk8x8x8_CUDA:
    case nbk8x8x8_PlainC:
        /* The cluster size for super/sub lists is only set here.
         * Any value should work for the pair-search and atomdata code.
         * The kernels, of course, might require a particular value.
         */
        return NBNXN_GPU_CLUSTER_SIZE;
    default:
        gmx_incons("unknown kernel type");
    }

    return 0;
}

int nbnxn_kernel_to_cj_size(int nb_kernel_type)
{
    switch (nb_kernel_type)
    {
    case nbk4x4_PlainC:
        return NBNXN_CPU_CLUSTER_I_SIZE;
    case nbk4xN_X86_SIMD128:
        /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
        return 16/sizeof(real);
    case nbk4xN_X86_SIMD256:
        /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
        return 32/sizeof(real);
    case nbk8x8x8_CUDA:
    case nbk8x8x8_PlainC:
        return nbnxn_kernel_to_ci_size(nb_kernel_type);
    default:
        gmx_incons("unknown kernel type");
    }

    return 0;
}
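
/* Worked numbers for the j-cluster sizes above (an addition, not part of
 * the build): with 4-byte floats, 128-bit SIMD gives 16/4 = 4 atoms per
 * j-cluster and 256-bit gives 8; with 8-byte doubles the same kernels get
 * 2 and 4, which is what the CI_TO_CJ_S128/S256 macros encode.
 */
#if 0
#include <assert.h>

static void cj_size_example(void)
{
#ifndef GMX_DOUBLE
    assert(nbnxn_kernel_to_cj_size(nbk4xN_X86_SIMD128) == 4);
    assert(nbnxn_kernel_to_cj_size(nbk4xN_X86_SIMD256) == 8);
#else
    assert(nbnxn_kernel_to_cj_size(nbk4xN_X86_SIMD128) == 2);
    assert(nbnxn_kernel_to_cj_size(nbk4xN_X86_SIMD256) == 4);
#endif
}
#endif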

static int ci_to_cj(int na_cj_2log,int ci)
{
    switch (na_cj_2log)
    {
    case 2: return  ci;     break;
    case 1: return (ci<<1); break;
    case 3: return (ci>>1); break;
    }

    return 0;
}

gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
{
    if (nb_kernel_type == nbkNotSet)
    {
        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
    }

    switch (nb_kernel_type)
    {
    case nbk8x8x8_CUDA:
    case nbk8x8x8_PlainC:
        return FALSE;

    case nbk4x4_PlainC:
    case nbk4xN_X86_SIMD128:
    case nbk4xN_X86_SIMD256:
        return TRUE;

    default:
        gmx_incons("Invalid nonbonded kernel type passed!");
        return FALSE;
    }
}

void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
                       ivec *n_dd_cells,
                       gmx_domdec_zones_t *zones,
                       int nthread_max)
{
    nbnxn_search_t nbs;
    int d,g,t;

    snew(nbs,1);
    *nbs_ptr = nbs;

    nbs->DomDec = (n_dd_cells != NULL);

    clear_ivec(nbs->dd_dim);
    nbs->ngrid = 1;
    if (nbs->DomDec)
    {
        nbs->zones = zones;

        for(d=0; d<DIM; d++)
        {
            if ((*n_dd_cells)[d] > 1)
            {
                nbs->dd_dim[d] = 1;
                /* Each grid matches a DD zone */
                nbs->ngrid *= 2;
            }
        }
    }

    snew(nbs->grid,nbs->ngrid);
    for(g=0; g<nbs->ngrid; g++)
    {
        nbnxn_grid_init(&nbs->grid[g]);
    }
    nbs->cell        = NULL;
    nbs->cell_nalloc = 0;
    nbs->a           = NULL;
    nbs->a_nalloc    = 0;

    nbs->nthread_max = nthread_max;

    /* Initialize the work data structures for each thread */
    snew(nbs->work,nbs->nthread_max);
    for(t=0; t<nbs->nthread_max; t++)
    {
        nbs->work[t].cxy_na           = NULL;
        nbs->work[t].cxy_na_nalloc    = 0;
        nbs->work[t].sort_work        = NULL;
        nbs->work[t].sort_work_nalloc = 0;
    }

    /* Initialize detailed nbsearch cycle counting */
    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
    nbs->search_count = 0;
    nbs_cycle_clear(nbs->cc);
    for(t=0; t<nbs->nthread_max; t++)
    {
        nbs_cycle_clear(nbs->work[t].cc);
    }
}

static real grid_atom_density(int n,rvec corner0,rvec corner1)
{
    rvec size;

    rvec_sub(corner1,corner0,size);

    return n/(size[XX]*size[YY]*size[ZZ]);
}

static int set_grid_size_xy(const nbnxn_search_t nbs,
                            nbnxn_grid_t *grid,
                            int n,rvec corner0,rvec corner1,
                            real atom_density,
                            int XFormat)
{
    rvec size;
    int  na_c;
    real adens,tlen,tlen_x,tlen_y,nc_max;
    int  t;

    rvec_sub(corner1,corner0,size);

    if (n > grid->na_sc)
    {
        /* target cell length */
        if (grid->bSimple)
        {
            /* To minimize the zero interactions, we should make
             * the largest of the i/j cells cubic.
             */
            na_c = max(grid->na_c,grid->na_cj);

            /* Approximately cubic cells */
            tlen   = pow(na_c/atom_density,1.0/3.0);
            tlen_x = tlen;
            tlen_y = tlen;
        }
        else
        {
            /* Approximately cubic sub cells */
            tlen   = pow(grid->na_c/atom_density,1.0/3.0);
            tlen_x = tlen*GPU_NSUBCELL_X;
            tlen_y = tlen*GPU_NSUBCELL_Y;
        }
        /* We round ncx and ncy down, because we get fewer cell pairs
         * in the pair list when the fixed cell dimensions (x,y) are
         * larger than the variable one (z) than the other way around.
         */
        grid->ncx = max(1,(int)(size[XX]/tlen_x));
        grid->ncy = max(1,(int)(size[YY]/tlen_y));
    }
    else
    {
        grid->ncx = 1;
        grid->ncy = 1;
    }

    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
    {
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
        srenew(grid->cxy_na,grid->cxy_nalloc);
        srenew(grid->cxy_ind,grid->cxy_nalloc+1);
    }
    for(t=0; t<nbs->nthread_max; t++)
    {
        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
        {
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
            srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
        }
    }

    /* Worst-case scenario: 1 atom in each last cell */
    if (grid->na_cj <= grid->na_c)
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
    }
    else
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
    }

    if (nc_max > grid->nc_nalloc)
    {
        int bb_nalloc;

        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc,grid->nc_nalloc);
        srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
#ifdef NBNXN_8BB_SSE
        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
#else
        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
#endif
        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, which avoids possible
         * floating-point exceptions in SSE from the unused bb elements.
         */
        snew_aligned(grid->bb,bb_nalloc,16);

        if (grid->bSimple)
        {
            if (grid->na_cj == grid->na_c)
            {
                grid->bbj = grid->bb;
            }
            else
            {
                sfree_aligned(grid->bbj);
                snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
            }
        }

        srenew(grid->flags,grid->nc_nalloc);
    }

    copy_rvec(corner0,grid->c0);
    copy_rvec(corner1,grid->c1);
    grid->sx = size[XX]/grid->ncx;
    grid->sy = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;

    return nc_max;
}
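
/* Worked example for the target cell length above (an addition, not part
 * of the build; the density value is an assumption, roughly that of water):
 * with na_c = 4 and atom_density ~ 100 atoms/nm^3,
 * tlen = (4/100)^(1/3) ~ 0.34 nm, so a 5 nm box edge gives 14 columns.
 */
#if 0
#include <math.h>
#include <stdio.h>

static void cell_size_example(void)
{
    double atom_density = 100.0;            /* assumed, atoms/nm^3 */
    double tlen         = pow(4.0/atom_density,1.0/3.0);
    int    ncx          = (int)(5.0/tlen);  /* rounded down, as above */

    printf("tlen %.3f nm, ncx %d\n",tlen,ncx);  /* ~0.342 nm, 14 */
}
#endif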

#define SORT_GRID_OVERSIZE 2
#define SGSF (SORT_GRID_OVERSIZE + 1)

static void sort_atoms(int dim,gmx_bool Backwards,
                       int *a,int n,rvec *x,
                       real h0,real invh,int nsort,int *sort)
{
    int i,c;
    int zi,zim;
    int cp,tmp;

    if (n <= 1)
    {
        /* Nothing to do */
        return;
    }

    /* For small oversize factors clearing the whole area is fastest.
     * For large oversize we should clear the used elements after use.
     */
    for(i=0; i<nsort; i++)
    {
        sort[i] = -1;
    }
    /* Sort the particles using a simple index sort */
    for(i=0; i<n; i++)
    {
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        zi = (int)((x[a[i]][dim] - h0)*invh);

#ifdef DEBUG_NBNXN_GRIDDING
        if (zi < 0 || zi >= nsort)
        {
            gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
                      a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
        }
#endif

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to give
             * a well-defined output order, independent of input order,
             * which ensures binary reproducibility after restarts.
             */
            while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                    (x[a[i]][dim] == x[sort[zi]][dim] &&
                                     a[i] > sort[zi])))
            {
                zi++;
            }

            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                cp = sort[zi];
                zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    tmp = sort[zim];
                    sort[zim] = cp;
                    cp  = tmp;
                    zim++;
                }
                sort[zim] = cp;
            }
            sort[zi] = a[i];
        }
    }

    c = 0;
    if (!Backwards)
    {
        for(zi=0; zi<nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++] = sort[zi];
            }
        }
    }
    else
    {
        for(zi=nsort-1; zi>=0; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[c++] = sort[zi];
            }
        }
    }
    if (c < n)
    {
        gmx_incons("Lost particles while sorting");
    }
}
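
/* Standalone miniature of the index sort above (an addition, not part of
 * the build): values are binned into an oversized slot array, a collision
 * pushes the incumbent chain up one slot, and reading the slots back in
 * order yields the sorted indices. Like sort_atoms, it relies on nsort
 * being oversized enough that the walk never runs off the end.
 */
#if 0
#include <assert.h>

static void mini_index_sort(int *a,int n,const float *v,
                            float h0,float invh,int nsort,int *sort)
{
    int i,zi,c;

    for(i=0; i<nsort; i++)
    {
        sort[i] = -1;
    }
    for(i=0; i<n; i++)
    {
        zi = (int)((v[a[i]] - h0)*invh);
        /* walk up past slots holding values that should come first */
        while (sort[zi] >= 0 && v[a[i]] >= v[sort[zi]])
        {
            zi++;
        }
        if (sort[zi] >= 0)
        {
            /* shift the occupied chain up by one to free slot zi */
            int cp = sort[zi],zim = zi + 1;

            while (sort[zim] >= 0)
            {
                int tmp = sort[zim];
                sort[zim] = cp;
                cp = tmp;
                zim++;
            }
            sort[zim] = cp;
        }
        sort[zi] = a[i];
    }
    c = 0;
    for(zi=0; zi<nsort; zi++)
    {
        if (sort[zi] >= 0)
        {
            a[c++] = sort[zi];
        }
    }
    assert(c == n);
}
#endif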

#ifdef GMX_DOUBLE
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
#else
#define R2F_D(x) (x)
#define R2F_U(x) (x)
#endif
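
/* In double precision the bounding-box corners are computed in double but
 * stored as float. The macros above nudge lower bounds down and upper
 * bounds up by one float epsilon, so the float box is guaranteed to
 * contain the double-precision coordinates. A sketch (an addition, not
 * part of the build):
 */
#if 0
#include <assert.h>

static void r2f_example(void)
{
    double xl = 1.0000000001;  /* not exactly representable as float */

    assert(R2F_D(xl) <= xl);   /* a lower corner never moves up */
    assert(R2F_U(xl) >= xl);   /* an upper corner never moves down */
}
#endif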

/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na,int stride,const real *x,float *bb)
{
    int  i,j;
    real xl,xh,yl,yh,zl,zh;

    i = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[i+XX]);
        xh = max(xh,x[i+XX]);
        yl = min(yl,x[i+YY]);
        yh = max(yh,x[i+YY]);
        zl = min(zl,x[i+ZZ]);
        zh = max(zh,x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);
}

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
{
    int  j;
    real xl,xh,yl,yh,zl,zh;

    xl = x[XX*PACK_X4];
    xh = x[XX*PACK_X4];
    yl = x[YY*PACK_X4];
    yh = x[YY*PACK_X4];
    zl = x[ZZ*PACK_X4];
    zh = x[ZZ*PACK_X4];
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[j+XX*PACK_X4]);
        xh = max(xh,x[j+XX*PACK_X4]);
        yl = min(yl,x[j+YY*PACK_X4]);
        yh = max(yh,x[j+YY*PACK_X4]);
        zl = min(zl,x[j+ZZ*PACK_X4]);
        zh = max(zh,x[j+ZZ*PACK_X4]);
    }
    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);
}

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
{
    int  j;
    real xl,xh,yl,yh,zl,zh;

    xl = x[XX*PACK_X8];
    xh = x[XX*PACK_X8];
    yl = x[YY*PACK_X8];
    yh = x[YY*PACK_X8];
    zl = x[ZZ*PACK_X8];
    zh = x[ZZ*PACK_X8];
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[j+XX*PACK_X8]);
        xh = max(xh,x[j+XX*PACK_X8]);
        yl = min(yl,x[j+YY*PACK_X8]);
        yh = max(yh,x[j+YY*PACK_X8]);
        zl = min(zl,x[j+ZZ*PACK_X8]);
        zh = max(zh,x[j+ZZ*PACK_X8]);
    }
    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);
}

#ifdef NBNXN_SEARCH_SSE

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na,const real *x,
                                          float *bb,float *bbj)
{
    calc_bounding_box_x_x4(min(na,2),x,bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
        _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
    }

    _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
                                        _mm_load_ps(bbj+NNBSBB_B)));
    _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
                                        _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
}

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
{
    int  i,j;
    real xl,xh,yl,yh,zl,zh;

    i = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[i+XX]);
        xh = max(xh,x[i+XX]);
        yl = min(yl,x[i+YY]);
        yh = max(yh,x[i+YY]);
        zl = min(zl,x[i+ZZ]);
        zh = max(zh,x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0*STRIDE_8BB] = R2F_D(xl);
    bb[1*STRIDE_8BB] = R2F_D(yl);
    bb[2*STRIDE_8BB] = R2F_D(zl);
    bb[3*STRIDE_8BB] = R2F_U(xh);
    bb[4*STRIDE_8BB] = R2F_U(yh);
    bb[5*STRIDE_8BB] = R2F_U(zh);
}

#endif /* NBNXN_SEARCH_SSE */

#ifdef NBNXN_SEARCH_SSE_SINGLE

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_sse(int na,const float *x,float *bb)
{
    __m128 bb_0_SSE,bb_1_SSE;
    __m128 x_SSE;

    int  i;

    bb_0_SSE = _mm_load_ps(x);
    bb_1_SSE = bb_0_SSE;

    for(i=1; i<na; i++)
    {
        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
        bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
        bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
    }

    _mm_store_ps(bb  ,bb_0_SSE);
    _mm_store_ps(bb+4,bb_1_SSE);
}

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_sse(int na,const float *x,
                                       float *bb_work,
                                       real *bb)
{
    calc_bounding_box_sse(na,x,bb_work);

    bb[0*STRIDE_8BB] = bb_work[BBL_X];
    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
    bb[3*STRIDE_8BB] = bb_work[BBU_X];
    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
    bb[5*STRIDE_8BB] = bb_work[BBU_Z];
}

#endif /* NBNXN_SEARCH_SSE_SINGLE */

#ifdef NBNXN_SEARCH_SSE

/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
{
    int    i,j,sc2,nc2,c2;
    __m128 min_SSE,max_SSE;

    for(i=0; i<grid->ncx*grid->ncy; i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        nc2 = (grid->cxy_na[i]+3)>>(2+1);
        for(c2=sc2; c2<sc2+nc2; c2++)
        {
            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
        }
        if (((grid->cxy_na[i]+3)>>2) & 1)
        {
            /* Copy the last bb for odd bb count in this column */
            for(j=0; j<NNBSBB_C; j++)
            {
                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
            }
        }
    }
}

#endif


/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE *fp,
                                 const nbnxn_search_t nbs,
                                 const nbnxn_grid_t *grid)
{
    int  c,d;
    dvec ba;

    clear_dvec(ba);
    for(c=0; c<grid->nc; c++)
    {
        for(d=0; d<DIM; d++)
        {
            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
        }
    }
    dsvmul(1.0/grid->nc,ba,ba);

    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/grid->ncx,
            nbs->box[YY][YY]/grid->ncy,
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
            ba[XX],ba[YY],ba[ZZ],
            ba[XX]*grid->ncx/nbs->box[XX][XX],
            ba[YY]*grid->ncy/nbs->box[YY][YY],
            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
}

/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE *fp,
                                   const nbnxn_search_t nbs,
                                   const nbnxn_grid_t *grid)
{
    int  ns,c,s;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for(c=0; c<grid->nc; c++)
    {
#ifdef NBNXN_BBXXXX
        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
        {
            int cs_w,i,d;

            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
            for(i=0; i<STRIDE_8BB; i++)
            {
                for(d=0; d<DIM; d++)
                {
                    ba[d] +=
                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
                }
            }
        }
#else
        for(s=0; s<grid->nsubc[c]; s++)
        {
            int cs,d;

            cs = c*GPU_NSUBCELL + s;
            for(d=0; d<DIM; d++)
            {
                ba[d] +=
                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
                    grid->bb[cs*NNBSBB_B         +d];
            }
        }
#endif
        ns += grid->nsubc[c];
    }
    dsvmul(1.0/ns,ba,ba);

    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
            ba[XX],ba[YY],ba[ZZ],
            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
}

/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 * Also sets interaction flags.
 */
void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
                int a0,int a1,const int *atinfo,
                int *order,
                int *flags)
{
    int subc,s,a,n1,n2,a_lj_max,i,j;
    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    gmx_bool haveQ;

    *flags = 0;

    subc = 0;
    for(s=a0; s<a1; s+=na_c)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        n1 = 0;
        n2 = 0;
        haveQ = FALSE;
        a_lj_max = -1;
        for(a=s; a<min(s+na_c,a1); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2*n1 <= na_c)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= na_c)
                {
                    for(i=0; i<n1; i++)
                    {
                        order[a0+i] = sort1[i];
                    }
                    for(j=0; j<n2; j++)
                    {
                        order[a0+n1+j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}

/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
void fill_cell(const nbnxn_search_t nbs,
               nbnxn_grid_t *grid,
               nbnxn_atomdata_t *nbat,
               int a0,int a1,
               const int *atinfo,
               rvec *x,
               int sx,int sy,int sz,
               float *bb_work)
{
    int    na,a;
    size_t offset;
    float  *bb_ptr;

    na = a1 - a0;

    if (grid->bSimple)
    {
        sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
    }

    /* Now we have sorted the atoms, set the cell indices */
    for(a=a0; a<a1; a++)
    {
        nbs->cell[nbs->a[a]] = a;
    }

    copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
                           nbat->XFormat,nbat->x,a0,
                           sx,sy,sz);

    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
        bb_ptr = grid->bb + offset;

#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
        if (2*grid->na_cj == grid->na_c)
        {
            calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
                                          grid->bbj+offset*2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
    }
#ifdef NBNXN_BBXXXX
    else if (!grid->bSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SSE calculations: xxxxyyyyzzzz...
         */
        bb_ptr =
            grid->bb +
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));

#ifdef NBNXN_SEARCH_SSE_SINGLE
        if (nbat->XFormat == nbatXYZQ)
        {
            calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
                                       bb_work,bb_ptr);
        }
        else
#endif
        {
            calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
                                   bb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    sx,sy,sz,
                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;

        calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
                          bb_ptr);

        if (gmx_debug_at)
        {
            int bbo;
            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    sx,sy,sz,
                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
        }
    }
}

/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                int dd_zone,
                                nbnxn_grid_t *grid,
                                int a0,int a1,
                                const int *atinfo,
                                rvec *x,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start,int cxy_end,
                                int *sort_work)
{
    int  cxy;
    int  cx,cy,cz,ncz,cfilled,c;
    int  na,ash,ind,a;
    int  na_c,ash_c;

    if (debug)
    {
        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0,cxy_start,cxy_end,a0,a1);
    }

    /* Sort the atoms within each x,y column in 3 dimensions */
    for(cxy=cxy_start; cxy<cxy_end; cxy++)
    {
        cx = cxy/grid->ncy;
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ,FALSE,
                   nbs->a+ash,na,x,
                   grid->c0[ZZ],
                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
                   ncz*grid->na_sc*SGSF,sort_work);

        /* Fill the ncz cells in this column */
        cfilled = grid->cxy_ind[cxy];
        for(cz=0; cz<ncz; cz++)
        {
            c = grid->cxy_ind[cxy] + cz;

            ash_c = ash + cz*grid->na_sc;
            na_c  = min(grid->na_sc,na-(ash_c-ash));

            fill_cell(nbs,grid,nbat,
                      ash_c,ash_c+na_c,atinfo,x,
                      grid->na_sc*cx + (dd_zone >> 2),
                      grid->na_sc*cy + (dd_zone & 3),
                      grid->na_sc*cz,
                      NULL);

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (na_c > 0)
            {
                cfilled = c;
            }
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
        }

        /* Set the unused atom indices to -1 */
        for(ind=na; ind<ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}

/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  int dd_zone,
                                  nbnxn_grid_t *grid,
                                  int a0,int a1,
                                  const int *atinfo,
                                  rvec *x,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start,int cxy_end,
                                  int *sort_work)
{
    int  cxy;
    int  cx,cy,cz=-1,c=-1,ncz;
    int  na,ash,na_c,ind,a;
    int  subdiv_z,sub_z,na_z,ash_z;
    int  subdiv_y,sub_y,na_y,ash_y;
    int  subdiv_x,sub_x,na_x,ash_x;

    /* cppcheck-suppress unassignedVariable */
    float bb_work_array[NNBSBB_B+3],*bb_work_align;

    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));

    if (debug)
    {
        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0,cxy_start,cxy_end,a0,a1);
    }

    subdiv_x = grid->na_c;
    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for(cxy=cxy_start; cxy<cxy_end; cxy++)
    {
        cx = cxy/grid->ncy;
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ,FALSE,
                   nbs->a+ash,na,x,
                   grid->c0[ZZ],
                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
                   ncz*grid->na_sc*SGSF,sort_work);

        /* This loop goes over the supercells and subcells along z at once */
        for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
        {
            ash_z = ash + sub_z*subdiv_z;
            na_z  = min(subdiv_z,na-(ash_z-ash));

            /* We have already sorted on z */

            if (sub_z % GPU_NSUBCELL_Z == 0)
            {
                cz = sub_z/GPU_NSUBCELL_Z;
                c  = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this supercell */
                na_c = min(grid->na_sc,na-(ash_z-ash));

                grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
            }

#if GPU_NSUBCELL_Y > 1
            /* Sort the atoms along y */
            sort_atoms(YY,(sub_z & 1),
                       nbs->a+ash_z,na_z,x,
                       grid->c0[YY]+cy*grid->sy,grid->inv_sy,
                       subdiv_y*SGSF,sort_work);
#endif

            for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
            {
                ash_y = ash_z + sub_y*subdiv_y;
                na_y  = min(subdiv_y,na-(ash_y-ash));

#if GPU_NSUBCELL_X > 1
                /* Sort the atoms along x */
                sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
                           nbs->a+ash_y,na_y,x,
                           grid->c0[XX]+cx*grid->sx,grid->inv_sx,
                           subdiv_x*SGSF,sort_work);
#endif

                for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
                {
                    ash_x = ash_y + sub_x*subdiv_x;
                    na_x  = min(subdiv_x,na-(ash_x-ash));

                    fill_cell(nbs,grid,nbat,
                              ash_x,ash_x+na_x,atinfo,x,
                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
                              grid->na_c*sub_z,
                              bb_work_align);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for(ind=na; ind<ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}

/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int a0,int a1,
                                rvec *x,const int *move,
                                int thread,int nthread,
                                int *cell,
                                int *cxy_na)
{
    int  n0,n1,i;
    int  cx,cy;

    /* We add one extra cell for particles which moved during DD */
    for(i=0; i<grid->ncx*grid->ncy+1; i++)
    {
        cxy_na[i] = 0;
    }

    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
    for(i=n0; i<n1; i++)
    {
        if (move == NULL || move[i] >= 0)
        {
            /* We need to be careful with rounding,
             * particles might be a few bits outside the local box.
             * The int cast takes care of the lower bound,
             * we need to explicitly take care of the upper bound.
             */
            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            if (cx == grid->ncx)
            {
                cx = grid->ncx - 1;
            }
            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
            if (cy == grid->ncy)
            {
                cy = grid->ncy - 1;
            }
            /* For the moment, cell contains only the grid-local
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;

#ifdef DEBUG_NBNXN_GRIDDING
            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
            {
                gmx_fatal(FARGS,
                          "grid cell cx %d cy %d out of range (max %d %d)",
                          cx,cy,grid->ncx,grid->ncy);
            }
#endif
        }
        else
        {
            /* Put this moved particle after the end of the grid,
             * so we can process it later without using conditionals.
             */
            cell[i] = grid->ncx*grid->ncy;
        }

        cxy_na[cell[i]]++;
    }
}
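
/* Illustrative sketch (an addition, not part of the build) of the x,y to
 * column mapping above: a particle exactly on the upper box edge would land
 * on index ncx (or ncy) and is clamped back into the last column; the
 * column index runs fastest in y.
 */
#if 0
static int column_index_example(const nbnxn_grid_t *grid,real x,real y)
{
    int cx = (int)((x - grid->c0[XX])*grid->inv_sx);
    int cy = (int)((y - grid->c0[YY])*grid->inv_sy);

    if (cx == grid->ncx) { cx = grid->ncx - 1; }
    if (cy == grid->ncy) { cy = grid->ncy - 1; }

    return cx*grid->ncy + cy;
}
#endif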

/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              int dd_zone,
                              nbnxn_grid_t *grid,
                              int a0,int a1,
                              const int *atinfo,
                              rvec *x,
                              const int *move,
                              nbnxn_atomdata_t *nbat)
{
    int  n0,n1,i;
    int  cx,cy,cxy,ncz_max,ncz;
    int  nthread,thread;
    int  *cxy_na,cxy_na_i;

    nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for(thread=0; thread<nthread; thread++)
    {
        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
                            nbs->cell,nbs->work[thread].cxy_na);
    }

    /* Make the cell index as a function of x and y */
    ncz_max = 0;
    ncz = 0;
    grid->cxy_ind[0] = 0;
    for(i=0; i<grid->ncx*grid->ncy+1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
         * that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        cxy_na_i = nbs->work[0].cxy_na[i];
        for(thread=1; thread<nthread; thread++)
        {
            cxy_na_i += nbs->work[thread].cxy_na[i];
        }
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    }
    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

    if (debug)
    {
        fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc,grid->na_c,grid->nc,
                grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
                ncz_max);
        if (gmx_debug_at)
        {
            i = 0;
            for(cy=0; cy<grid->ncy; cy++)
            {
                for(cx=0; cx<grid->ncx; cx++)
                {
                    fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
                    i++;
                }
                fprintf(debug,"\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
    {
        for(thread=0; thread<nbs->nthread_max; thread++)
        {
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
        }
    }

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for(i=a0; i<a1; i++)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        cxy = nbs->cell[i];
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
    }

    /* Set the cell indices for the moved particles */
    n0 = grid->nc*grid->na_sc;
    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
    for(i=n0; i<n1; i++)
    {
        nbs->cell[nbs->a[i]] = i;
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for(thread=0; thread<nthread; thread++)
    {
        if (grid->bSimple)
        {
            sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                nbs->work[thread].sort_work);
        }
        else
        {
            sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                  nbs->work[thread].sort_work);
        }
    }

#ifdef NBNXN_SEARCH_SSE
    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid,grid->bb);
    }
#endif

    if (!grid->bSimple)
    {
        grid->nsubc_tot = 0;
        for(i=0; i<grid->nc; i++)
        {
            grid->nsubc_tot += grid->nsubc[i];
        }
    }

    if (debug)
    {
        if (grid->bSimple)
        {
            print_bbsizes_simple(debug,nbs,grid);
        }
        else
        {
            fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug,nbs,grid);
        }
    }
}
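
/* Illustrative sketch (an addition, not part of the build) of the cxy_ind
 * construction above: it is the exclusive prefix sum of the per-column
 * cell counts, so cxy_ind[i+1]-cxy_ind[i] is the number of z-cells in
 * column i and the last entry is the total cell count. With na_sc = 4 and
 * per-column atom counts {5,3,9} this gives ncz = {2,1,3} and
 * cxy_ind = {0,2,3,6}.
 */
#if 0
static void cxy_ind_example(void)
{
    int na_sc     = 4;
    int cxy_na[3] = { 5, 3, 9 };
    int cxy_ind[4];
    int i;

    cxy_ind[0] = 0;
    for(i=0; i<3; i++)
    {
        int ncz = (cxy_na[i] + na_sc - 1)/na_sc;  /* ceiling division */

        cxy_ind[i+1] = cxy_ind[i] + ncz;
    }
    /* column i occupies cells cxy_ind[i] .. cxy_ind[i+1]-1 */
}
#endif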

/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t nbs,
                       int ePBC,matrix box,
                       int dd_zone,
                       rvec corner0,rvec corner1,
                       int a0,int a1,
                       real atom_density,
                       const int *atinfo,
                       rvec *x,
                       int nmoved,int *move,
                       int nb_kernel_type,
                       nbnxn_atomdata_t *nbat)
{
    nbnxn_grid_t *grid;
    int n;
    int nc_max_grid,nc_max;

    grid = &nbs->grid[dd_zone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

    if (dd_zone == 0)
    {
        grid->cell0 = 0;
    }
    else
    {
        grid->cell0 =
            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
    }

    n = a1 - a0;

    if (dd_zone == 0)
    {
        nbs->ePBC = ePBC;
        copy_mat(box,nbs->box);

        if (atom_density >= 0)
        {
            grid->atom_density = atom_density;
        }
        else
        {
            grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
        }

        grid->cell0 = 0;

        nbs->natoms_local    = a1 - nmoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (dd_zone=0).
         */
        nbs->natoms_nonlocal = a1 - nmoved;
    }
    else
    {
        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
    }

    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
                                   nbs->grid[0].atom_density,
                                   nbat->XFormat);

    nc_max = grid->cell0 + nc_max_grid;

    if (a1 > nbs->cell_nalloc)
    {
        nbs->cell_nalloc = over_alloc_large(a1);
        srenew(nbs->cell,nbs->cell_nalloc);
    }

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
    {
        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
        srenew(nbs->a,nbs->a_nalloc);
    }

    if (nc_max*grid->na_sc > nbat->nalloc)
    {
        nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc);
    }

    calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);

    if (dd_zone == 0)
    {
        nbat->natoms_local = nbat->natoms;
    }

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
}

/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
                                const gmx_domdec_zones_t *zones,
                                const int *atinfo,
                                rvec *x,
                                int nb_kernel_type,
                                nbnxn_atomdata_t *nbat)
{
    int  zone,d;
    rvec c0,c1;

    for(zone=1; zone<zones->n; zone++)
    {
        for(d=0; d<DIM; d++)
        {
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];
        }

        nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
                          zone,c0,c1,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
                          -1,
                          atinfo,
                          x,
                          0,NULL,
                          nb_kernel_type,
                          nbat);
    }
}
1695
1696 /* Add simple grid type information to the local super/sub grid */
1697 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1698                            nbnxn_atomdata_t *nbat)
1699 {
1700     nbnxn_grid_t *grid;
1701     float *bbcz,*bb;
1702     int ncd,sc;
1703
1704     grid = &nbs->grid[0];
1705
1706     if (grid->bSimple)
1707     {
1708         gmx_incons("nbnxn_grid_add_simple called with a simple grid");
1709     }
1710
1711     ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1712
1713     if (grid->nc*ncd > grid->nc_nalloc_simple)
1714     {
1715         grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1716         srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
1717         srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
1718         srenew(grid->flags_simple,grid->nc_nalloc_simple);
1719         if (nbat->XFormat)
1720         {
1721             sfree_aligned(grid->bbj);
1722             snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
1723         }
1724     }
1725
1726     bbcz = grid->bbcz_simple;
1727     bb   = grid->bb_simple;
1728
1729 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
1730     for(sc=0; sc<grid->nc; sc++)
1731     {
1732         int c,tx,na;
1733
1734         for(c=0; c<ncd; c++)
1735         {
1736             tx = sc*ncd + c;
1737
1738             na = NBNXN_CPU_CLUSTER_I_SIZE;
1739             while (na > 0 &&
1740                    nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1741             {
1742                 na--;
1743             }
1744
1745             if (na > 0)
1746             {
1747                 switch (nbat->XFormat)
1748                 {
1749                 case nbatX4:
1750                     /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1751                     calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
1752                                            bb+tx*NNBSBB_B);
1753                     break;
1754                 case nbatX8:
1755                     /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1756                     calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1757                                            bb+tx*NNBSBB_B);
1758                     break;
1759                 default:
1760                     calc_bounding_box(na,nbat->xstride,
1761                                       nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1762                                       bb+tx*NNBSBB_B);
1763                     break;
1764                 }
1765                 bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
1766                 bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
1767
1768                 /* No interaction optimization yet here */
1769                 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1770             }
1771             else
1772             {
1773                 grid->flags_simple[tx] = 0;
1774             }
1775         }
1776     }
1777
1778 #ifdef NBNXN_SEARCH_SSE
1779     if (grid->bSimple && nbat->XFormat == nbatX8)
1780     {
1781         combine_bounding_box_pairs(grid,grid->bb_simple);
1782     }
1783 #endif
1784 }
1785
1786 void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
1787 {
1788     *ncx = nbs->grid[0].ncx;
1789     *ncy = nbs->grid[0].ncy;
1790 }
1791
1792 void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
1793 {
1794     const nbnxn_grid_t *grid;
1795
1796     grid = &nbs->grid[0];
1797
1798     /* Return the atom order for the home cell (index 0) */
1799     *a  = nbs->a;
1800
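    /* Note (editorial): cxy_ind[ncx*ncy] holds the total cell count, so n
     * presumably covers all atom slots in the home grid, including any
     * filler slots padding the cells.
     */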
1801     *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1802 }
1803
1804 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1805 {
1806     nbnxn_grid_t *grid;
1807     int ao,cx,cy,cxy,cz,j;
1808
1809     /* Set the atom order for the home cell (index 0) */
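    /* nbs->a maps a grid slot to its atom index, and nbs->cell maps an atom
     * index back to its grid slot; the loops below fill both consistently.
     */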
1810     grid = &nbs->grid[0];
1811
1812     ao = 0;
1813     for(cx=0; cx<grid->ncx; cx++)
1814     {
1815         for(cy=0; cy<grid->ncy; cy++)
1816         {
1817             cxy = cx*grid->ncy + cy;
1818             j   = grid->cxy_ind[cxy]*grid->na_sc;
1819             for(cz=0; cz<grid->cxy_na[cxy]; cz++)
1820             {
1821                 nbs->a[j]     = ao;
1822                 nbs->cell[ao] = j;
1823                 ao++;
1824                 j++;
1825             }
1826         }
1827     }
1828 }
1829
1830 /* Determines the cell range along one dimension that
1831  * the bounding box b0 - b1 sees.
1832  */
1833 static void get_cell_range(real b0,real b1,
1834                            int nc,real c0,real s,real invs,
1835                            real d2,real r2,int *cf,int *cl)
1836 {
1837     *cf = max((int)((b0 - c0)*invs),0);
1838
1839     while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
1840     {
1841         (*cf)--;
1842     }
1843
1844     *cl = min((int)((b1 - c0)*invs),nc-1);
1845     while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
1846     {
1847         (*cl)++;
1848     }
1849 }
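
/* Worked example (illustrative, not from the original source): with c0 = 0,
 * cell size s = 0.5 (invs = 2) and a bounding box from b0 = 1.1 to b1 = 1.4,
 * the initial range is cf = (int)(1.1*2) = 2 to cl = (int)(1.4*2) = 2.
 * The while loops then extend the range for as long as d2, the squared
 * distance contribution from the other dimensions, plus the squared gap to
 * the next cell edge stays below the cut-off r2.
 */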
1850
1851 /* Reference code calculating the distance^2 between two bounding boxes */
1852 static float box_dist2(float bx0,float bx1,float by0,
1853                        float by1,float bz0,float bz1,
1854                        const float *bb)
1855 {
1856     float d2;
1857     float dl,dh,dm,dm0;
1858
1859     d2 = 0;
1860
1861     dl  = bx0 - bb[BBU_X];
1862     dh  = bb[BBL_X] - bx1;
1863     dm  = max(dl,dh);
1864     dm0 = max(dm,0);
1865     d2 += dm0*dm0;
1866
1867     dl  = by0 - bb[BBU_Y];
1868     dh  = bb[BBL_Y] - by1;
1869     dm  = max(dl,dh);
1870     dm0 = max(dm,0);
1871     d2 += dm0*dm0;
1872
1873     dl  = bz0 - bb[BBU_Z];
1874     dh  = bb[BBL_Z] - bz1;
1875     dm  = max(dl,dh);
1876     dm0 = max(dm,0);
1877     d2 += dm0*dm0;
1878
1879     return d2;
1880 }
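
/* In each dimension dl and dh are the signed gaps between the two boxes;
 * when the boxes overlap in that dimension both are negative, the clamped
 * term max(dl,dh,0) contributes zero, and d2 ends up as the squared distance
 * between the closest points of the two boxes.
 */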
1881
1882 /* Plain C code calculating the distance^2 between two bounding boxes */
1883 static float subc_bb_dist2(int si,const float *bb_i_ci,
1884                            int csj,const float *bb_j_all)
1885 {
1886     const float *bb_i,*bb_j;
1887     float d2;
1888     float dl,dh,dm,dm0;
1889
1890     bb_i = bb_i_ci  +  si*NNBSBB_B;
1891     bb_j = bb_j_all + csj*NNBSBB_B;
1892
1893     d2 = 0;
1894
1895     dl  = bb_i[BBL_X] - bb_j[BBU_X];
1896     dh  = bb_j[BBL_X] - bb_i[BBU_X];
1897     dm  = max(dl,dh);
1898     dm0 = max(dm,0);
1899     d2 += dm0*dm0;
1900
1901     dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
1902     dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
1903     dm  = max(dl,dh);
1904     dm0 = max(dm,0);
1905     d2 += dm0*dm0;
1906
1907     dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
1908     dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
1909     dm  = max(dl,dh);
1910     dm0 = max(dm,0);
1911     d2 += dm0*dm0;
1912
1913     return d2;
1914 }
1915
1916 #ifdef NBNXN_SEARCH_SSE
1917
1918 /* SSE code for bb distance for bb format xyz0 */
1919 static float subc_bb_dist2_sse(int na_c,
1920                               int si,const float *bb_i_ci,
1921                               int csj,const float *bb_j_all)
1922 {
1923     const float *bb_i,*bb_j;
1924
1925     __m128 bb_i_SSE0,bb_i_SSE1;
1926     __m128 bb_j_SSE0,bb_j_SSE1;
1927     __m128 dl_SSE;
1928     __m128 dh_SSE;
1929     __m128 dm_SSE;
1930     __m128 dm0_SSE;
1931     __m128 d2_SSE;
1932 #ifndef GMX_X86_SSE4_1
1933     float d2_array[7],*d2_align;
1934
1935     d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
1936 #else
1937     float d2;
1938 #endif
1939
1940     bb_i = bb_i_ci  +  si*NNBSBB_B;
1941     bb_j = bb_j_all + csj*NNBSBB_B;
1942
1943     bb_i_SSE0 = _mm_load_ps(bb_i);
1944     bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
1945     bb_j_SSE0 = _mm_load_ps(bb_j);
1946     bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
1947
1948     dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
1949     dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
1950
1951     dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
1952     dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
1953 #ifndef GMX_X86_SSE4_1
1954     d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
1955
1956     _mm_store_ps(d2_align,d2_SSE);
1957
1958     return d2_align[0] + d2_align[1] + d2_align[2];
1959 #else
1960     /* SSE4.1 dot product of components 0,1,2 */
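    /* In the immediate mask 0x71 the high nibble 0x7 selects components
     * 0,1,2 as inputs to the dot product and the low nibble 0x1 stores the
     * result only in component 0.
     */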
1961     d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
1962
1963     _mm_store_ss(&d2,d2_SSE);
1964
1965     return d2;
1966 #endif
1967 }
1968
1969 /* Calculate the bounding-box distances of bb_i[si,...,si+3] and store them in d2 */
1970 #define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
1971 {                                                \
1972     int    shi;                                  \
1973                                                  \
1974     __m128 dx_0,dy_0,dz_0;                       \
1975     __m128 dx_1,dy_1,dz_1;                       \
1976                                                  \
1977     __m128 mx,my,mz;                             \
1978     __m128 m0x,m0y,m0z;                          \
1979                                                  \
1980     __m128 d2x,d2y,d2z;                          \
1981     __m128 d2s,d2t;                              \
1982                                                  \
1983     shi = si*NNBSBB_D*DIM;                       \
1984                                                  \
1985     xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
1986     yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
1987     zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
1988     xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
1989     yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
1990     zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
1991                                                  \
1992     dx_0 = _mm_sub_ps(xi_l,xj_h);                \
1993     dy_0 = _mm_sub_ps(yi_l,yj_h);                \
1994     dz_0 = _mm_sub_ps(zi_l,zj_h);                \
1995                                                  \
1996     dx_1 = _mm_sub_ps(xj_l,xi_h);                \
1997     dy_1 = _mm_sub_ps(yj_l,yi_h);                \
1998     dz_1 = _mm_sub_ps(zj_l,zi_h);                \
1999                                                  \
2000     mx   = _mm_max_ps(dx_0,dx_1);                \
2001     my   = _mm_max_ps(dy_0,dy_1);                \
2002     mz   = _mm_max_ps(dz_0,dz_1);                \
2003                                                  \
2004     m0x  = _mm_max_ps(mx,zero);                  \
2005     m0y  = _mm_max_ps(my,zero);                  \
2006     m0z  = _mm_max_ps(mz,zero);                  \
2007                                                  \
2008     d2x  = _mm_mul_ps(m0x,m0x);                  \
2009     d2y  = _mm_mul_ps(m0y,m0y);                  \
2010     d2z  = _mm_mul_ps(m0z,m0z);                  \
2011                                                  \
2012     d2s  = _mm_add_ps(d2x,d2y);                  \
2013     d2t  = _mm_add_ps(d2s,d2z);                  \
2014                                                  \
2015     _mm_store_ps(d2+si,d2t);                     \
2016 }
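
/* Note on the xxxx layout used above: the bounding boxes of 4 sub-cells are
 * stored as six packed vectors x_lo[4], y_lo[4], z_lo[4], x_hi[4], y_hi[4],
 * z_hi[4], which is exactly what the six loads in the macro read.
 */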
2017
2018 /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
2019 static void subc_bb_dist2_sse_xxxx(const float *bb_j,
2020                                    int nsi,const float *bb_i,
2021                                    float *d2)
2022 {
2023     __m128 xj_l,yj_l,zj_l;
2024     __m128 xj_h,yj_h,zj_h;
2025     __m128 xi_l,yi_l,zi_l;
2026     __m128 xi_h,yi_h,zi_h;
2027
2028     __m128 zero;
2029
2030     zero = _mm_setzero_ps();
2031
2032     xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
2033     yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
2034     zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
2035     xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
2036     yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
2037     zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
2038
2039     /* Here we "loop" over si from 0 to nsi in steps of STRIDE_8BB.
2040      * But as we know the number of iterations is 1 or 2, we unroll manually.
2041      */
2042     SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
2043     if (STRIDE_8BB < nsi)
2044     {
2045         SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
2046     }
2047 }
2048
2049 #endif /* NBNXN_SEARCH_SSE */
2050
2051 /* Plain C function which determines if any atom pair between two cells
2052  * is within distance sqrt(rl2).
2053  */
2054 static gmx_bool subc_in_range_x(int na_c,
2055                                 int si,const real *x_i,
2056                                 int csj,int stride,const real *x_j,
2057                                 real rl2)
2058 {
2059     int  i,j,i0,j0;
2060     real d2;
2061
2062     for(i=0; i<na_c; i++)
2063     {
2064         i0 = (si*na_c + i)*DIM;
2065         for(j=0; j<na_c; j++)
2066         {
2067             j0 = (csj*na_c + j)*stride;
2068
2069             d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
2070                  sqr(x_i[i0+1] - x_j[j0+1]) +
2071                  sqr(x_i[i0+2] - x_j[j0+2]);
2072
2073             if (d2 < rl2)
2074             {
2075                 return TRUE;
2076             }
2077         }
2078     }
2079
2080     return FALSE;
2081 }
2082
2083 /* SSE function which determines if any atom pair between two cells,
2084  * both with 8 atoms, is within distance sqrt(rl2).
2085  */
2086 static gmx_bool subc_in_range_sse8(int na_c,
2087                                    int si,const real *x_i,
2088                                    int csj,int stride,const real *x_j,
2089                                    real rl2)
2090 {
2091 #ifdef NBNXN_SEARCH_SSE_SINGLE
2092     __m128 ix_SSE0,iy_SSE0,iz_SSE0;
2093     __m128 ix_SSE1,iy_SSE1,iz_SSE1;
2094
2095     __m128 rc2_SSE;
2096
2097     int na_c_sse;
2098     int j0,j1;
2099
2100     rc2_SSE   = _mm_set1_ps(rl2);
2101
2102     na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
2103     ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
2104     iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
2105     iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
2106     ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
2107     iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
2108     iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
2109
2110     /* We loop from the outer to the inner particles to maximize
2111      * the chance that we find a pair in range quickly and return.
2112      */
2113     j0 = csj*na_c;
2114     j1 = j0 + na_c - 1;
2115     while (j0 < j1)
2116     {
2117         __m128 jx0_SSE,jy0_SSE,jz0_SSE;
2118         __m128 jx1_SSE,jy1_SSE,jz1_SSE;
2119
2120         __m128 dx_SSE0,dy_SSE0,dz_SSE0;
2121         __m128 dx_SSE1,dy_SSE1,dz_SSE1;
2122         __m128 dx_SSE2,dy_SSE2,dz_SSE2;
2123         __m128 dx_SSE3,dy_SSE3,dz_SSE3;
2124
2125         __m128 rsq_SSE0;
2126         __m128 rsq_SSE1;
2127         __m128 rsq_SSE2;
2128         __m128 rsq_SSE3;
2129
2130         __m128 wco_SSE0;
2131         __m128 wco_SSE1;
2132         __m128 wco_SSE2;
2133         __m128 wco_SSE3;
2134         __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
2135
2136         jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
2137         jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
2138         jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
2139
2140         jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
2141         jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
2142         jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
2143
2144         /* Calculate distance */
2145         dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
2146         dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
2147         dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
2148         dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
2149         dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
2150         dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
2151         dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
2152         dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
2153         dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
2154         dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
2155         dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
2156         dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
2157
2158         /* rsq = dx*dx+dy*dy+dz*dz */
2159         rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2160         rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2161         rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2162         rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2163
2164         wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
2165         wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
2166         wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
2167         wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
2168
2169         wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
2170         wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
2171         wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
2172
2173         if (_mm_movemask_ps(wco_any_SSE))
2174         {
2175             return TRUE;
2176         }
2177
2178         j0++;
2179         j1--;
2180     }
2181     return FALSE;
2182
2183 #else
2184     /* No SSE */
2185     gmx_incons("SSE function called without SSE support");
2186
2187     return TRUE;
2188 #endif
2189 }
2190
2191 /* Returns the j sub-cell for index cj_ind */
2192 static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
2193 {
2194     return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
2195 }
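
/* Example (illustrative): each cj4 struct packs NBNXN_GPU_JGROUP_SIZE = 4
 * j-clusters, so cj_ind = 10 lives in cj4-unit 10>>2 = 2, slot 10&3 = 2.
 */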
2196
2197 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2198 static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
2199 {
2200     return nbl->cj4[cj_ind>>2].imei[0].imask;
2201 }
2202
2203 /* Ensures there is enough space for 'extra' additional exclusion masks */
2204 static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
2205 {
2206     if (nbl->nexcl+extra > nbl->excl_nalloc)
2207     {
2208         nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
2209         nbnxn_realloc_void((void **)&nbl->excl,
2210                            nbl->nexcl*sizeof(*nbl->excl),
2211                            nbl->excl_nalloc*sizeof(*nbl->excl),
2212                            nbl->alloc,nbl->free);
2213     }
2214 }
2215
2216 /* Ensures there is enough space for ncell extra j-cells in the list */
2217 static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
2218                                             int ncell)
2219 {
2220     int cj_max;
2221
2222     cj_max = nbl->ncj + ncell;
2223
2224     if (cj_max > nbl->cj_nalloc)
2225     {
2226         nbl->cj_nalloc = over_alloc_small(cj_max);
2227         nbnxn_realloc_void((void **)&nbl->cj,
2228                            nbl->ncj*sizeof(*nbl->cj),
2229                            nbl->cj_nalloc*sizeof(*nbl->cj),
2230                            nbl->alloc,nbl->free);
2231     }
2232 }
2233
2234 /* Ensures there is enough space for the j-subcells of nsupercell extra super-cells in the list */
2235 static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
2236                                               int nsupercell)
2237 {
2238     int ncj4_max,j4,j,w,t;
2239
2240 #define NWARP       2
2241 #define WARP_SIZE  32
2242
2243     /* We can have at most nsupercell*GPU_NSUBCELL sj lists */
2244     /* We can store 4 j-subcell - i-supercell pairs in one struct.
2245      * Since we round down, we need one extra entry.
2246      */
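    /* (x + 4-1) >> 2 below is round-up integer division by 4: e.g. 5 pending
     * j-subcell pairs need (5+3)>>2 = 2 cj4 structs.
     */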
2247     ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
2248
2249     if (ncj4_max > nbl->cj4_nalloc)
2250     {
2251         nbl->cj4_nalloc = over_alloc_small(ncj4_max);
2252         nbnxn_realloc_void((void **)&nbl->cj4,
2253                            nbl->work->cj4_init*sizeof(*nbl->cj4),
2254                            nbl->cj4_nalloc*sizeof(*nbl->cj4),
2255                            nbl->alloc,nbl->free);
2256     }
2257
2258     if (ncj4_max > nbl->work->cj4_init)
2259     {
2260         for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
2261         {
2262             /* No i-subcells and no excl's in the list initially */
2263             for(w=0; w<NWARP; w++)
2264             {
2265                 nbl->cj4[j4].imei[w].imask    = 0U;
2266                 nbl->cj4[j4].imei[w].excl_ind = 0;
2267
2268             }
2269         }
2270         nbl->work->cj4_init = ncj4_max;
2271     }
2272 }
2273
2274 /* Sets all exclusion masks for one GPU warp to no exclusions */
2275 static void set_no_excls(nbnxn_excl_t *excl)
2276 {
2277     int t;
2278
2279     for(t=0; t<WARP_SIZE; t++)
2280     {
2281         /* Turn all interaction bits on */
2282         excl->pair[t] = NBNXN_INT_MASK_ALL;
2283     }
2284 }
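
/* Note: a set bit in an exclusion mask means the pair is computed; exclusions
 * are applied by clearing bits, so all-ones means no exclusions at all.
 */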
2285
2286 /* Initializes a single nbnxn_pairlist_t data structure */
2287 static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
2288                                 gmx_bool bSimple,
2289                                 nbnxn_alloc_t *alloc,
2290                                 nbnxn_free_t  *free)
2291 {
2292     if (alloc == NULL)
2293     {
2294         nbl->alloc = nbnxn_alloc_aligned;
2295     }
2296     else
2297     {
2298         nbl->alloc = alloc;
2299     }
2300     if (free == NULL)
2301     {
2302         nbl->free = nbnxn_free_aligned;
2303     }
2304     else
2305     {
2306         nbl->free = free;
2307     }
2308
2309     nbl->bSimple     = bSimple;
2310     nbl->na_sc       = 0;
2311     nbl->na_ci       = 0;
2312     nbl->na_cj       = 0;
2313     nbl->nci         = 0;
2314     nbl->ci          = NULL;
2315     nbl->ci_nalloc   = 0;
2316     nbl->ncj         = 0;
2317     nbl->cj          = NULL;
2318     nbl->cj_nalloc   = 0;
2319     nbl->ncj4        = 0;
2320     /* We need one element extra in sj, so alloc initially with 1 */
2321     nbl->cj4_nalloc  = 0;
2322     nbl->cj4         = NULL;
2323     nbl->nci_tot     = 0;
2324
2325     if (!nbl->bSimple)
2326     {
2327         nbl->excl        = NULL;
2328         nbl->excl_nalloc = 0;
2329         nbl->nexcl       = 0;
2330         check_excl_space(nbl,1);
2331         nbl->nexcl       = 1;
2332         set_no_excls(&nbl->excl[0]);
2333     }
2334
2335     snew(nbl->work,1);
2336 #ifdef NBNXN_BBXXXX
2337     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
2338 #else
2339     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
2340 #endif
2341     snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
2342 #ifdef NBNXN_SEARCH_SSE
2343     snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
2344 #ifdef GMX_X86_AVX_256
2345     snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
2346 #endif
2347 #endif
2348     snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
2349 }
2350
2351 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
2352                              gmx_bool bSimple, gmx_bool bCombined,
2353                              nbnxn_alloc_t *alloc,
2354                              nbnxn_free_t  *free)
2355 {
2356     int i;
2357
2358     nbl_list->bSimple   = bSimple;
2359     nbl_list->bCombined = bCombined;
2360
2361     nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
2362
2363     snew(nbl_list->nbl,nbl_list->nnbl);
2364     /* Execute in order to avoid memory interleaving between threads */
2365 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2366     for(i=0; i<nbl_list->nnbl; i++)
2367     {
2368         /* Allocate the nblist data structure locally on each thread
2369          * to optimize memory access for NUMA architectures.
2370          */
2371         snew(nbl_list->nbl[i],1);
2372
2373         /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2374         if (i == 0)
2375         {
2376             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
2377         }
2378         else
2379         {
2380             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
2381         }
2382     }
2383 }
2384
2385 /* Print statistics of a pair list, used for debug output */
2386 static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
2387                                            const nbnxn_search_t nbs,real rl)
2388 {
2389     const nbnxn_grid_t *grid;
2390     int cs[SHIFTS];
2391     int s,i,j;
2392     int npexcl;
2393
2394     /* This code only produces correct statistics with domain decomposition */
2395     grid = &nbs->grid[0];
2396
2397     fprintf(fp,"nbl nci %d ncj %d\n",
2398             nbl->nci,nbl->ncj);
2399     fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2400             nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
2401             nbl->ncj/(double)grid->nc*grid->na_sc,
2402             nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
2403
2404     fprintf(fp,"nbl average j cell list length %.1f\n",
2405             0.25*nbl->ncj/(double)nbl->nci);
2406
2407     for(s=0; s<SHIFTS; s++)
2408     {
2409         cs[s] = 0;
2410     }
2411     npexcl = 0;
2412     for(i=0; i<nbl->nci; i++)
2413     {
2414         cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
2415             nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
2416
2417         j = nbl->ci[i].cj_ind_start;
2418         while (j < nbl->ci[i].cj_ind_end &&
2419                nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
2420         {
2421             npexcl++;
2422             j++;
2423         }
2424     }
2425     fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
2426             nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
2427     for(s=0; s<SHIFTS; s++)
2428     {
2429         if (cs[s] > 0)
2430         {
2431             fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
2432         }
2433     }
2434 }
2435
2436 /* Print statistics of a pair list, used for debug output */
2437 static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
2438                                              const nbnxn_search_t nbs,real rl)
2439 {
2440     const nbnxn_grid_t *grid;
2441     int i,j4,j,si,b;
2442     int c[GPU_NSUBCELL+1];
2443
2444     /* This code only produces correct statistics with domain decomposition */
2445     grid = &nbs->grid[0];
2446
2447     fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2448             nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
2449     fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2450             nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
2451             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
2452             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
2453
2454     fprintf(fp,"nbl average j super cell list length %.1f\n",
2455             0.25*nbl->ncj4/(double)nbl->nsci);
2456     fprintf(fp,"nbl average i sub cell list length %.1f\n",
2457             nbl->nci_tot/(0.25*nbl->ncj4));
2458
2459     for(si=0; si<=GPU_NSUBCELL; si++)
2460     {
2461         c[si] = 0;
2462     }
2463     for(i=0; i<nbl->nsci; i++)
2464     {
2465         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2466         {
2467             for(j=0; j<4; j++)
2468             {
2469                 b = 0;
2470                 for(si=0; si<GPU_NSUBCELL; si++)
2471                 {
2472                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
2473                     {
2474                         b++;
2475                     }
2476                 }
2477                 c[b]++;
2478             }
2479         }
2480     }
2481     for(b=0; b<=GPU_NSUBCELL; b++)
2482     {
2483         fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
2484                 b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
2485     }
2486 }
2487
2488 /* Print the number of sub-cell pairs per super-cell i-entry, used for debug output */
2489 static void print_supersub_nsp(const char *fn,
2490                                const nbnxn_pairlist_t *nbl,
2491                                int iloc)
2492 {
2493     char buf[STRLEN];
2494     FILE *fp;
2495     int i,nsp,j4,p;
2496
2497     sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
2498     fp = ffopen(buf,"w");
2499
2500     for(i=0; i<nbl->nsci; i++)
2501     {
2502         nsp = 0;
2503         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2504         {
2505             for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
2506             {
2507                 nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
2508             }
2509         }
2510         fprintf(fp,"%4d %3d %3d\n",
2511                 i,
2512                 nsp,
2513                 nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
2514     }
2515
2516     fclose(fp);
2517 }
2518
2519 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
2520 static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
2521                                    int warp,nbnxn_excl_t **excl)
2522 {
2523     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2524     {
2525         /* No exclusions set, make a new list entry */
2526         nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
2527         nbl->nexcl++;
2528         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2529         set_no_excls(*excl);
2530     }
2531     else
2532     {
2533         /* We already have some exclusions, new ones can be added to the list */
2534         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2535     }
2536 }
2537
2538 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
2539  * allocates extra memory, if necessary.
2540  */
2541 static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
2542                                  int warp,nbnxn_excl_t **excl)
2543 {
2544     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2545     {
2546         /* We need to make a new list entry, check if we have space */
2547         check_excl_space(nbl,1);
2548     }
2549     low_get_nbl_exclusions(nbl,cj4,warp,excl);
2550 }
2551
2552 /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
2553  * allocates extra memory, if necessary.
2554  */
2555 static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
2556                                  nbnxn_excl_t **excl_w0,
2557                                  nbnxn_excl_t **excl_w1)
2558 {
2559     /* Check for space we might need */
2560     check_excl_space(nbl,2);
2561
2562     low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
2563     low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
2564 }
2565
2566 /* Sets the self exclusions i=j and pair exclusions i>j */
2567 static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
2568                                                int cj4_ind,int sj_offset,
2569                                                int si)
2570 {
2571     nbnxn_excl_t *excl[2];
2572     int  ei,ej,w;
2573
2574     /* Here we only set the self and double pair exclusions */
2575
2576     get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
2577
2578     /* Clear the pair bits for ei >= ej, leaving only minor < major bits set */
2579     for(ej=0; ej<nbl->na_ci; ej++)
2580     {
2581         w = (ej>>2);
2582         for(ei=ej; ei<nbl->na_ci; ei++)
2583         {
2584             excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
2585                 ~(1U << (sj_offset*GPU_NSUBCELL+si));
2586         }
2587     }
2588 }
2589
2590 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
2591 static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
2592 {
2593     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2594 }
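
/* For a diagonal block (rdiag and ci == cj) the DIAG mask clears the self
 * pairs and one triangle of the cluster pair matrix, so each atom pair is
 * computed only once; all other blocks get the full mask.
 */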
2595
2596 #ifdef NBNXN_SEARCH_SSE
2597 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
2598 static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
2599 {
2600 #ifndef GMX_DOUBLE /* cj-size = 4 */
2601     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2602 #else              /* cj-size = 2 */
2603     return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
2604             (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
2605              NBNXN_INT_MASK_ALL));
2606 #endif
2607 }
2608
2609 #ifdef GMX_X86_AVX_256
2610 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
2611 static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
2612 {
2613 #ifndef GMX_DOUBLE /* cj-size = 8 */
2614     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
2615             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
2616              NBNXN_INT_MASK_ALL));
2617 #else              /* cj-size = 2 */
2618     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2619 #endif
2620 }
2621 #endif
2622 #endif /* NBNXN_SEARCH_SSE */
2623
2624 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2625  * Checks bounding box distances and possibly atom pair distances.
2626  */
2627 static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2628                                      nbnxn_pairlist_t *nbl,
2629                                      int ci,int cjf,int cjl,
2630                                      gmx_bool remove_sub_diag,
2631                                      const real *x_j,
2632                                      real rl2,float rbb2,
2633                                      int *ndistc)
2634 {
2635     const nbnxn_list_work_t *work;
2636
2637     const float *bb_ci;
2638     const real  *x_ci;
2639
2640     gmx_bool   InRange;
2641     real       d2;
2642     int        cjf_gl,cjl_gl,cj;
2643
2644     work = nbl->work;
2645
2646     bb_ci = nbl->work->bb_ci;
2647     x_ci  = nbl->work->x_ci;
2648
2649     InRange = FALSE;
2650     while (!InRange && cjf <= cjl)
2651     {
2652         d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
2653         *ndistc += 2;
2654
2655         /* Check if the distance is within the distance where
2656          * we use only the bounding box distance rbb,
2657          * or within the cut-off and there is at least one atom pair
2658          * within the cut-off.
2659          */
2660         if (d2 < rbb2)
2661         {
2662             InRange = TRUE;
2663         }
2664         else if (d2 < rl2)
2665         {
2666             int i,j;
2667
2668             cjf_gl = gridj->cell0 + cjf;
2669             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2670             {
2671                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2672                 {
2673                     InRange = InRange ||
2674                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2675                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2676                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2677                 }
2678             }
2679             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2680         }
2681         if (!InRange)
2682         {
2683             cjf++;
2684         }
2685     }
2686     if (!InRange)
2687     {
2688         return;
2689     }
2690
2691     InRange = FALSE;
2692     while (!InRange && cjl > cjf)
2693     {
2694         d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
2695         *ndistc += 2;
2696
2697         /* Check if the distance is within the distance where
2698          * we use only the bounding box distance rbb,
2699          * or within the cut-off and there is at least one atom pair
2700          * within the cut-off.
2701          */
2702         if (d2 < rbb2)
2703         {
2704             InRange = TRUE;
2705         }
2706         else if (d2 < rl2)
2707         {
2708             int i,j;
2709
2710             cjl_gl = gridj->cell0 + cjl;
2711             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2712             {
2713                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2714                 {
2715                     InRange = InRange ||
2716                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2717                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2718                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2719                 }
2720             }
2721             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2722         }
2723         if (!InRange)
2724         {
2725             cjl--;
2726         }
2727     }
2728
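    /* Both ends of the candidate range are now within range of cell ci;
     * all j-clusters in between are stored without further distance checks.
     */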
2729     if (cjf <= cjl)
2730     {
2731         for(cj=cjf; cj<=cjl; cj++)
2732         {
2733             /* Store cj and the interaction mask */
2734             nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
2735             nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
2736             nbl->ncj++;
2737         }
2738         /* Increase the closing index in i super-cell list */
2739         nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
2740     }
2741 }
2742
2743 #ifdef NBNXN_SEARCH_SSE
2744 /* Include make_cluster_list_x86_simd128/256 */
2745 #define GMX_MM128_HERE
2746 #include "gmx_x86_simd_macros.h"
2747 #define STRIDE_S  PACK_X4
2748 #include "nbnxn_search_x86_simd.h"
2749 #undef STRIDE_S
2750 #undef GMX_MM128_HERE
2751 #ifdef GMX_X86_AVX_256
2752 /* Include make_cluster_list_x86_simd128/256 */
2753 #define GMX_MM256_HERE
2754 #include "gmx_x86_simd_macros.h"
2755 #define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
2756 #include "nbnxn_search_x86_simd.h"
2757 #undef STRIDE_S
2758 #undef GMX_MM256_HERE
2759 #endif
2760 #endif
2761
2762 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
2763  * Checks bounding box distances and possibly atom pair distances.
2764  */
2765 static void make_cluster_list_supersub(const nbnxn_search_t nbs,
2766                                        const nbnxn_grid_t *gridi,
2767                                        const nbnxn_grid_t *gridj,
2768                                        nbnxn_pairlist_t *nbl,
2769                                        int sci,int scj,
2770                                        gmx_bool sci_equals_scj,
2771                                        int stride,const real *x,
2772                                        real rl2,float rbb2,
2773                                        int *ndistc)
2774 {
2775     int  na_c;
2776     int  npair;
2777     int  cjo,ci1,ci,cj,cj_gl;
2778     int  cj4_ind,cj_offset;
2779     unsigned imask;
2780     nbnxn_cj4_t *cj4;
2781     const float *bb_ci;
2782     const real *x_ci;
2783     float *d2l,d2;
2784     int  w;
2785 #define PRUNE_LIST_CPU_ONE
2786 #ifdef PRUNE_LIST_CPU_ONE
2787     int  ci_last=-1;
2788 #endif
2789
2790     d2l = nbl->work->d2;
2791
2792     bb_ci = nbl->work->bb_ci;
2793     x_ci  = nbl->work->x_ci;
2794
2795     na_c = gridj->na_c;
2796
2797     for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
2798     {
2799         cj4_ind   = (nbl->work->cj_ind >> 2);
2800         cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
2801         cj4       = &nbl->cj4[cj4_ind];
2802
2803         cj = scj*GPU_NSUBCELL + cjo;
2804
2805         cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
2806
2807         /* Initialize this j-subcell i-subcell list */
2808         cj4->cj[cj_offset] = cj_gl;
2809         imask              = 0;
2810
2811         if (sci_equals_scj)
2812         {
2813             ci1 = cjo + 1;
2814         }
2815         else
2816         {
2817             ci1 = gridi->nsubc[sci];
2818         }
2819
2820 #ifdef NBNXN_BBXXXX
2821         /* Determine all ci1 bb distances in one call with SSE */
2822         subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
2823                                ci1,bb_ci,d2l);
2824         *ndistc += na_c*2;
2825 #endif
2826
2827         npair = 0;
2828         /* We use a fixed upper bound instead of ci1 to help optimization */
2829         for(ci=0; ci<GPU_NSUBCELL; ci++)
2830         {
2831             if (ci == ci1)
2832             {
2833                 break;
2834             }
2835
2836 #ifndef NBNXN_BBXXXX
2837             /* Determine the bb distance between ci and cj */
2838             d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
2839             *ndistc += 2;
2840 #endif
2841             d2 = d2l[ci];
2842
2843 #ifdef PRUNE_LIST_CPU_ALL
2844             /* Check if the distance is within the distance where
2845              * we use only the bounding box distance rbb,
2846              * or within the cut-off and there is at least one atom pair
2847              * within the cut-off. This check is very costly.
2848              */
2849             *ndistc += na_c*na_c;
2850             if (d2 < rbb2 ||
2851                 (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
2852 #else
2853             /* Check if the distance between the two bounding boxes
2854              * is within the pair-list cut-off.
2855              */
2856             if (d2 < rl2)
2857 #endif
2858             {
2859                 /* Flag this i-subcell to be taken into account */
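                /* Each j-subcell slot owns GPU_NSUBCELL consecutive bits of
                 * the warp interaction mask, one bit per i-subcell.
                 */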
2860                 imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
2861
2862 #ifdef PRUNE_LIST_CPU_ONE
2863                 ci_last = ci;
2864 #endif
2865
2866                 npair++;
2867             }
2868         }
2869
2870 #ifdef PRUNE_LIST_CPU_ONE
2871         /* If we only found 1 pair, check if any atom pair is actually
2872          * within the cut-off; if not, we can remove this entry.
2873          */
2874         if (npair == 1 && d2l[ci_last] >= rbb2)
2875         {
2876             /* Avoid using function pointers here, as it's slower */
2877             if (
2878 #ifdef NBNXN_8BB_SSE
2879                 !subc_in_range_sse8
2880 #else
2881                 !subc_in_range_x
2882 #endif
2883                                 (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
2884             {
2885                 imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
2886                 npair--;
2887             }
2888         }
2889 #endif
2890
2891         if (npair > 0)
2892         {
2893             /* We have a useful sj entry, close it now */
2894
2895             /* Set the exclusions for the ci == sj entry.
2896              * Here we don't bother to check if this entry is actually flagged,
2897              * as it will nearly always be in the list.
2898              */
2899             if (sci_equals_scj)
2900             {
2901                 set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
2902             }
2903
2904             /* Copy the cluster interaction mask to the list */
2905             for(w=0; w<NWARP; w++)
2906             {
2907                 cj4->imei[w].imask |= imask;
2908             }
2909
2910             nbl->work->cj_ind++;
2911
2912             /* Keep the count */
2913             nbl->nci_tot += npair;
2914
2915             /* Increase the closing index in i super-cell list */
2916             nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
2917         }
2918     }
2919 }
2920
2921 /* Set all atom-pair exclusions from the topology stored in excl
2922  * as masks in the pair-list for simple list i-entry nbl_ci
2923  */
2924 static void set_ci_top_excls(const nbnxn_search_t nbs,
2925                              nbnxn_pairlist_t *nbl,
2926                              gmx_bool diagRemoved,
2927                              int na_ci_2log,
2928                              int na_cj_2log,
2929                              const nbnxn_ci_t *nbl_ci,
2930                              const t_blocka *excl)
2931 {
2932     const int *cell;
2933     int ci;
2934     int cj_ind_first,cj_ind_last;
2935     int cj_first,cj_last;
2936     int ndirect;
2937     int i,ai,aj,si,eind,ge,se;
2938     int found,cj_ind_0,cj_ind_1,cj_ind_m;
2939     int cj_m;
2940     gmx_bool Found_si;
2941     int si_ind;
2942     nbnxn_excl_t *nbl_excl;
2943     int inner_i,inner_e;
2944
2945     cell = nbs->cell;
2946
2947     if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
2948     {
2949         /* Empty list */
2950         return;
2951     }
2952
2953     ci = nbl_ci->ci;
2954
2955     cj_ind_first = nbl_ci->cj_ind_start;
2956     cj_ind_last  = nbl->ncj - 1;
2957
2958     cj_first = nbl->cj[cj_ind_first].cj;
2959     cj_last  = nbl->cj[cj_ind_last].cj;
2960
2961     /* Determine how many contiguous j-cells we have starting
2962      * from the first i-cell. This number can be used to directly
2963      * calculate j-cell indices for excluded atoms.
2964      */
2965     ndirect = 0;
2966     if (na_ci_2log == na_cj_2log)
2967     {
2968         while (cj_ind_first + ndirect <= cj_ind_last &&
2969                nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
2970         {
2971             ndirect++;
2972         }
2973     }
2974 #ifdef NBNXN_SEARCH_SSE
2975     else
2976     {
2977         while (cj_ind_first + ndirect <= cj_ind_last &&
2978                nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
2979         {
2980             ndirect++;
2981         }
2982     }
2983 #endif
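
    /* Example (illustrative): when the list starts with the contiguous run
     * ci, ci+1, ..., an excluded atom whose j-cell se falls inside this run
     * is found directly at cj_ind_first + se - cj_first, which avoids the
     * bisection search below.
     */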
2984
2985     /* Loop over the atoms in the i super-cell */
2986     for(i=0; i<nbl->na_sc; i++)
2987     {
2988         ai = nbs->a[ci*nbl->na_sc+i];
2989         if (ai >= 0)
2990         {
2991             si  = (i>>na_ci_2log);
2992
2993             /* Loop over the topology-based exclusions for this i-atom */
2994             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
2995             {
2996                 aj = excl->a[eind];
2997
2998                 if (aj == ai)
2999                 {
3000                     /* The self exclusions are already set, save some time */
3001                     continue;
3002                 }
3003
3004                 ge = cell[aj];
3005
3006                 /* Without shifts we only calculate interactions j>i
3007                  * for one-way pair-lists.
3008                  */
3009                 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3010                 {
3011                     continue;
3012                 }
3013
3014                 se = (ge >> na_cj_2log);
3015
3016                 /* Could the cluster se be in our list? */
3017                 if (se >= cj_first && se <= cj_last)
3018                 {
3019                     if (se < cj_first + ndirect)
3020                     {
3021                         /* We can calculate cj_ind directly from se */
3022                         found = cj_ind_first + se - cj_first;
3023                     }
3024                     else
3025                     {
3026                         /* Search for se using bisection */
3027                         found = -1;
3028                         cj_ind_0 = cj_ind_first + ndirect;
3029                         cj_ind_1 = cj_ind_last + 1;
3030                         while (found == -1 && cj_ind_0 < cj_ind_1)
3031                         {
3032                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3033
3034                             cj_m = nbl->cj[cj_ind_m].cj;
3035
3036                             if (se == cj_m)
3037                             {
3038                                 found = cj_ind_m;
3039                             }
3040                             else if (se < cj_m)
3041                             {
3042                                 cj_ind_1 = cj_ind_m;
3043                             }
3044                             else
3045                             {
3046                                 cj_ind_0 = cj_ind_m + 1;
3047                             }
3048                         }
3049                     }
3050
3051                     if (found >= 0)
3052                     {
3053                         inner_i = i  - (si << na_ci_2log);
3054                         inner_e = ge - (se << na_cj_2log);
3055
3056                         nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
3057                     }
3058                 }
3059             }
3060         }
3061     }
3062 }
3063
3064 /* Set all atom-pair exclusions from the topology stored in excl
3065  * as masks in the pair-list for i-super-cell entry nbl_sci
3066  */
3067 static void set_sci_top_excls(const nbnxn_search_t nbs,
3068                               nbnxn_pairlist_t *nbl,
3069                               gmx_bool diagRemoved,
3070                               int na_c_2log,
3071                               const nbnxn_sci_t *nbl_sci,
3072                               const t_blocka *excl)
3073 {
3074     const int *cell;
3075     int na_c;
3076     int sci;
3077     int cj_ind_first,cj_ind_last;
3078     int cj_first,cj_last;
3079     int ndirect;
3080     int i,ai,aj,si,eind,ge,se;
3081     int found,cj_ind_0,cj_ind_1,cj_ind_m;
3082     int cj_m;
3083     gmx_bool Found_si;
3084     int si_ind;
3085     nbnxn_excl_t *nbl_excl;
3086     int inner_i,inner_e,w;
3087
3088     cell = nbs->cell;
3089
3090     na_c = nbl->na_ci;
3091
3092     if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3093     {
3094         /* Empty list */
3095         return;
3096     }
3097
3098     sci = nbl_sci->sci;
3099
3100     cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3101     cj_ind_last  = nbl->work->cj_ind - 1;
3102
3103     cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3104     cj_last  = nbl_cj(nbl,cj_ind_last);
3105
3106     /* Determine how many contiguous j-clusters we have starting
3107      * from the first i-cluster. This number can be used to directly
3108      * calculate j-cluster indices for excluded atoms.
3109      */
3110     ndirect = 0;
3111     while (cj_ind_first + ndirect <= cj_ind_last &&
3112            nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3113     {
3114         ndirect++;
3115     }
3116
3117     /* Loop over the atoms in the i super-cell */
3118     for(i=0; i<nbl->na_sc; i++)
3119     {
3120         ai = nbs->a[sci*nbl->na_sc+i];
3121         if (ai >= 0)
3122         {
3123             si  = (i>>na_c_2log);
3124
3125             /* Loop over the topology-based exclusions for this i-atom */
3126             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3127             {
3128                 aj = excl->a[eind];
3129
3130                 if (aj == ai)
3131                 {
3132                     /* The self exclusions are already set, save some time */
3133                     continue;
3134                 }
3135
3136                 ge = cell[aj];
3137
3138                 /* Without shifts we only calculate interactions j>i
3139                  * for one-way pair-lists.
3140                  */
3141                 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3142                 {
3143                     continue;
3144                 }
3145
3146                 se = ge>>na_c_2log;
3147                 /* Could the cluster se be in our list? */
3148                 if (se >= cj_first && se <= cj_last)
3149                 {
3150                     if (se < cj_first + ndirect)
3151                     {
3152                         /* We can calculate cj_ind directly from se */
3153                         found = cj_ind_first + se - cj_first;
3154                     }
3155                     else
3156                     {
3157                         /* Search for se using bisection */
3158                         found = -1;
3159                         cj_ind_0 = cj_ind_first + ndirect;
3160                         cj_ind_1 = cj_ind_last + 1;
3161                         while (found == -1 && cj_ind_0 < cj_ind_1)
3162                         {
3163                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3164
3165                             cj_m = nbl_cj(nbl,cj_ind_m);
3166
3167                             if (se == cj_m)
3168                             {
3169                                 found = cj_ind_m;
3170                             }
3171                             else if (se < cj_m)
3172                             {
3173                                 cj_ind_1 = cj_ind_m;
3174                             }
3175                             else
3176                             {
3177                                 cj_ind_0 = cj_ind_m + 1;
3178                             }
3179                         }
3180                     }
3181
3182                     if (found >= 0)
3183                     {
3184                         inner_i = i  - si*na_c;
3185                         inner_e = ge - se*na_c;
3186
3187 /* Macro for getting the index of atom a within a cluster */
3188 #define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3189 /* Macro for converting an atom number to a cluster number */
3190 #define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3191
3192                         if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
3193                         {
3194                             w       = (inner_e >> 2);
3195
3196                             get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
3197
3198                             nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
3199                                 ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
3200                         }
3201
3202 #undef AMODI
3203 #undef A2CI
3204                     }
3205                 }
3206             }
3207         }
3208     }
3209 }
3210
3211 /* Reallocate the simple ci list for at least n entries */
3212 static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
3213 {
3214     nbl->ci_nalloc = over_alloc_small(n);
3215     nbnxn_realloc_void((void **)&nbl->ci,
3216                        nbl->nci*sizeof(*nbl->ci),
3217                        nbl->ci_nalloc*sizeof(*nbl->ci),
3218                        nbl->alloc,nbl->free);
3219 }
3220
3221 /* Reallocate the super-cell sci list for at least n entries */
3222 static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
3223 {
3224     nbl->sci_nalloc = over_alloc_small(n);
3225     nbnxn_realloc_void((void **)&nbl->sci,
3226                        nbl->nsci*sizeof(*nbl->sci),
3227                        nbl->sci_nalloc*sizeof(*nbl->sci),
3228                        nbl->alloc,nbl->free);
3229 }
3230
3231 /* Make a new ci entry at index nbl->nci */
3232 static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
3233                          nbnxn_list_work_t *work)
3234 {
3235     if (nbl->nci + 1 > nbl->ci_nalloc)
3236     {
3237         nb_realloc_ci(nbl,nbl->nci+1);
3238     }
3239     nbl->ci[nbl->nci].ci            = ci;
3240     nbl->ci[nbl->nci].shift         = shift;
3241     /* Store the interaction flags along with the shift */
3242     nbl->ci[nbl->nci].shift        |= flags;
3243     nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
3244     nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
3245 }
3246
3247 /* Make a new sci entry at index nbl->nsci */
3248 static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
3249                           nbnxn_list_work_t *work)
3250 {
3251     if (nbl->nsci + 1 > nbl->sci_nalloc)
3252     {
3253         nb_realloc_sci(nbl,nbl->nsci+1);
3254     }
3255     nbl->sci[nbl->nsci].sci           = sci;
3256     nbl->sci[nbl->nsci].shift         = shift;
3257     nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3258     nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
3259 }
3260
3261 /* Sort the simple j-list cj on exclusions.
3262  * Entries with exclusions will all be sorted to the beginning of the list.
3263  */
3264 static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
3265                          nbnxn_list_work_t *work)
3266 {
3267     int jnew,j;
3268
3269     if (ncj > work->cj_nalloc)
3270     {
3271         work->cj_nalloc = over_alloc_large(ncj);
3272         srenew(work->cj,work->cj_nalloc);
3273     }
3274
3275     /* Make a list of the j-cells involving exclusions */
3276     jnew = 0;
3277     for(j=0; j<ncj; j++)
3278     {
3279         if (cj[j].excl != NBNXN_INT_MASK_ALL)
3280         {
3281             work->cj[jnew++] = cj[j];
3282         }
3283     }
3284     /* If there are no exclusions at all, or only the first entry has them, the list is already sorted */
3285     if (!((jnew == 0) ||
3286           (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
3287     {
3288         for(j=0; j<ncj; j++)
3289         {
3290             if (cj[j].excl == NBNXN_INT_MASK_ALL)
3291             {
3292                 work->cj[jnew++] = cj[j];
3293             }
3294         }
3295         for(j=0; j<ncj; j++)
3296         {
3297             cj[j] = work->cj[j];
3298         }
3299     }
3300 }
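
/* Illustration of sort_cj_excl (assumed masks, not from the original code):
 * given cj entries with exclusion masks {ALL, E1, ALL, E2}, where
 * ALL = NBNXN_INT_MASK_ALL, the two passes above reorder them to
 * {E1, E2, ALL, ALL}: entries carrying exclusions are gathered at the
 * front, while the relative order within each group is preserved.
 */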
3301
3302 /* Close this simple list i entry */
3303 static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3304 {
3305     int jlen;
3306
3307     /* All content of the new ci entry has already been filled correctly;
3308      * we only need to increase the count here (for non-empty lists).
3309      */
3310     jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3311     if (jlen > 0)
3312     {
3313         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
3314
3315         if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
3316         {
3317             nbl->work->ncj_hlj += jlen;
3318         }
3319         else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3320         {
3321             nbl->work->ncj_noq += jlen;
3322         }
3323
3324         nbl->nci++;
3325     }
3326 }
3327
3328 /* Split sci entry for load balancing on the GPU.
3329  * As we only know the current count on our own thread,
3330  * we need to estimate the current total number of i-entries.
3331  * As the lists get concatenated later, this estimate depends
3332  * both on nthread and on our own thread index thread.
3333  */
3334 static void split_sci_entry(nbnxn_pairlist_t *nbl,
3335                             int nsp_max_av,gmx_bool progBal,int nc_bal,
3336                             int thread,int nthread)
3337 {
3338     int nsci_est;
3339     int nsp_max;
3340     int cj4_start,cj4_end,j4len,cj4;
3341     int sci;
3342     int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
3343     int p;
3344
3345     /* Estimate the total number of ci's of the nblist combined
3346      * over all threads, using the target number of ci's.
3347      */
3348     nsci_est = nc_bal*thread/nthread + nbl->nsci;
3349     if (progBal)
3350     {
3351         /* The first ci blocks should be larger, to avoid overhead.
3352          * The last ci blocks should be smaller, to improve load balancing.
3353          */
3354         nsp_max = max(1,
3355                       nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3356     }
3357     else
3358     {
3359         nsp_max = nsp_max_av;
3360     }
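
    /* Worked example of the scaling above (assumed numbers): with
     * nsp_max_av = 100, nsci_est = 1 gives nsp_max = 3/2*100 = 150,
     * while nsci_est = nc_bal + 1 gives 3/4*100 = 75, so the target
     * block size decays smoothly from 1.5x to 0.75x the average as
     * the combined list fills up.
     */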
3361
3362     cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3363     cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
3364     j4len = cj4_end - cj4_start;
3365
3366     if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3367     {
3368         /* Remove the last ci entry and process the cj4's again */
3369         nbl->nsci -= 1;
3370
3371         sci        = nbl->nsci;
3372         cj4        = cj4_start;
3373         nsp        = 0;
3374         nsp_sci    = 0;
3375         nsp_cj4_e  = 0;
3376         nsp_cj4    = 0;
3377         while (cj4 < cj4_end)
3378         {
3379             nsp_cj4_p = nsp_cj4;
3380             nsp_cj4   = 0;
3381             for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3382             {
3383                 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3384             }
3385             nsp += nsp_cj4;
3386
3387             if (nsp > nsp_max && nsp > nsp_cj4)
3388             {
3389                 nbl->sci[sci].cj4_ind_end = cj4;
3390                 sci++;
3391                 nbl->nsci++;
3392                 if (nbl->nsci+1 > nbl->sci_nalloc)
3393                 {
3394                     nb_realloc_sci(nbl,nbl->nsci+1);
3395                 }
3396                 nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
3397                 nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
3398                 nbl->sci[sci].cj4_ind_start = cj4;
3399                 nsp_sci   = nsp - nsp_cj4;
3400                 nsp_cj4_e = nsp_cj4_p;
3401                 nsp       = nsp_cj4;
3402             }
3403
3404             cj4++;
3405         }
3406
3407         /* Put the remaining cj4's in a new ci entry */
3408         nbl->sci[sci].cj4_ind_end = cj4_end;
3409
3410         /* Possibly balance out the last two ci's
3411          * by moving the last cj4 of the second last ci.
3412          */
3413         if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3414         {
3415             nbl->sci[sci-1].cj4_ind_end--;
3416             nbl->sci[sci].cj4_ind_start--;
3417         }
3418
3419         sci++;
3420         nbl->nsci++;
3421     }
3422 }
3423
3424 /* Close this super/sub list i entry */
3425 static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
3426                                     int nsp_max_av,
3427                                     gmx_bool progBal,int nc_bal,
3428                                     int thread,int nthread)
3429 {
3430     int j4len,tlen;
3431     int nb,b;
3432
3433     /* All content of the new ci entry has already been filled correctly;
3434      * we only need to increase the count here (for non-empty lists).
3435      */
3436     j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
3437     if (j4len > 0)
3438     {
3439         /* We can only have complete blocks of 4 j-entries in a list,
3440          * so round the count up before closing.
3441          */
3442         nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
3443         nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
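        /* E.g. (assumed count): with cj_ind = 10 pending j-entries the
         * shift above rounds up to ncj4 = 3 groups of
         * NBNXN_GPU_JGROUP_SIZE = 4 and cj_ind becomes 12; the two
         * padding slots remain empty.
         */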
3444
3445         nbl->nsci++;
3446
3447         if (nsp_max_av > 0)
3448         {
3449             split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
3450         }
3451     }
3452 }
3453
3454 /* Syncs the working array before adding another grid pair to the list */
3455 static void sync_work(nbnxn_pairlist_t *nbl)
3456 {
3457     if (!nbl->bSimple)
3458     {
3459         nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3460         nbl->work->cj4_init = nbl->ncj4;
3461     }
3462 }
3463
3464 /* Clears an nbnxn_pairlist_t data structure */
3465 static void clear_pairlist(nbnxn_pairlist_t *nbl)
3466 {
3467     nbl->nci           = 0;
3468     nbl->nsci          = 0;
3469     nbl->ncj           = 0;
3470     nbl->ncj4          = 0;
3471     nbl->nci_tot       = 0;
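    /* nexcl starts at 1, not 0: entry 0 of the exclusion array serves
     * as the shared default mask with all pairs interacting.
     */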
3472     nbl->nexcl         = 1;
3473
3474     nbl->work->ncj_noq = 0;
3475     nbl->work->ncj_hlj = 0;
3476 }
3477
3478 /* Sets a simple list i-cell bounding box, including PBC shift */
3479 static void set_icell_bb_simple(const float *bb,int ci,
3480                                 real shx,real shy,real shz,
3481                                 float *bb_ci)
3482 {
3483     int ia;
3484
3485     ia = ci*NNBSBB_B;
3486     bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
3487     bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
3488     bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
3489     bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
3490     bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
3491     bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
3492 }
3493
3494 /* Sets the super-cell and sub-cell bounding boxes, including the PBC shift */
3495 static void set_icell_bb_supersub(const float *bb,int ci,
3496                                   real shx,real shy,real shz,
3497                                   float *bb_ci)
3498 {
3499     int ia,m,i;
3500
3501 #ifdef NBNXN_BBXXXX
3502     ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
3503     for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
3504     {
3505         for(i=0; i<STRIDE_8BB; i++)
3506         {
3507             bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
3508             bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
3509             bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
3510             bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
3511             bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
3512             bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
3513         }
3514     }
3515 #else
3516     ia = ci*GPU_NSUBCELL*NNBSBB_B;
3517     for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
3518     {
3519         bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
3520         bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
3521         bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
3522         bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
3523         bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
3524         bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
3525     }
3526 #endif
3527 }
3528
3529 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
3530 static void icell_set_x_simple(int ci,
3531                                real shx,real shy,real shz,
3532                                int na_c,
3533                                int stride,const real *x,
3534                                nbnxn_list_work_t *work)
3535 {
3536     int  ia,i;
3537
3538     ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
3539
3540     for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
3541     {
3542         work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
3543         work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
3544         work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
3545     }
3546 }
3547
3548 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
3549 static void icell_set_x_supersub(int ci,
3550                                  real shx,real shy,real shz,
3551                                  int na_c,
3552                                  int stride,const real *x,
3553                                  nbnxn_list_work_t *work)
3554 {
3555     int  ia,i;
3556     real *x_ci;
3557
3558     x_ci = work->x_ci;
3559
3560     ia = ci*GPU_NSUBCELL*na_c;
3561     for(i=0; i<GPU_NSUBCELL*na_c; i++)
3562     {
3563         x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
3564         x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
3565         x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
3566     }
3567 }
3568
3569 #ifdef NBNXN_SEARCH_SSE
3570 /* Copies PBC shifted super-cell packed atom coordinates to working array */
3571 static void icell_set_x_supersub_sse8(int ci,
3572                                       real shx,real shy,real shz,
3573                                       int na_c,
3574                                       int stride,const real *x,
3575                                       nbnxn_list_work_t *work)
3576 {
3577     int  si,io,ia,i,j;
3578     real *x_ci;
3579
3580     x_ci = work->x_ci;
3581
3582     for(si=0; si<GPU_NSUBCELL; si++)
3583     {
3584         for(i=0; i<na_c; i+=STRIDE_8BB)
3585         {
3586             io = si*na_c + i;
3587             ia = ci*GPU_NSUBCELL*na_c + io;
3588             for(j=0; j<STRIDE_8BB; j++)
3589             {
3590                 x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
3591                 x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
3592                 x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
3593             }
3594         }
3595     }
3596 }
3597 #endif
3598
3599 static real nbnxn_rlist_inc_nonloc_fac = 0.6;
3600
3601 /* Due to the cluster size, the effective pair-list cut-off is longer than
3602  * that of a simple atom pair-list. This function gives the extra distance.
3603  */
3604 real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
3605 {
3606     return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
3607 }
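
/* Rough worked example (assumed, water-like values): for cluster_size = 8
 * and atom_density = 100 atoms/nm^3 the prefactor
 * (0.5+0.6)*((8-1)/8)^2 = 0.84 multiplies (8/100)^(1/3) = 0.43 nm,
 * giving an effective list increase of about 0.36 nm on top of rlist.
 */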
3608
3609 /* Estimates the interaction volume^2 for non-local interactions */
3610 static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
3611 {
3612     int  z,d;
3613     real cl,ca,za;
3614     real vold_est;
3615     real vol2_est_tot;
3616
3617     vol2_est_tot = 0;
3618
3619     /* Here we simply add up the non-home interaction volume^2 of the
3620      * 1, 2 or 3 zones with a single (1D) decomposition shift.
3621      * As these volumes are not additive, this is an overestimate,
3622      * but it is only significant in the limit of small cells, where
3623      * we anyhow need to split the lists into parts as small as possible.
3624      */
3625
3626     for(z=0; z<zones->n; z++)
3627     {
3628         if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
3629         {
3630             cl = 0;
3631             ca = 1;
3632             za = 1;
3633             for(d=0; d<DIM; d++)
3634             {
3635                 if (zones->shift[z][d] == 0)
3636                 {
3637                     cl += 0.5*ls[d];
3638                     ca *= ls[d];
3639                     za *= zones->size[z].x1[d] - zones->size[z].x0[d];
3640                 }
3641             }
3642
3643             /* 4 octants of a sphere */
3644             vold_est  = 0.25*M_PI*r*r*r*r;
3645             /* 4 quarter pie slices on the edges */
3646             vold_est += 4*cl*M_PI/6.0*r*r*r;
3647             /* One rectangular volume on a face */
3648             vold_est += ca*0.5*r*r;
3649
3650             vol2_est_tot += vold_est*za;
3651         }
3652     }
3653
3654     return vol2_est_tot;
3655 }
3656
3657 /* Estimates the average size of a full j-list for super/sub setup */
3658 static int get_nsubpair_max(const nbnxn_search_t nbs,
3659                             int iloc,
3660                             real rlist,
3661                             int min_ci_balanced)
3662 {
3663     const nbnxn_grid_t *grid;
3664     rvec ls;
3665     real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
3666     int  nsubpair_max;
3667
3668     grid = &nbs->grid[0];
3669
3670     ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
3671     ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
3672     ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
3673
3674     /* The average squared length of the diagonal of a sub cell */
3675     xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
3676
3677     /* The formula below is a heuristic estimate of the average nsj per si */
3678     r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
3679
3680     if (!nbs->DomDec || nbs->zones->n == 1)
3681     {
3682         nsp_est_nl = 0;
3683     }
3684     else
3685     {
3686         nsp_est_nl =
3687             sqr(grid->atom_density/grid->na_c)*
3688             nonlocal_vol2(nbs->zones,ls,r_eff_sup);
3689     }
3690
3691     if (LOCAL_I(iloc))
3692     {
3693         /* Sub-cell interacts with itself */
3694         vol_est  = ls[XX]*ls[YY]*ls[ZZ];
3695         /* 6/2 rectangular volume on the faces */
3696         vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
3697         /* 12/2 quarter pie slices on the edges */
3698         vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
3699         /* 4 octants of a sphere */
3700         vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
3701
3702         nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
3703
3704         /* Subtract the non-local pair count */
3705         nsp_est -= nsp_est_nl;
3706
3707         if (debug)
3708         {
3709             fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
3710                     nsp_est,nsp_est_nl);
3711         }
3712     }
3713     else
3714     {
3715         nsp_est = nsp_est_nl;
3716     }
3717
3718     if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
3719     {
3720         /* We don't need to balance the list sizes */
3721         nsubpair_max = -1;
3722     }
3723     else
3724     {
3725         /* Thus the (average) maximum j-list size should be as follows */
3726         nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
3727
3728         /* Since the target value is a maximum (this avoids high outliers,
3729          * which lead to load imbalance), not an average, we get more lists
3730          * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
3731          * But more importantly, the optimal GPU performance moves
3732          * to a lower number of blocks for very small blocks.
3733          * To compensate we add the maximum pair count per cj4.
3734          */
3735         nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
3736     }
3737
3738     if (debug)
3739     {
3740         fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
3741                 nsp_est,nsubpair_max);
3742     }
3743
3744     return nsubpair_max;
3745 }
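
/* Illustration of the balancing above (assumed numbers): with
 * nsp_est = 40000 and min_ci_balanced = 120 we get
 * nsubpair_max = round(40000/120) + GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE
 *              = 333 + 8*4 = 365
 * (for GPU_NSUBCELL = 8 and NBNXN_CPU_CLUSTER_I_SIZE = 4).
 */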
3746
3747 /* Debug list print function */
3748 static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3749 {
3750     int i,j;
3751
3752     for(i=0; i<nbl->nci; i++)
3753     {
3754         fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
3755                 nbl->ci[i].ci,nbl->ci[i].shift,
3756                 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
3757
3758         for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
3759         {
3760             fprintf(fp,"  cj %5d  excl %x\n",
3761                     nbl->cj[j].cj,
3762                     nbl->cj[j].excl);
3763         }
3764     }
3765 }
3766
3767 /* Debug list print function */
3768 static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3769 {
3770     int i,j4,j;
3771
3772     for(i=0; i<nbl->nsci; i++)
3773     {
3774         fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
3775                 nbl->sci[i].sci,nbl->sci[i].shift,
3776                 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
3777
3778         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
3779         {
3780             for(j=0; j<4; j++)
3781             {
3782                 fprintf(fp,"  sj %5d  imask %x\n",
3783                         nbl->cj4[j4].cj[j],
3784                         nbl->cj4[j4].imei[0].imask);
3785             }
3786         }
3787     }
3788 }
3789
3790 /* Combine the pair lists *nbl generated on multiple threads into nblc */
3791 static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
3792                             nbnxn_pairlist_t *nblc)
3793 {
3794     int nsci,ncj4,nexcl;
3795     int n,i;
3796
3797     if (nblc->bSimple)
3798     {
3799         gmx_incons("combine_nblists does not support simple lists");
3800     }
3801
3802     nsci  = nblc->nsci;
3803     ncj4  = nblc->ncj4;
3804     nexcl = nblc->nexcl;
3805     for(i=0; i<nnbl; i++)
3806     {
3807         nsci  += nbl[i]->nsci;
3808         ncj4  += nbl[i]->ncj4;
3809         nexcl += nbl[i]->nexcl;
3810     }
3811
3812     if (nsci > nblc->sci_nalloc)
3813     {
3814         nb_realloc_sci(nblc,nsci);
3815     }
3816     if (ncj4 > nblc->cj4_nalloc)
3817     {
3818         nblc->cj4_nalloc = over_alloc_small(ncj4);
3819         nbnxn_realloc_void((void **)&nblc->cj4,
3820                            nblc->ncj4*sizeof(*nblc->cj4),
3821                            nblc->cj4_nalloc*sizeof(*nblc->cj4),
3822                            nblc->alloc,nblc->free);
3823     }
3824     if (nexcl > nblc->excl_nalloc)
3825     {
3826         nblc->excl_nalloc = over_alloc_small(nexcl);
3827         nbnxn_realloc_void((void **)&nblc->excl,
3828                            nblc->nexcl*sizeof(*nblc->excl),
3829                            nblc->excl_nalloc*sizeof(*nblc->excl),
3830                            nblc->alloc,nblc->free);
3831     }
3832
3833     /* Each thread should copy its own data to the combined arrays,
3834      * as otherwise data will go back and forth between different caches.
3835      */
3836 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
3837     for(n=0; n<nnbl; n++)
3838     {
3839         int sci_offset;
3840         int cj4_offset;
3841         int ci_offset;
3842         int excl_offset;
3843         int i,j4;
3844         const nbnxn_pairlist_t *nbli;
3845
3846         /* Determine the offset in the combined data for our thread */
3847         sci_offset  = nblc->nsci;
3848         cj4_offset  = nblc->ncj4;
3849         ci_offset   = nblc->nci_tot;
3850         excl_offset = nblc->nexcl;
3851
3852         for(i=0; i<n; i++)
3853         {
3854             sci_offset  += nbl[i]->nsci;
3855             cj4_offset  += nbl[i]->ncj4;
3856             ci_offset   += nbl[i]->nci_tot;
3857             excl_offset += nbl[i]->nexcl;
3858         }
3859
3860         nbli = nbl[n];
3861
3862         for(i=0; i<nbli->nsci; i++)
3863         {
3864             nblc->sci[sci_offset+i]                = nbli->sci[i];
3865             nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
3866             nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
3867         }
3868
3869         for(j4=0; j4<nbli->ncj4; j4++)
3870         {
3871             nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
3872             nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
3873             nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
3874         }
3875
3876         for(j4=0; j4<nbli->nexcl; j4++)
3877         {
3878             nblc->excl[excl_offset+j4] = nbli->excl[j4];
3879         }
3880     }
3881
3882     for(n=0; n<nnbl; n++)
3883     {
3884         nblc->nsci    += nbl[n]->nsci;
3885         nblc->ncj4    += nbl[n]->ncj4;
3886         nblc->nci_tot += nbl[n]->nci_tot;
3887         nblc->nexcl   += nbl[n]->nexcl;
3888     }
3889 }
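
/* Illustration of the offsets in combine_nblists (assumed counts): if nblc
 * already holds 10 sci entries and nbl[0] and nbl[1] hold 7 and 5, thread
 * n = 1 copies its entries starting at sci_offset = 10 + 7 = 17. Each
 * thread thus writes a disjoint slice of the combined arrays, so the
 * parallel loop needs no synchronization.
 */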
3890
3891 /* Returns the next ci to be processed by our thread */
3892 static gmx_bool next_ci(const nbnxn_grid_t *grid,
3893                         int conv,
3894                         int nth,int ci_block,
3895                         int *ci_x,int *ci_y,
3896                         int *ci_b,int *ci)
3897 {
3898     (*ci_b)++;
3899     (*ci)++;
3900
3901     if (*ci_b == ci_block)
3902     {
3903         /* Jump to the next block assigned to this task */
3904         *ci   += (nth - 1)*ci_block;
3905         *ci_b  = 0;
3906     }
3907
3908     if (*ci >= grid->nc*conv)
3909     {
3910         return FALSE;
3911     }
3912
3913     while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
3914     {
3915         *ci_y += 1;
3916         if (*ci_y == grid->ncy)
3917         {
3918             *ci_x += 1;
3919             *ci_y  = 0;
3920         }
3921     }
3922
3923     return TRUE;
3924 }
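
/* Illustration of next_ci (assumed numbers): with nth = 4 threads,
 * ci_block = 2 and conv = 1, thread 0 visits ci = 0,1, 8,9, 16,17, ...
 * and thread 1 visits ci = 2,3, 10,11, ..., i.e. a block-cyclic
 * distribution with stride nth*ci_block = 8; the while loop then
 * advances ci_x/ci_y to the grid column containing the new ci.
 */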
3925
3926 /* Returns the distance^2 for which we put cell pairs in the list
3927  * without checking atom pair distances. This is usually < rlist^2.
3928  */
3929 static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
3930                                         const nbnxn_grid_t *gridj,
3931                                         real rlist,
3932                                         gmx_bool simple)
3933 {
3934     /* If the distance between two sub-cell bounding boxes is less
3935      * than this distance, do not check the distance between
3936      * all particle pairs in the sub-cell, since then it is likely
3937      * that the box pair has atom pairs within the cut-off.
3938      * We use the nblist cut-off minus 0.5 times the average x/y diagonal
3939      * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
3940      * Using more than 0.5 gains at most 0.5%.
3941      * If forces are calculated more than twice, the performance gain
3942      * in the force calculation outweighs the cost of checking.
3943      * Note that with subcell lists, the atom-pair distance check
3944      * is only performed when only 1 out of 8 sub-cells is within range;
3945      * this is because the GPU is much faster than the CPU.
3946      */
3947     real bbx,bby;
3948     real rbb2;
3949
3950     bbx = 0.5*(gridi->sx + gridj->sx);
3951     bby = 0.5*(gridi->sy + gridj->sy);
3952     if (!simple)
3953     {
3954         bbx /= GPU_NSUBCELL_X;
3955         bby /= GPU_NSUBCELL_Y;
3956     }
3957
3958     rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
3959
3960 #ifndef GMX_DOUBLE
3961     return rbb2;
3962 #else
3963     return (float)((1+GMX_FLOAT_EPS)*rbb2);
3964 #endif
3965 }
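
/* Numeric illustration (assumed values): for rlist = 1.0 nm and average
 * cell sizes bbx = bby = 0.25 nm, the x/y diagonal is
 * sqrt(2)*0.25 = 0.354 nm, so rbb2 = sqr(1.0 - 0.5*0.354) = 0.68 nm^2;
 * bounding-box pairs closer than this are listed without per-atom
 * distance checks.
 */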
3966
3967 /* Generates the part of pair-list nbl assigned to our thread */
3968 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
3969                                      const nbnxn_grid_t *gridi,
3970                                      const nbnxn_grid_t *gridj,
3971                                      nbnxn_search_work_t *work,
3972                                      const nbnxn_atomdata_t *nbat,
3973                                      const t_blocka *excl,
3974                                      real rlist,
3975                                      int nb_kernel_type,
3976                                      int nsubpair_max,
3977                                      gmx_bool progBal,
3978                                      int min_ci_balanced,
3979                                      int th,int nth,
3980                                      nbnxn_pairlist_t *nbl)
3981 {
3982     int  na_cj_2log;
3983     matrix box;
3984     real rl2;
3985     float rbb2;
3986     int  d;
3987     int  ci_block,ci_b,ci,ci_x,ci_y,ci_xy,cj;
3988     ivec shp;
3989     int  tx,ty,tz;
3990     int  shift;
3991     gmx_bool bMakeList;
3992     real shx,shy,shz;
3993     int  conv_i,cell0_i;
3994     const float *bb_i,*bbcz_i,*bbcz_j;
3995     const int *flags_i;
3996     real bx0,bx1,by0,by1,bz0,bz1;
3997     real bz1_frac;
3998     real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
3999     int  cxf,cxl,cyf,cyf_x,cyl;
4000     int  cx,cy;
4001     int  c0,c1,cs,cf,cl;
4002     int  ndistc;
4003     int  ncpcheck;
4004
4005     nbs_cycle_start(&work->cc[enbsCCsearch]);
4006
4007     if (gridj->bSimple != nbl->bSimple)
4008     {
4009         gmx_incons("Grid incompatible with pair-list");
4010     }
4011
4012     sync_work(nbl);
4013
4014     nbl->na_sc = gridj->na_sc;
4015     nbl->na_ci = gridj->na_c;
4016     nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4017     na_cj_2log = get_2log(nbl->na_cj);
4018
4019     nbl->rlist  = rlist;
4020
4021     copy_mat(nbs->box,box);
4022
4023     rl2 = nbl->rlist*nbl->rlist;
4024
4025     rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
4026
4027     if (debug)
4028     {
4029         fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
4030     }
4031
4032     /* Set the shift range */
4033     for(d=0; d<DIM; d++)
4034     {
4035         /* Check if we need periodicity shifts.
4036          * Without PBC, or with domain decomposition along this dimension, we don't need them.
4037          */
4038         if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
4039         {
4040             shp[d] = 0;
4041         }
4042         else
4043         {
4044             if (d == XX &&
4045                 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4046             {
4047                 shp[d] = 2;
4048             }
4049             else
4050             {
4051                 shp[d] = 1;
4052             }
4053         }
4054     }
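
    /* Example (assumed, rectangular box without DD): all three dimensions
     * fall through to shp[d] = 1, so the shift loops below cover
     * tz,ty,tx in {-1,0,1}; shp[XX] becomes 2 only for strongly skewed
     * triclinic boxes, where a single x-shift may not be enough.
     */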
4055
4056     if (nbl->bSimple && !gridi->bSimple)
4057     {
4058         conv_i  = gridi->na_sc/gridj->na_sc;
4059         bb_i    = gridi->bb_simple;
4060         bbcz_i  = gridi->bbcz_simple;
4061         flags_i = gridi->flags_simple;
4062     }
4063     else
4064     {
4065         conv_i  = 1;
4066         bb_i    = gridi->bb;
4067         bbcz_i  = gridi->bbcz;
4068         flags_i = gridi->flags;
4069     }
4070     cell0_i = gridi->cell0*conv_i;
4071
4072     bbcz_j = gridj->bbcz;
4073
4074     if (conv_i == 1)
4075     {
4076 #define CI_BLOCK_ENUM    5
4077 #define CI_BLOCK_DENOM  11
4078         /* Here we decide how to distribute the blocks over the threads.
4079          * We use prime numbers to try to avoid that the grid size becomes
4080          * a multiple of the number of threads, which would lead to some
4081          * threads getting "inner" pairs and others getting boundary pairs,
4082          * which in turn will lead to load imbalance between threads.
4083          * Set the block size as 5/11/ntask times the average number of cells
4084          * in a y,z slab. This should ensure a quite uniform distribution
4085          * of the grid parts of the different threads along all three grid
4086          * zone boundaries with 3D domain decomposition. At the same time
4087          * the blocks will not become too small.
4088          */
4089         ci_block = (gridi->nc*CI_BLOCK_ENUM)/(CI_BLOCK_DENOM*gridi->ncx*nth);
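        /* Illustration (assumed numbers): for gridi->nc = 1000 cells,
         * gridi->ncx = 10 columns and nth = 4 threads this gives
         * ci_block = 1000*5/(11*10*4) = 11 cells per block.
         */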
4090
4091         /* Ensure the blocks are not too small: avoids cache invalidation */
4092         if (ci_block*gridi->na_sc < 16)
4093         {
4094             ci_block = (16 + gridi->na_sc - 1)/gridi->na_sc;
4095         }
4096
4097         /* Without domain decomposition
4098          * or with fewer than 3 blocks per task, divide into nth blocks.
4099          */
4100         if (!nbs->DomDec || ci_block*3*nth > gridi->nc)
4101         {
4102             ci_block = (gridi->nc + nth - 1)/nth;
4103         }
4104     }
4105     else
4106     {
4107         /* Blocks of the conversion factor - 1 give a large repeat count
4108          * combined with a small block size. This should result in good
4109          * load balancing for both small and large domains.
4110          */
4111         ci_block = conv_i - 1;
4112     }
4113     if (debug)
4114     {
4115         fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4116                 gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
4117     }
4118
4119     ndistc = 0;
4120     ncpcheck = 0;
4121
4122     ci_b = -1;
4123     ci   = th*ci_block - 1;
4124     ci_x = 0;
4125     ci_y = 0;
4126     while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
4127     {
4128         if (nbl->bSimple && flags_i[ci] == 0)
4129         {
4130             continue;
4131         }
4132
4133         d2cx = 0;
4134         if (gridj != gridi && shp[XX] == 0)
4135         {
4136             if (nbl->bSimple)
4137             {
4138                 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
4139             }
4140             else
4141             {
4142                 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4143             }
4144             if (bx1 < gridj->c0[XX])
4145             {
4146                 d2cx = sqr(gridj->c0[XX] - bx1);
4147
4148                 if (d2cx >= rl2)
4149                 {
4150                     continue;
4151                 }
4152             }
4153         }
4154
4155         ci_xy = ci_x*gridi->ncy + ci_y;
4156
4157         /* Loop over shift vectors in three dimensions */
4158         for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
4159         {
4160             shz = tz*box[ZZ][ZZ];
4161
4162             bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
4163             bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4164
4165             if (tz == 0)
4166             {
4167                 d2z = 0;
4168             }
4169             else if (tz < 0)
4170             {
4171                 d2z = sqr(bz1);
4172             }
4173             else
4174             {
4175                 d2z = sqr(bz0 - box[ZZ][ZZ]);
4176             }
4177
4178             d2z_cx = d2z + d2cx;
4179
4180             if (d2z_cx >= rl2)
4181             {
4182                 continue;
4183             }
4184
4185             bz1_frac =
4186                 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
4187             if (bz1_frac < 0)
4188             {
4189                 bz1_frac = 0;
4190             }
4191             /* The check with bz1_frac close to or larger than 1 comes later */
4192
4193             for (ty=-shp[YY]; ty<=shp[YY]; ty++)
4194             {
4195                 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
4196
4197                 if (nbl->bSimple)
4198                 {
4199                     by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
4200                     by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
4201                 }
4202                 else
4203                 {
4204                     by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
4205                     by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
4206                 }
4207
4208                 get_cell_range(by0,by1,
4209                                gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
4210                                d2z_cx,rl2,
4211                                &cyf,&cyl);
4212
4213                 if (cyf > cyl)
4214                 {
4215                     continue;
4216                 }
4217
4218                 d2z_cy = d2z;
4219                 if (by1 < gridj->c0[YY])
4220                 {
4221                     d2z_cy += sqr(gridj->c0[YY] - by1);
4222                 }
4223                 else if (by0 > gridj->c1[YY])
4224                 {
4225                     d2z_cy += sqr(by0 - gridj->c1[YY]);
4226                 }
4227
4228                 for (tx=-shp[XX]; tx<=shp[XX]; tx++)
4229                 {
4230                     shift = XYZ2IS(tx,ty,tz);
4231
4232 #ifdef NBNXN_SHIFT_BACKWARD
4233                     if (gridi == gridj && shift > CENTRAL)
4234                     {
4235                         continue;
4236                     }
4237 #endif
4238
4239                     shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
4240
4241                     if (nbl->bSimple)
4242                     {
4243                         bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
4244                         bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
4245                     }
4246                     else
4247                     {
4248                         bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
4249                         bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
4250                     }
4251
4252                     get_cell_range(bx0,bx1,
4253                                    gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
4254                                    d2z_cy,rl2,
4255                                    &cxf,&cxl);
4256
4257                     if (cxf > cxl)
4258                     {
4259                         continue;
4260                     }
4261
4262                     if (nbl->bSimple)
4263                     {
4264                         new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4265                                      nbl->work);
4266                     }
4267                     else
4268                     {
4269                         new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4270                                       nbl->work);
4271                     }
4272
4273 #ifndef NBNXN_SHIFT_BACKWARD
4274                     if (cxf < ci_x)
4275 #else
4276                     if (shift == CENTRAL && gridi == gridj &&
4277                         cxf < ci_x)
4278 #endif
4279                     {
4280                         /* Leave the pairs with i > j.
4281                          * x is the major index, so skip half of it.
4282                          */
4283                         cxf = ci_x;
4284                     }
4285
4286                     if (nbl->bSimple)
4287                     {
4288                         set_icell_bb_simple(bb_i,ci,shx,shy,shz,
4289                                             nbl->work->bb_ci);
4290                     }
4291                     else
4292                     {
4293                         set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
4294                                               nbl->work->bb_ci);
4295                     }
4296
4297                     nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
4298                                      gridi->na_c,nbat->xstride,nbat->x,
4299                                      nbl->work);
4300
4301                     for(cx=cxf; cx<=cxl; cx++)
4302                     {
4303                         d2zx = d2z;
4304                         if (gridj->c0[XX] + cx*gridj->sx > bx1)
4305                         {
4306                             d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
4307                         }
4308                         else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
4309                         {
4310                             d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
4311                         }
4312
4313 #ifndef NBNXN_SHIFT_BACKWARD
4314                         if (gridi == gridj &&
4315                             cx == 0 && cyf < ci_y)
4316 #else
4317                         if (gridi == gridj &&
4318                             cx == 0 && shift == CENTRAL && cyf < ci_y)
4319 #endif
4320                         {
4321                             /* Leave the pairs with i > j.
4322                              * Skip half of y when i and j have the same x.
4323                              */
4324                             cyf_x = ci_y;
4325                         }
4326                         else
4327                         {
4328                             cyf_x = cyf;
4329                         }
4330
4331                         for(cy=cyf_x; cy<=cyl; cy++)
4332                         {
4333                             c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
4334                             c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
4335 #ifdef NBNXN_SHIFT_BACKWARD
4336                             if (gridi == gridj &&
4337                                 shift == CENTRAL && c0 < ci)
4338                             {
4339                                 c0 = ci;
4340                             }
4341 #endif
4342
4343                             d2zxy = d2zx;
4344                             if (gridj->c0[YY] + cy*gridj->sy > by1)
4345                             {
4346                                 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
4347                             }
4348                             else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
4349                             {
4350                                 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
4351                             }
4352                             if (c1 > c0 && d2zxy < rl2)
4353                             {
4354                                 cs = c0 + (int)(bz1_frac*(c1 - c0));
4355                                 if (cs >= c1)
4356                                 {
4357                                     cs = c1 - 1;
4358                                 }
4359
4360                                 d2xy = d2zxy - d2z;
4361
4362                                 /* Find the lowest cell that can possibly
4363                                  * be within range.
4364                                  */
4365                                 cf = cs;
4366                                 while(cf > c0 &&
4367                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
4368                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
4369                                 {
4370                                     cf--;
4371                                 }
4372
4373                                 /* Find the highest cell that can possibly
4374                                  * be within range.
4375                                  */
4376                                 cl = cs;
4377                                 while(cl < c1-1 &&
4378                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
4379                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
4380                                 {
4381                                     cl++;
4382                                 }
4383
4384 #ifdef NBNXN_REFCODE
4385                                 {
4386                                     /* Simple reference code, for debugging,
4387                                      * overrides the more complex code above.
4388                                      */
4389                                     int k;
4390                                     cf = c1;
4391                                     cl = -1;
4392                                     for(k=c0; k<c1; k++)
4393                                     {
4394                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4395                                                       bb+k*NNBSBB_B) < rl2 &&
4396                                             k < cf)
4397                                         {
4398                                             cf = k;
4399                                         }
4400                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4401                                                       bb+k*NNBSBB_B) < rl2 &&
4402                                             k > cl)
4403                                         {
4404                                             cl = k;
4405                                         }
4406                                     }
4407                                 }
4408 #endif
4409
4410                                 if (gridi == gridj)
4411                                 {
4412                                     /* We want each atom/cell pair only once,
4413                                      * only use cj >= ci.
4414                                      */
4415 #ifndef NBNXN_SHIFT_BACKWARD
4416                                     cf = max(cf,ci);
4417 #else
4418                                     if (shift == CENTRAL)
4419                                     {
4420                                         cf = max(cf,ci);
4421                                     }
4422 #endif
4423                                 }
4424
4425                                 if (cf <= cl)
4426                                 {
4427                                     switch (nb_kernel_type)
4428                                     {
4429                                     case nbk4x4_PlainC:
4430                                         check_subcell_list_space_simple(nbl,cl-cf+1);
4431
4432                                         make_cluster_list_simple(gridj,
4433                                                                  nbl,ci,cf,cl,
4434                                                                  (gridi == gridj && shift == CENTRAL),
4435                                                                  nbat->x,
4436                                                                  rl2,rbb2,
4437                                                                  &ndistc);
4438                                         break;
4439 #ifdef NBNXN_SEARCH_SSE
4440                                     case nbk4xN_X86_SIMD128:
4441                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4442                                         make_cluster_list_x86_simd128(gridj,
4443                                                                       nbl,ci,cf,cl,
4444                                                                       (gridi == gridj && shift == CENTRAL),
4445                                                                       nbat->x,
4446                                                                       rl2,rbb2,
4447                                                                       &ndistc);
4448                                         break;
4449 #ifdef GMX_X86_AVX_256
4450                                     case nbk4xN_X86_SIMD256:
4451                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4452                                         make_cluster_list_x86_simd256(gridj,
4453                                                                       nbl,ci,cf,cl,
4454                                                                       (gridi == gridj && shift == CENTRAL),
4455                                                                       nbat->x,
4456                                                                       rl2,rbb2,
4457                                                                       &ndistc);
4458                                         break;
4459 #endif
4460 #endif
4461                                     case nbk8x8x8_PlainC:
4462                                     case nbk8x8x8_CUDA:
4463                                         check_subcell_list_space_supersub(nbl,cl-cf+1);
4464                                         for(cj=cf; cj<=cl; cj++)
4465                                         {
4466                                             make_cluster_list_supersub(nbs,gridi,gridj,
4467                                                                        nbl,ci,cj,
4468                                                                        (gridi == gridj && shift == CENTRAL && ci == cj),
4469                                                                        nbat->xstride,nbat->x,
4470                                                                        rl2,rbb2,
4471                                                                        &ndistc);
4472                                         }
4473                                         break;
4474                                     }
4475                                     ncpcheck += cl - cf + 1;
4476                                 }
4477                             }
4478                         }
4479                     }
4480
4481                     /* Set the exclusions for this ci list */
4482                     if (nbl->bSimple)
4483                     {
4484                         set_ci_top_excls(nbs,
4485                                          nbl,
4486                                          shift == CENTRAL && gridi == gridj,
4487                                          gridj->na_c_2log,
4488                                          na_cj_2log,
4489                                          &(nbl->ci[nbl->nci]),
4490                                          excl);
4491                     }
4492                     else
4493                     {
4494                         set_sci_top_excls(nbs,
4495                                           nbl,
4496                                           shift == CENTRAL && gridi == gridj,
4497                                           gridj->na_c_2log,
4498                                           &(nbl->sci[nbl->nsci]),
4499                                           excl);
4500                     }
4501
4502                     /* Close this ci list */
4503                     if (nbl->bSimple)
4504                     {
4505                         close_ci_entry_simple(nbl);
4506                     }
4507                     else
4508                     {
4509                         close_ci_entry_supersub(nbl,
4510                                                 nsubpair_max,
4511                                                 progBal,min_ci_balanced,
4512                                                 th,nth);
4513                     }
4514                 }
4515             }
4516         }
4517     }
4518
4519     work->ndistc = ndistc;
4520
4521     nbs_cycle_stop(&work->cc[enbsCCsearch]);
4522
4523     if (debug)
4524     {
4525         fprintf(debug,"number of distance checks %d\n",ndistc);
4526         fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
4527                 ncpcheck);
4528
4529         if (nbl->bSimple)
4530         {
4531             print_nblist_statistics_simple(debug,nbl,nbs,rlist);
4532         }
4533         else
4534         {
4535             print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
4536         }
4537
4538     }
4539 }
4540
4541 /* Make a local or non-local pair-list, depending on iloc */
4542 void nbnxn_make_pairlist(const nbnxn_search_t nbs,
4543                          const nbnxn_atomdata_t *nbat,
4544                          const t_blocka *excl,
4545                          real rlist,
4546                          int min_ci_balanced,
4547                          nbnxn_pairlist_set_t *nbl_list,
4548                          int iloc,
4549                          int nb_kernel_type,
4550                          t_nrnb *nrnb)
4551 {
4552     const nbnxn_grid_t *gridi,*gridj;
4553     int nzi,zi,zj0,zj1,zj;
4554     int nsubpair_max;
4555     int nth,th;
4556     int nnbl;
4557     nbnxn_pairlist_t **nbl;
4558     gmx_bool CombineNBLists;
4559     int np_tot,np_noq,np_hlj,nap;
4560
4561     nnbl            = nbl_list->nnbl;
4562     nbl             = nbl_list->nbl;
4563     CombineNBLists  = nbl_list->bCombined;
4564
4565     if (debug)
4566     {
4567         fprintf(debug,"ns making %d nblists\n", nnbl);
4568     }
4569
4570     if (nbl_list->bSimple)
4571     {
4572         switch (nb_kernel_type)
4573         {
4574 #ifdef NBNXN_SEARCH_SSE
4575         case nbk4xN_X86_SIMD128:
4576             nbs->icell_set_x = icell_set_x_x86_simd128;
4577             break;
4578 #ifdef GMX_X86_AVX_256
4579         case nbk4xN_X86_SIMD256:
4580             nbs->icell_set_x = icell_set_x_x86_simd256;
4581             break;
4582 #endif
4583 #endif
4584         default:
4585             nbs->icell_set_x = icell_set_x_simple;
4586             break;
4587         }
4588     }
4589     else
4590     {
4591 #ifdef NBNXN_SEARCH_SSE
4592         nbs->icell_set_x = icell_set_x_supersub_sse8;
4593 #else
4594         nbs->icell_set_x = icell_set_x_supersub;
4595 #endif
4596     }
4597
4598     if (LOCAL_I(iloc))
4599     {
4600         /* Only zone (grid) 0 vs 0 */
4601         nzi = 1;
4602         zj0 = 0;
4603         zj1 = 1;
4604     }
4605     else
4606     {
4607         nzi = nbs->zones->nizone;
4608     }
4609
4610     if (!nbl_list->bSimple && min_ci_balanced > 0)
4611     {
4612         nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
4613     }
4614     else
4615     {
4616         nsubpair_max = 0;
4617     }
4618
4619     /* Clear all pair-lists */
4620     for(th=0; th<nnbl; th++)
4621     {
4622         clear_pairlist(nbl[th]);
4623     }
4624
4625     for(zi=0; zi<nzi; zi++)
4626     {
4627         gridi = &nbs->grid[zi];
4628
4629         if (NONLOCAL_I(iloc))
4630         {
4631             zj0 = nbs->zones->izone[zi].j0;
4632             zj1 = nbs->zones->izone[zi].j1;
4633             if (zi == 0)
4634             {
4635                 zj0++;
4636             }
4637         }
4638         for(zj=zj0; zj<zj1; zj++)
4639         {
4640             gridj = &nbs->grid[zj];
4641
4642             if (debug)
4643             {
4644                 fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
4645             }
4646
4647             nbs_cycle_start(&nbs->cc[enbsCCsearch]);
4648
4649 #pragma omp parallel for num_threads(nnbl) schedule(static)
4650             for(th=0; th<nnbl; th++)
4651             {
4652                 if (CombineNBLists && th > 0)
4653                 {
4654                     clear_pairlist(nbl[th]);
4655                 }
4656
4657                 /* Divide the i super cell equally over the nblists */
4658                 nbnxn_make_pairlist_part(nbs,gridi,gridj,
4659                                          &nbs->work[th],nbat,excl,
4660                                          rlist,
4661                                          nb_kernel_type,
4662                                          nsubpair_max,
4663                                          (LOCAL_I(iloc) || nbs->zones->n <= 2),
4664                                          min_ci_balanced,
4665                                          th,nnbl,
4666                                          nbl[th]);
4667             }
4668             nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
4669
4670             np_tot = 0;
4671             np_noq = 0;
4672             np_hlj = 0;
4673             for(th=0; th<nnbl; th++)
4674             {
4675                 inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
4676
4677                 if (nbl_list->bSimple)
4678                 {
4679                     np_tot += nbl[th]->ncj;
4680                     np_noq += nbl[th]->work->ncj_noq;
4681                     np_hlj += nbl[th]->work->ncj_hlj;
4682                 }
4683                 else
4684                 {
4685                     /* This count ignores potential subsequent pair pruning */
4686                     np_tot += nbl[th]->nci_tot;
4687                 }
4688             }
4689             nap = nbl[0]->na_ci*nbl[0]->na_cj;
4690             nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
4691             nbl_list->natpair_lj  = np_noq*nap;
4692             nbl_list->natpair_q   = np_hlj*nap/2;
4693
4694             if (CombineNBLists && nnbl > 1)
4695             {
4696                 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
4697
4698                 combine_nblists(nnbl-1,nbl+1,nbl[0]);
4699
4700                 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
4701             }
4702
4703         }
4704     }
4705
4706     /*
4707     print_supersub_nsp("nsubpair",nbl[0],iloc);
4708     */
4709
4710     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4711     if (LOCAL_I(iloc))
4712     {
4713         nbs->search_count++;
4714     }
4715     if (nbs->print_cycles &&
4716         (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
4717         nbs->search_count % 100 == 0)
4718     {
4719         nbs_cycle_print(stderr,nbs);
4720     }
4721
4722     if (debug && (CombineNBLists && nnbl > 1))
4723     {
4724         if (nbl[0]->bSimple)
4725         {
4726             print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
4727         }
4728         else
4729         {
4730             print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
4731         }
4732     }
4733
4734     if (gmx_debug_at)
4735     {
4736         if (nbl[0]->bSimple)
4737         {
4738             print_nblist_ci_cj(debug,nbl[0]);
4739         }
4740         else
4741         {
4742             print_nblist_sci_cj(debug,nbl[0]);
4743         }
4744     }
4745 }