Update copyright statements and change license to LGPL
src/mdlib/nbnxn_search.c
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>
#include "sysstuff.h"
#include "smalloc.h"
#include "macros.h"
#include "maths.h"
#include "vec.h"
#include "pbc.h"
#include "nbnxn_consts.h"
#include "nbnxn_internal.h"
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "gmx_cyclecounter.h"
#include "gmxfio.h"
#include "gmx_omp_nthreads.h"
#include "nrnb.h"


/* Pair search box lower and upper corner in x,y,z.
 * Store this in 4 instead of 3 reals, which is useful with SSE.
 * To avoid complicating the code we also use 4 without SSE.
 */
#define NNBSBB_C         4
#define NNBSBB_B         (2*NNBSBB_C)
/* Pair search box lower and upper bound in z only. */
#define NNBSBB_D         2
/* Pair search box lower and upper corner x,y,z indices */
#define BBL_X  0
#define BBL_Y  1
#define BBL_Z  2
#define BBU_X  4
#define BBU_Y  5
#define BBU_Z  6
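
/* Illustration (added; not part of the search code): one bounding box
 * occupies NNBSBB_B = 8 floats, with the lower corner at offsets 0..2
 * and the upper corner at offsets 4..6; offsets 3 and 7 are padding for
 * aligned 4-wide SSE loads. A minimal sketch, assuming bb points at a
 * single box, of reading the box extent along dimension d (0=x,1=y,2=z);
 * the helper name is hypothetical:
 */
static float bb_extent_d(const float *bb,int d)
{
    /* upper corner minus lower corner along dimension d */
    return bb[NNBSBB_C+d] - bb[d];
}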


#ifdef NBNXN_SEARCH_SSE

#ifndef GMX_DOUBLE
#define NBNXN_SEARCH_SSE_SINGLE
#include "gmx_x86_simd_single.h"
#else
#include "gmx_x86_simd_double.h"
#endif

#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
#define NBNXN_8BB_SSE
#endif

/* The width of SSE/AVX-128 with single precision for bounding boxes with GPU.
 * Here AVX-256 turns out to be slightly slower than AVX-128.
 */
#define STRIDE_8BB        4
#define STRIDE_8BB_2LOG   2


/* The functions below are macros as they are performance sensitive */

/* 4x4 list, pack=4: no complex conversion required */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J4(ci)   (ci)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)

/* 4x2 list, pack=4: j-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J2(ci)  ((ci)<<1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))

/* 4x8 list, pack=8: i-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J8(ci)  ((ci)>>1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
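
/* Worked example (added for clarity): for the 4x2 layout, j-cluster 5 is
 * the second half of packed-4 block 2, so
 * X_IND_CJ_J2(5) = (5>>1)*STRIDE_P4 + (5&1)*(PACK_X4>>1) = 2*STRIDE_P4 + 2.
 * For the 4x8 layout two i-clusters share one packed-8 block, so
 * X_IND_CI_J8(3) = (3>>1)*STRIDE_P8 + (3&1)*(PACK_X8>>1) = STRIDE_P8 + 4.
 */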

/* The j-cluster size is matched to the SIMD width */
#ifndef GMX_DOUBLE
/* 128 bits can hold 4 floats */
#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
#define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
#define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
/* 256 bits can hold 8 floats */
#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
#define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
#define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
#else
/* 128 bits can hold 2 doubles */
#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
#define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
#define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
/* 256 bits can hold 4 doubles */
#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
#define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
#define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
#endif

#endif /* NBNXN_SEARCH_SSE */


/* Interaction masks for 4xN atom interactions.
 * Bit i*CJ_SIZE + j tells whether atoms i and j interact.
 */
/* The "all interactions" mask is the same for all kernels */
#define NBNXN_INT_MASK_ALL        0xffffffff
/* 4x4 kernel diagonal mask */
#define NBNXN_INT_MASK_DIAG       0x08ce
/* 4x2 kernel diagonal masks */
#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
/* 4x8 kernel diagonal masks */
#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
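
/* A minimal sketch (added for illustration; not used by the search code)
 * of how the diagonal masks above can be derived. cj_atom0 is the index
 * of the j-cluster's first atom relative to the i-cluster's first atom;
 * bit i*cj_size+j is set when the j-atom comes after the i-atom, which
 * excludes self-pairs and double-counted pairs on the diagonal.
 * diag_mask(4,4,0)  == NBNXN_INT_MASK_DIAG,
 * diag_mask(4,2,0)  == NBNXN_INT_MASK_DIAG_J2_0,
 * diag_mask(4,2,2)  == NBNXN_INT_MASK_DIAG_J2_1,
 * diag_mask(4,8,0)  == NBNXN_INT_MASK_DIAG_J8_0,
 * diag_mask(4,8,-4) == NBNXN_INT_MASK_DIAG_J8_1.
 */
static unsigned int diag_mask(int ci_size,int cj_size,int cj_atom0)
{
    unsigned int mask;
    int          i,j;

    mask = 0;
    for(i=0; i<ci_size; i++)
    {
        for(j=0; j<cj_size; j++)
        {
            if (cj_atom0 + j > i)
            {
                mask |= (1U << (i*cj_size + j));
            }
        }
    }

    return mask;
}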


#ifdef NBNXN_SEARCH_SSE
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
/* Size of bounding box corners quadruplet */
#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
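
/* Layout note (added for clarity): one NNBSBB_XXXX group of 2*3*4 = 24
 * floats packs the boxes of STRIDE_8BB (sub-)cells as
 * xlo[4] ylo[4] zlo[4] xhi[4] yhi[4] zhi[4], so corner component c (0..5)
 * of packed cell i sits at bb[c*STRIDE_8BB + i]; see
 * calc_bounding_box_xxxx() below.
 */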
#endif

/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
#define NBNXN_SHIFT_BACKWARD


/* This define is a lazy way to avoid interdependence of the grid
 * and searching data structures.
 */
#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)


static void nbs_cycle_clear(nbnxn_cycle_t *cc)
{
    int i;

    for(i=0; i<enbsCCnr; i++)
    {
        cc[i].count = 0;
        cc[i].c     = 0;
    }
}

static double Mcyc_av(const nbnxn_cycle_t *cc)
{
    return (double)cc->c*1e-6/cc->count;
}

static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
{
    int n;
    int t;

    fprintf(fp,"\n");
    fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
            nbs->cc[enbsCCgrid].count,
            Mcyc_av(&nbs->cc[enbsCCgrid]),
            Mcyc_av(&nbs->cc[enbsCCsearch]),
            Mcyc_av(&nbs->cc[enbsCCreducef]));

    if (nbs->nthread_max > 1)
    {
        if (nbs->cc[enbsCCcombine].count > 0)
        {
            fprintf(fp," comb %5.2f",
                    Mcyc_av(&nbs->cc[enbsCCcombine]));
        }
        fprintf(fp," s. th");
        for(t=0; t<nbs->nthread_max; t++)
        {
            fprintf(fp," %4.1f",
                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
        }
    }
    fprintf(fp,"\n");
}

static void nbnxn_grid_init(nbnxn_grid_t * grid)
{
    grid->cxy_na      = NULL;
    grid->cxy_ind     = NULL;
    grid->cxy_nalloc  = 0;
    grid->bb          = NULL;
    grid->bbj         = NULL;
    grid->nc_nalloc   = 0;
}

static int get_2log(int n)
{
    int log2;

    log2 = 0;
    while ((1<<log2) < n)
    {
        log2++;
    }
    if ((1<<log2) != n)
    {
        gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
    }

    return log2;
}
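
/* For example, get_2log(4) == 2 and get_2log(8) == 3, while a
 * non-power-of-2 cluster size such as 6 aborts with gmx_fatal.
 */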

static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
{
    switch (nb_kernel_type)
    {
    case nbk4x4_PlainC:
    case nbk4xN_X86_SIMD128:
    case nbk4xN_X86_SIMD256:
        return NBNXN_CPU_CLUSTER_I_SIZE;
    case nbk8x8x8_CUDA:
    case nbk8x8x8_PlainC:
        /* The cluster size for super/sub lists is only set here.
         * Any value should work for the pair-search and atomdata code.
         * The kernels, of course, might require a particular value.
         */
        return NBNXN_GPU_CLUSTER_SIZE;
    default:
        gmx_incons("unknown kernel type");
    }

    return 0;
}

int nbnxn_kernel_to_cj_size(int nb_kernel_type)
{
    switch (nb_kernel_type)
    {
    case nbk4x4_PlainC:
        return NBNXN_CPU_CLUSTER_I_SIZE;
    case nbk4xN_X86_SIMD128:
        /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
        return 16/sizeof(real);
    case nbk4xN_X86_SIMD256:
        /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
        return 32/sizeof(real);
    case nbk8x8x8_CUDA:
    case nbk8x8x8_PlainC:
        return nbnxn_kernel_to_ci_size(nb_kernel_type);
    default:
        gmx_incons("unknown kernel type");
    }

    return 0;
}
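
/* Concretely: with 4-byte floats a 128-bit kernel uses j-clusters of
 * 16/4 = 4 atoms and a 256-bit kernel of 32/4 = 8 atoms; with 8-byte
 * doubles the same kernels use 2 and 4 atoms. This is exactly what the
 * CI_TO_CJ_S128/S256 macros above encode.
 */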

static int ci_to_cj(int na_cj_2log,int ci)
{
    switch (na_cj_2log)
    {
    case 2: return  ci;     break;
    case 1: return (ci<<1); break;
    case 3: return (ci>>1); break;
    }

    return 0;
}

gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
{
    if (nb_kernel_type == nbkNotSet)
    {
        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
    }

    switch (nb_kernel_type)
    {
    case nbk8x8x8_CUDA:
    case nbk8x8x8_PlainC:
        return FALSE;

    case nbk4x4_PlainC:
    case nbk4xN_X86_SIMD128:
    case nbk4xN_X86_SIMD256:
        return TRUE;

    default:
        gmx_incons("Invalid nonbonded kernel type passed!");
        return FALSE;
    }
}

void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
                       ivec *n_dd_cells,
                       gmx_domdec_zones_t *zones,
                       int nthread_max)
{
    nbnxn_search_t nbs;
    int d,g,t;

    snew(nbs,1);
    *nbs_ptr = nbs;

    nbs->DomDec = (n_dd_cells != NULL);

    clear_ivec(nbs->dd_dim);
    nbs->ngrid = 1;
    if (nbs->DomDec)
    {
        nbs->zones = zones;

        for(d=0; d<DIM; d++)
        {
            if ((*n_dd_cells)[d] > 1)
            {
                nbs->dd_dim[d] = 1;
                /* Each grid matches a DD zone */
                nbs->ngrid *= 2;
            }
        }
    }

    snew(nbs->grid,nbs->ngrid);
    for(g=0; g<nbs->ngrid; g++)
    {
        nbnxn_grid_init(&nbs->grid[g]);
    }
    nbs->cell        = NULL;
    nbs->cell_nalloc = 0;
    nbs->a           = NULL;
    nbs->a_nalloc    = 0;

    nbs->nthread_max = nthread_max;

    /* Initialize the work data structures for each thread */
    snew(nbs->work,nbs->nthread_max);
    for(t=0; t<nbs->nthread_max; t++)
    {
        nbs->work[t].cxy_na           = NULL;
        nbs->work[t].cxy_na_nalloc    = 0;
        nbs->work[t].sort_work        = NULL;
        nbs->work[t].sort_work_nalloc = 0;
    }

    /* Initialize detailed nbsearch cycle counting */
    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
    nbs->search_count = 0;
    nbs_cycle_clear(nbs->cc);
    for(t=0; t<nbs->nthread_max; t++)
    {
        nbs_cycle_clear(nbs->work[t].cc);
    }
}

static real grid_atom_density(int n,rvec corner0,rvec corner1)
{
    rvec size;

    rvec_sub(corner1,corner0,size);

    return n/(size[XX]*size[YY]*size[ZZ]);
}

static int set_grid_size_xy(const nbnxn_search_t nbs,
                            nbnxn_grid_t *grid,
                            int n,rvec corner0,rvec corner1,
                            real atom_density,
                            int XFormat)
{
    rvec size;
    int  na_c;
    real adens,tlen,tlen_x,tlen_y,nc_max;
    int  t;

    rvec_sub(corner1,corner0,size);

    if (n > grid->na_sc)
    {
        /* target cell length */
        if (grid->bSimple)
        {
            /* To minimize the zero interactions, we should make
             * the largest of the i/j cell cubic.
             */
            na_c = max(grid->na_c,grid->na_cj);

            /* Approximately cubic cells */
            tlen   = pow(na_c/atom_density,1.0/3.0);
            tlen_x = tlen;
            tlen_y = tlen;
        }
        else
        {
            /* Approximately cubic sub cells */
            tlen   = pow(grid->na_c/atom_density,1.0/3.0);
            tlen_x = tlen*GPU_NSUBCELL_X;
            tlen_y = tlen*GPU_NSUBCELL_Y;
        }
        /* We round ncx and ncy down, because we get fewer cell pairs
         * in the pair list when the fixed cell dimensions (x,y) are
         * larger than the variable one (z) than the other way around.
         */
        grid->ncx = max(1,(int)(size[XX]/tlen_x));
        grid->ncy = max(1,(int)(size[YY]/tlen_y));
    }
    else
    {
        grid->ncx = 1;
        grid->ncy = 1;
    }

    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
    {
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
        srenew(grid->cxy_na,grid->cxy_nalloc);
        srenew(grid->cxy_ind,grid->cxy_nalloc+1);
    }
    for(t=0; t<nbs->nthread_max; t++)
    {
        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
        {
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
            srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
        }
    }

    /* Worst case scenario of 1 atom in each last cell */
    if (grid->na_cj <= grid->na_c)
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
    }
    else
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
    }

    if (nc_max > grid->nc_nalloc)
    {
        int bb_nalloc;

        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc,grid->nc_nalloc);
        srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
#ifdef NBNXN_8BB_SSE
        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
#else
        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
#endif
        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, which avoids possible
         * floating-point exceptions in SSE with the unused bb elements.
         */
        snew_aligned(grid->bb,bb_nalloc,16);

        if (grid->bSimple)
        {
            if (grid->na_cj == grid->na_c)
            {
                grid->bbj = grid->bb;
            }
            else
            {
                sfree_aligned(grid->bbj);
                snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
            }
        }

        srenew(grid->flags,grid->nc_nalloc);
    }

    copy_rvec(corner0,grid->c0);
    copy_rvec(corner1,grid->c1);
    grid->sx = size[XX]/grid->ncx;
    grid->sy = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;

    return nc_max;
}
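
/* Rough numbers (added for illustration): at a water-like atom density
 * of ~100 atoms/nm^3 and na_c = 4, the target cell length is
 * tlen = (4/100)^(1/3) ~= 0.34 nm, so a 5 nm box edge gives
 * ncx = ncy = (int)(5/0.34) = 14 columns in that dimension.
 */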

#define SORT_GRID_OVERSIZE 2
#define SGSF (SORT_GRID_OVERSIZE + 1)

static void sort_atoms(int dim,gmx_bool Backwards,
                       int *a,int n,rvec *x,
                       real h0,real invh,int nsort,int *sort)
{
    int i,c;
    int zi,zim;
    int cp,tmp;

    if (n <= 1)
    {
        /* Nothing to do */
        return;
    }

    /* For small oversize factors clearing the whole area is fastest.
     * For large oversize we should clear the used elements after use.
     */
    for(i=0; i<nsort; i++)
    {
        sort[i] = -1;
    }
    /* Sort the particles using a simple index sort */
    for(i=0; i<n; i++)
    {
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        zi = (int)((x[a[i]][dim] - h0)*invh);

#ifdef DEBUG_NBNXN_GRIDDING
        if (zi < 0 || zi >= nsort)
        {
            gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
                      a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
        }
#endif

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order
             * to ensure binary reproducibility after restarts.
             */
            while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                    (x[a[i]][dim] == x[sort[zi]][dim] &&
                                     a[i] > sort[zi])))
            {
                zi++;
            }

            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                cp = sort[zi];
                zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    tmp = sort[zim];
                    sort[zim] = cp;
                    cp  = tmp;
                    zim++;
                }
                sort[zim] = cp;
            }
            sort[zi] = a[i];
        }
    }

    c = 0;
    if (!Backwards)
    {
        for(zi=0; zi<nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++] = sort[zi];
            }
        }
    }
    else
    {
        for(zi=nsort-1; zi>=0; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[c++] = sort[zi];
            }
        }
    }
    if (c < n)
    {
        gmx_incons("Lost particles while sorting");
    }
}
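
/* A minimal usage sketch (added for illustration; the function is
 * hypothetical and not called by the search code): sorting n = 3 atom
 * indices on z over a height of 1.0 uses nsort = n*SGSF = 9 slots and
 * invh = n*SORT_GRID_OVERSIZE/1.0 = 6 slots per length unit.
 */
static void sort_atoms_example(void)
{
    int  a[3] = { 0, 1, 2 };
    rvec x[3] = { { 0, 0, 0.5 }, { 0, 0, 0.1 }, { 0, 0, 0.3 } };
    int  sort[3*SGSF];

    sort_atoms(ZZ,FALSE,a,3,x,0.0,3*SORT_GRID_OVERSIZE/1.0,3*SGSF,sort);
    /* a is now { 1, 2, 0 }: the indices ordered by increasing z */
}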

#ifdef GMX_DOUBLE
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
#else
#define R2F_D(x) (x)
#define R2F_U(x) (x)
#endif
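
/* Note (added for clarity): in double precision the bounding-box corners
 * are stored as floats, so R2F_D nudges a value down and R2F_U nudges it
 * up by one float epsilon before the conversion. This keeps the float
 * box conservative: it still encloses the double-precision coordinates
 * after rounding.
 */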

/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na,int stride,const real *x,float *bb)
{
    int  i,j;
    real xl,xh,yl,yh,zl,zh;

    i = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[i+XX]);
        xh = max(xh,x[i+XX]);
        yl = min(yl,x[i+YY]);
        yh = max(yh,x[i+YY]);
        zl = min(zl,x[i+ZZ]);
        zh = max(zh,x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);
}

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
{
    int  j;
    real xl,xh,yl,yh,zl,zh;

    xl = x[XX*PACK_X4];
    xh = x[XX*PACK_X4];
    yl = x[YY*PACK_X4];
    yh = x[YY*PACK_X4];
    zl = x[ZZ*PACK_X4];
    zh = x[ZZ*PACK_X4];
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[j+XX*PACK_X4]);
        xh = max(xh,x[j+XX*PACK_X4]);
        yl = min(yl,x[j+YY*PACK_X4]);
        yh = max(yh,x[j+YY*PACK_X4]);
        zl = min(zl,x[j+ZZ*PACK_X4]);
        zh = max(zh,x[j+ZZ*PACK_X4]);
    }
    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);
}

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
{
    int  j;
    real xl,xh,yl,yh,zl,zh;

    xl = x[XX*PACK_X8];
    xh = x[XX*PACK_X8];
    yl = x[YY*PACK_X8];
    yh = x[YY*PACK_X8];
    zl = x[ZZ*PACK_X8];
    zh = x[ZZ*PACK_X8];
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[j+XX*PACK_X8]);
        xh = max(xh,x[j+XX*PACK_X8]);
        yl = min(yl,x[j+YY*PACK_X8]);
        yh = max(yh,x[j+YY*PACK_X8]);
        zl = min(zl,x[j+ZZ*PACK_X8]);
        zh = max(zh,x[j+ZZ*PACK_X8]);
    }
    /* Note: possible double to float conversion here */
    bb[BBL_X] = R2F_D(xl);
    bb[BBL_Y] = R2F_D(yl);
    bb[BBL_Z] = R2F_D(zl);
    bb[BBU_X] = R2F_U(xh);
    bb[BBU_Y] = R2F_U(yh);
    bb[BBU_Z] = R2F_U(zh);
}

#ifdef NBNXN_SEARCH_SSE

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na,const real *x,
                                          float *bb,float *bbj)
{
    calc_bounding_box_x_x4(min(na,2),x,bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
        _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
    }

    _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
                                        _mm_load_ps(bbj+NNBSBB_B)));
    _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
                                        _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
}

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
{
    int  i,j;
    real xl,xh,yl,yh,zl,zh;

    i = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for(j=1; j<na; j++)
    {
        xl = min(xl,x[i+XX]);
        xh = max(xh,x[i+XX]);
        yl = min(yl,x[i+YY]);
        yh = max(yh,x[i+YY]);
        zl = min(zl,x[i+ZZ]);
        zh = max(zh,x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0*STRIDE_8BB] = R2F_D(xl);
    bb[1*STRIDE_8BB] = R2F_D(yl);
    bb[2*STRIDE_8BB] = R2F_D(zl);
    bb[3*STRIDE_8BB] = R2F_U(xh);
    bb[4*STRIDE_8BB] = R2F_U(yh);
    bb[5*STRIDE_8BB] = R2F_U(zh);
}

#endif /* NBNXN_SEARCH_SSE */

#ifdef NBNXN_SEARCH_SSE_SINGLE

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_sse(int na,const float *x,float *bb)
{
    __m128 bb_0_SSE,bb_1_SSE;
    __m128 x_SSE;

    int  i;

    bb_0_SSE = _mm_load_ps(x);
    bb_1_SSE = bb_0_SSE;

    for(i=1; i<na; i++)
    {
        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
        bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
        bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
    }

    _mm_store_ps(bb  ,bb_0_SSE);
    _mm_store_ps(bb+4,bb_1_SSE);
}

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_sse(int na,const float *x,
                                       float *bb_work,
                                       real *bb)
{
    calc_bounding_box_sse(na,x,bb_work);

    bb[0*STRIDE_8BB] = bb_work[BBL_X];
    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
    bb[3*STRIDE_8BB] = bb_work[BBU_X];
    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
    bb[5*STRIDE_8BB] = bb_work[BBU_Z];
}

#endif /* NBNXN_SEARCH_SSE_SINGLE */

#ifdef NBNXN_SEARCH_SSE

/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
{
    int    i,j,sc2,nc2,c2;
    __m128 min_SSE,max_SSE;

    for(i=0; i<grid->ncx*grid->ncy; i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        nc2 = (grid->cxy_na[i]+3)>>(2+1);
        for(c2=sc2; c2<sc2+nc2; c2++)
        {
            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
        }
        if (((grid->cxy_na[i]+3)>>2) & 1)
        {
            /* Copy the last bb for odd bb count in this column */
            for(j=0; j<NNBSBB_C; j++)
            {
                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
            }
        }
    }
}

#endif


/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE *fp,
                                 const nbnxn_search_t nbs,
                                 const nbnxn_grid_t *grid)
{
    int  c,d;
    dvec ba;

    clear_dvec(ba);
    for(c=0; c<grid->nc; c++)
    {
        for(d=0; d<DIM; d++)
        {
            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
        }
    }
    dsvmul(1.0/grid->nc,ba,ba);

    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/grid->ncx,
            nbs->box[YY][YY]/grid->ncy,
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
            ba[XX],ba[YY],ba[ZZ],
            ba[XX]*grid->ncx/nbs->box[XX][XX],
            ba[YY]*grid->ncy/nbs->box[YY][YY],
            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
}

/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE *fp,
                                   const nbnxn_search_t nbs,
                                   const nbnxn_grid_t *grid)
{
    int  ns,c,s;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for(c=0; c<grid->nc; c++)
    {
#ifdef NBNXN_BBXXXX
        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
        {
            int cs_w,i,d;

            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
            for(i=0; i<STRIDE_8BB; i++)
            {
                for(d=0; d<DIM; d++)
                {
                    ba[d] +=
                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
                }
            }
        }
#else
        for(s=0; s<grid->nsubc[c]; s++)
        {
            int cs,d;

            cs = c*GPU_NSUBCELL + s;
            for(d=0; d<DIM; d++)
            {
                ba[d] +=
                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
                    grid->bb[cs*NNBSBB_B         +d];
            }
        }
#endif
        ns += grid->nsubc[c];
    }
    dsvmul(1.0/ns,ba,ba);

    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
            ba[XX],ba[YY],ba[ZZ],
            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
}

/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 * Also sets interaction flags.
 */
void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
                int a0,int a1,const int *atinfo,
                int *order,
                int *flags)
{
    int subc,s,a,n1,n2,a_lj_max,i,j;
    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    gmx_bool haveQ;

    *flags = 0;

    subc = 0;
    for(s=a0; s<a1; s+=na_c)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        n1 = 0;
        n2 = 0;
        haveQ = FALSE;
        a_lj_max = -1;
        for(a=s; a<min(s+na_c,a1); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2*n1 <= na_c)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= na_c)
                {
                    for(i=0; i<n1; i++)
                    {
                        order[a0+i] = sort1[i];
                    }
                    for(j=0; j<n2; j++)
                    {
                        order[a0+n1+j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}
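
/* Worked example (added for clarity, with hypothetical atom data): in a
 * cell of na_c = 4 atoms where only the first two have LJ parameters,
 * n1 = 2 and 2*n1 <= na_c, so the cell is flagged NBNXN_CI_HALF_LJ and
 * the kernel can skip LJ for the second half of the cell. If the LJ
 * atoms instead sat at positions 0 and 3, 2*(a_lj_max-s) >= na_c would
 * trigger a reordering first, moving the LJ atoms into the first half.
 */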

/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
void fill_cell(const nbnxn_search_t nbs,
               nbnxn_grid_t *grid,
               nbnxn_atomdata_t *nbat,
               int a0,int a1,
               const int *atinfo,
               rvec *x,
               int sx,int sy, int sz,
               float *bb_work)
{
    int    na,a;
    size_t offset;
    float  *bb_ptr;

    na = a1 - a0;

    if (grid->bSimple)
    {
        sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
    }

    /* Now we have sorted the atoms, set the cell indices */
    for(a=a0; a<a1; a++)
    {
        nbs->cell[nbs->a[a]] = a;
    }

    copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
                           nbat->XFormat,nbat->x,a0,
                           sx,sy,sz);

    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
        bb_ptr = grid->bb + offset;

#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
        if (2*grid->na_cj == grid->na_c)
        {
            calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
                                          grid->bbj+offset*2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
    }
#ifdef NBNXN_BBXXXX
    else if (!grid->bSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SSE calculations: xxxxyyyyzzzz...
         */
        bb_ptr =
            grid->bb +
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));

#ifdef NBNXN_SEARCH_SSE_SINGLE
        if (nbat->XFormat == nbatXYZQ)
        {
            calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
                                       bb_work,bb_ptr);
        }
        else
#endif
        {
            calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
                                   bb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    sx,sy,sz,
                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;

        calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
                          bb_ptr);

        if (gmx_debug_at)
        {
            int bbo;
            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    sx,sy,sz,
                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
        }
    }
}

/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                int dd_zone,
                                nbnxn_grid_t *grid,
                                int a0,int a1,
                                const int *atinfo,
                                rvec *x,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start,int cxy_end,
                                int *sort_work)
{
    int  cxy;
    int  cx,cy,cz,ncz,cfilled,c;
    int  na,ash,ind,a;
    int  na_c,ash_c;

    if (debug)
    {
        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0,cxy_start,cxy_end,a0,a1);
    }

    /* Sort the atoms within each x,y column in 3 dimensions */
    for(cxy=cxy_start; cxy<cxy_end; cxy++)
    {
        cx = cxy/grid->ncy;
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ,FALSE,
                   nbs->a+ash,na,x,
                   grid->c0[ZZ],
                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
                   ncz*grid->na_sc*SGSF,sort_work);

        /* Fill the ncz cells in this column */
        cfilled = grid->cxy_ind[cxy];
        for(cz=0; cz<ncz; cz++)
        {
            c  = grid->cxy_ind[cxy] + cz ;

            ash_c = ash + cz*grid->na_sc;
            na_c  = min(grid->na_sc,na-(ash_c-ash));

            fill_cell(nbs,grid,nbat,
                      ash_c,ash_c+na_c,atinfo,x,
                      grid->na_sc*cx + (dd_zone >> 2),
                      grid->na_sc*cy + (dd_zone & 3),
                      grid->na_sc*cz,
                      NULL);

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (na_c > 0)
            {
                cfilled = c;
            }
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
        }

        /* Set the unused atom indices to -1 */
        for(ind=na; ind<ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}

/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  int dd_zone,
                                  nbnxn_grid_t *grid,
                                  int a0,int a1,
                                  const int *atinfo,
                                  rvec *x,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start,int cxy_end,
                                  int *sort_work)
{
    int  cxy;
    int  cx,cy,cz=-1,c=-1,ncz;
    int  na,ash,na_c,ind,a;
    int  subdiv_z,sub_z,na_z,ash_z;
    int  subdiv_y,sub_y,na_y,ash_y;
    int  subdiv_x,sub_x,na_x,ash_x;

    /* cppcheck-suppress unassignedVariable */
    float bb_work_array[NNBSBB_B+3],*bb_work_align;

    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));

    if (debug)
    {
        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0,cxy_start,cxy_end,a0,a1);
    }

    subdiv_x = grid->na_c;
    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for(cxy=cxy_start; cxy<cxy_end; cxy++)
    {
        cx = cxy/grid->ncy;
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ,FALSE,
                   nbs->a+ash,na,x,
                   grid->c0[ZZ],
                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
                   ncz*grid->na_sc*SGSF,sort_work);

        /* This loop goes over the supercells and subcells along z at once */
        for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
        {
            ash_z = ash + sub_z*subdiv_z;
            na_z  = min(subdiv_z,na-(ash_z-ash));

            /* We have already sorted on z */

            if (sub_z % GPU_NSUBCELL_Z == 0)
            {
                cz = sub_z/GPU_NSUBCELL_Z;
                c  = grid->cxy_ind[cxy] + cz ;

                /* The number of atoms in this supercell */
                na_c = min(grid->na_sc,na-(ash_z-ash));

                grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
            }

#if GPU_NSUBCELL_Y > 1
            /* Sort the atoms along y */
            sort_atoms(YY,(sub_z & 1),
                       nbs->a+ash_z,na_z,x,
                       grid->c0[YY]+cy*grid->sy,grid->inv_sy,
                       subdiv_y*SGSF,sort_work);
#endif

            for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
            {
                ash_y = ash_z + sub_y*subdiv_y;
                na_y  = min(subdiv_y,na-(ash_y-ash));

#if GPU_NSUBCELL_X > 1
                /* Sort the atoms along x */
                sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
                           nbs->a+ash_y,na_y,x,
                           grid->c0[XX]+cx*grid->sx,grid->inv_sx,
                           subdiv_x*SGSF,sort_work);
#endif

                for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
                {
                    ash_x = ash_y + sub_x*subdiv_x;
                    na_x  = min(subdiv_x,na-(ash_x-ash));

                    fill_cell(nbs,grid,nbat,
                              ash_x,ash_x+na_x,atinfo,x,
                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
                              grid->na_c*sub_z,
                              bb_work_align);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for(ind=na; ind<ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}

/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int a0,int a1,
                                rvec *x,const int *move,
                                int thread,int nthread,
                                int *cell,
                                int *cxy_na)
{
    int  n0,n1,i;
    int  cx,cy;

    /* We add one extra cell for particles which moved during DD */
    for(i=0; i<grid->ncx*grid->ncy+1; i++)
    {
        cxy_na[i] = 0;
    }

    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
    for(i=n0; i<n1; i++)
    {
        if (move == NULL || move[i] >= 0)
        {
            /* We need to be careful with rounding,
             * particles might be a few bits outside the local box.
             * The int cast takes care of the lower bound,
             * we need to explicitly take care of the upper bound.
             */
            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            if (cx == grid->ncx)
            {
                cx = grid->ncx - 1;
            }
            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
            if (cy == grid->ncy)
            {
                cy = grid->ncy - 1;
            }
            /* For the moment, cell contains only the grid-local
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;

#ifdef DEBUG_NBNXN_GRIDDING
            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
            {
                gmx_fatal(FARGS,
                          "grid cell cx %d cy %d out of range (max %d %d)",
                          cx,cy,grid->ncx,grid->ncy);
            }
#endif
        }
        else
        {
            /* Put this moved particle after the end of the grid,
             * so we can process it later without using conditionals.
             */
            cell[i] = grid->ncx*grid->ncy;
        }

        cxy_na[cell[i]]++;
    }
}
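
/* Worked example (added for clarity): on a grid with ncx = 3, ncy = 2,
 * c0 = (0,0) and cell size sx = sy = 1, an atom at (2.4, 0.7) gets
 * cx = 2, cy = 0 and lands in column cell[i] = 2*2 + 0 = 4; a moved
 * particle (move[i] < 0) is counted in the extra column ncx*ncy = 6.
 */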

/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              int dd_zone,
                              nbnxn_grid_t *grid,
                              int a0,int a1,
                              const int *atinfo,
                              rvec *x,
                              const int *move,
                              nbnxn_atomdata_t *nbat)
{
    int  n0,n1,i;
    int  cx,cy,cxy,ncz_max,ncz;
    int  nthread,thread;
    int  *cxy_na,cxy_na_i;

    nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for(thread=0; thread<nthread; thread++)
    {
        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
                            nbs->cell,nbs->work[thread].cxy_na);
    }

    /* Make the cell index as a function of x and y */
    ncz_max = 0;
    ncz = 0;
    grid->cxy_ind[0] = 0;
    for(i=0; i<grid->ncx*grid->ncy+1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
         * that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        cxy_na_i = nbs->work[0].cxy_na[i];
        for(thread=1; thread<nthread; thread++)
        {
            cxy_na_i += nbs->work[thread].cxy_na[i];
        }
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    }
    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

    if (debug)
    {
        fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc,grid->na_c,grid->nc,
                grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
                ncz_max);
        if (gmx_debug_at)
        {
            i = 0;
            for(cy=0; cy<grid->ncy; cy++)
            {
                for(cx=0; cx<grid->ncx; cx++)
                {
                    fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
                    i++;
                }
                fprintf(debug,"\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
    {
        for(thread=0; thread<nbs->nthread_max; thread++)
        {
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
        }
    }

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for(i=a0; i<a1; i++)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        cxy = nbs->cell[i];
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
    }

    /* Set the cell indices for the moved particles */
    n0 = grid->nc*grid->na_sc;
    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
    for(i=n0; i<n1; i++)
    {
        nbs->cell[nbs->a[i]] = i;
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
    for(thread=0; thread<nbs->nthread_max; thread++)
    {
        if (grid->bSimple)
        {
            sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                nbs->work[thread].sort_work);
        }
        else
        {
            sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                  nbs->work[thread].sort_work);
        }
    }

#ifdef NBNXN_SEARCH_SSE
    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid,grid->bb);
    }
#endif

    if (!grid->bSimple)
    {
        grid->nsubc_tot = 0;
        for(i=0; i<grid->nc; i++)
        {
            grid->nsubc_tot += grid->nsubc[i];
        }
    }

    if (debug)
    {
        if (grid->bSimple)
        {
            print_bbsizes_simple(debug,nbs,grid);
        }
        else
        {
            fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug,nbs,grid);
        }
    }
}

static void init_grid_flags(nbnxn_cellblock_flags *flags,
                            const nbnxn_grid_t *grid)
{
    int cb;

    flags->ncb = (grid->nc + NBNXN_CELLBLOCK_SIZE - 1)/NBNXN_CELLBLOCK_SIZE;
    if (flags->ncb > flags->flag_nalloc)
    {
        flags->flag_nalloc = over_alloc_large(flags->ncb);
        srenew(flags->flag,flags->flag_nalloc);
    }
    for(cb=0; cb<flags->ncb; cb++)
    {
        flags->flag[cb] = 0;
    }

    flags->bUse = TRUE;
}

/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t nbs,
                       int ePBC,matrix box,
                       int dd_zone,
                       rvec corner0,rvec corner1,
                       int a0,int a1,
                       real atom_density,
                       const int *atinfo,
                       rvec *x,
                       int nmoved,int *move,
                       int nb_kernel_type,
                       nbnxn_atomdata_t *nbat)
{
    nbnxn_grid_t *grid;
    int n;
    int nc_max_grid,nc_max;

    grid = &nbs->grid[dd_zone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

    if (dd_zone == 0)
    {
        grid->cell0 = 0;
    }
    else
    {
        grid->cell0 =
            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
    }

    n = a1 - a0;

    if (dd_zone == 0)
    {
        nbs->ePBC = ePBC;
        copy_mat(box,nbs->box);

        if (atom_density >= 0)
        {
            grid->atom_density = atom_density;
        }
        else
        {
            grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
        }

        grid->cell0 = 0;

        nbs->natoms_local    = a1 - nmoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (dd_zone=0).
         */
        nbs->natoms_nonlocal = a1 - nmoved;
    }
    else
    {
        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
    }

    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
                                   nbs->grid[0].atom_density,
                                   nbat->XFormat);

    nc_max = grid->cell0 + nc_max_grid;

    if (a1 > nbs->cell_nalloc)
    {
        nbs->cell_nalloc = over_alloc_large(a1);
        srenew(nbs->cell,nbs->cell_nalloc);
    }

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
    {
        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
        srenew(nbs->a,nbs->a_nalloc);
    }

    if (nc_max*grid->na_sc > nbat->nalloc)
    {
        nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc);
    }

    calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);

    if (dd_zone == 0)
    {
        nbat->natoms_local = nbat->natoms;
    }

    init_grid_flags(&grid->cellblock_flags,grid);

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
}

/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
                                const gmx_domdec_zones_t *zones,
                                const int *atinfo,
                                rvec *x,
                                int nb_kernel_type,
                                nbnxn_atomdata_t *nbat)
{
    int  zone,d;
    rvec c0,c1;

    for(zone=1; zone<zones->n; zone++)
    {
        for(d=0; d<DIM; d++)
        {
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];
        }

        nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
                          zone,c0,c1,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
                          -1,
                          atinfo,
                          x,
                          0,NULL,
                          nb_kernel_type,
                          nbat);
    }
}
1722
1723 /* Add simple grid type information to the local super/sub grid */
1724 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1725                            nbnxn_atomdata_t *nbat)
1726 {
1727     nbnxn_grid_t *grid;
1728     float *bbcz,*bb;
1729     int ncd,sc;
1730
1731     grid = &nbs->grid[0];
1732
1733     if (grid->bSimple)
1734     {
        gmx_incons("nbnxn_grid_add_simple called with a simple grid");
1736     }
1737
1738     ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1739
1740     if (grid->nc*ncd > grid->nc_nalloc_simple)
1741     {
1742         grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1743         srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
1744         srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
1745         srenew(grid->flags_simple,grid->nc_nalloc_simple);
1746         if (nbat->XFormat)
1747         {
1748             sfree_aligned(grid->bbj);
1749             snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
1750         }
1751     }
1752
1753     bbcz = grid->bbcz_simple;
1754     bb   = grid->bb_simple;
1755
1756 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
1757     for(sc=0; sc<grid->nc; sc++)
1758     {
1759         int c,tx,na;
1760
1761         for(c=0; c<ncd; c++)
1762         {
1763             tx = sc*ncd + c;
1764
1765             na = NBNXN_CPU_CLUSTER_I_SIZE;
1766             while (na > 0 &&
1767                    nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1768             {
1769                 na--;
1770             }
1771
1772             if (na > 0)
1773             {
1774                 switch (nbat->XFormat)
1775                 {
1776                 case nbatX4:
1777                     /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1778                     calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
1779                                            bb+tx*NNBSBB_B);
1780                     break;
1781                 case nbatX8:
1782                     /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1783                     calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1784                                            bb+tx*NNBSBB_B);
1785                     break;
1786                 default:
1787                     calc_bounding_box(na,nbat->xstride,
1788                                       nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1789                                       bb+tx*NNBSBB_B);
1790                     break;
1791                 }
1792                 bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
1793                 bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
1794
1795                 /* No interaction optimization yet here */
1796                 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1797             }
1798             else
1799             {
1800                 grid->flags_simple[tx] = 0;
1801             }
1802         }
1803     }
1804
1805 #ifdef NBNXN_SEARCH_SSE
1806     if (grid->bSimple && nbat->XFormat == nbatX8)
1807     {
1808         combine_bounding_box_pairs(grid,grid->bb_simple);
1809     }
1810 #endif
1811 }
1812
1813 void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
1814 {
1815     *ncx = nbs->grid[0].ncx;
1816     *ncy = nbs->grid[0].ncy;
1817 }
1818
1819 void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
1820 {
1821     const nbnxn_grid_t *grid;
1822
1823     grid = &nbs->grid[0];
1824
1825     /* Return the atom order for the home cell (index 0) */
1826     *a  = nbs->a;
1827
1828     *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1829 }
1830
1831 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1832 {
1833     nbnxn_grid_t *grid;
1834     int ao,cx,cy,cxy,cz,j;
1835
1836     /* Set the atom order for the home cell (index 0) */
1837     grid = &nbs->grid[0];
1838
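    /* a and cell are built as inverse mappings: a[j] gives the
     * original atom index for grid slot j, cell[ao] the slot of atom ao.
     */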
1839     ao = 0;
1840     for(cx=0; cx<grid->ncx; cx++)
1841     {
1842         for(cy=0; cy<grid->ncy; cy++)
1843         {
1844             cxy = cx*grid->ncy + cy;
1845             j   = grid->cxy_ind[cxy]*grid->na_sc;
1846             for(cz=0; cz<grid->cxy_na[cxy]; cz++)
1847             {
1848                 nbs->a[j]     = ao;
1849                 nbs->cell[ao] = j;
1850                 ao++;
1851                 j++;
1852             }
1853         }
1854     }
1855 }
1856
1857 /* Determines the cell range along one dimension that
1858  * the bounding box b0 - b1 sees.
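 * The range is first estimated from the box bounds with the inverse
 * cell size invs, then widened while d2 plus the squared gap to the
 * next cell edge is still within the cut-off r2.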
1859  */
1860 static void get_cell_range(real b0,real b1,
1861                            int nc,real c0,real s,real invs,
1862                            real d2,real r2,int *cf,int *cl)
1863 {
1864     *cf = max((int)((b0 - c0)*invs),0);
1865
1866     while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
1867     {
1868         (*cf)--;
1869     }
1870
1871     *cl = min((int)((b1 - c0)*invs),nc-1);
1872     while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
1873     {
1874         (*cl)++;
1875     }
1876 }
1877
1878 /* Reference code calculating the distance^2 between two bounding boxes */
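/* For each dimension the gap is max(0,max(dl,dh)), with dl and dh the
 * signed distances between the facing box faces; summing the squared
 * gaps gives the squared distance between the closest points of the
 * two boxes, which is 0 when they overlap.
 */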
1879 static float box_dist2(float bx0,float bx1,float by0,
1880                        float by1,float bz0,float bz1,
1881                        const float *bb)
1882 {
1883     float d2;
1884     float dl,dh,dm,dm0;
1885
1886     d2 = 0;
1887
1888     dl  = bx0 - bb[BBU_X];
1889     dh  = bb[BBL_X] - bx1;
1890     dm  = max(dl,dh);
1891     dm0 = max(dm,0);
1892     d2 += dm0*dm0;
1893
1894     dl  = by0 - bb[BBU_Y];
1895     dh  = bb[BBL_Y] - by1;
1896     dm  = max(dl,dh);
1897     dm0 = max(dm,0);
1898     d2 += dm0*dm0;
1899
1900     dl  = bz0 - bb[BBU_Z];
1901     dh  = bb[BBL_Z] - bz1;
1902     dm  = max(dl,dh);
1903     dm0 = max(dm,0);
1904     d2 += dm0*dm0;
1905
1906     return d2;
1907 }
1908
1909 /* Plain C code calculating the distance^2 between two bounding boxes */
1910 static float subc_bb_dist2(int si,const float *bb_i_ci,
1911                            int csj,const float *bb_j_all)
1912 {
1913     const float *bb_i,*bb_j;
1914     float d2;
1915     float dl,dh,dm,dm0;
1916
1917     bb_i = bb_i_ci  +  si*NNBSBB_B;
1918     bb_j = bb_j_all + csj*NNBSBB_B;
1919
1920     d2 = 0;
1921
1922     dl  = bb_i[BBL_X] - bb_j[BBU_X];
1923     dh  = bb_j[BBL_X] - bb_i[BBU_X];
1924     dm  = max(dl,dh);
1925     dm0 = max(dm,0);
1926     d2 += dm0*dm0;
1927
1928     dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
1929     dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
1930     dm  = max(dl,dh);
1931     dm0 = max(dm,0);
1932     d2 += dm0*dm0;
1933
1934     dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
1935     dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
1936     dm  = max(dl,dh);
1937     dm0 = max(dm,0);
1938     d2 += dm0*dm0;
1939
1940     return d2;
1941 }
1942
1943 #ifdef NBNXN_SEARCH_SSE
1944
1945 /* SSE code for bb distance for bb format xyz0 */
1946 static float subc_bb_dist2_sse(int na_c,
                               int si,const float *bb_i_ci,
                               int csj,const float *bb_j_all)
1949 {
1950     const float *bb_i,*bb_j;
1951
1952     __m128 bb_i_SSE0,bb_i_SSE1;
1953     __m128 bb_j_SSE0,bb_j_SSE1;
1954     __m128 dl_SSE;
1955     __m128 dh_SSE;
1956     __m128 dm_SSE;
1957     __m128 dm0_SSE;
1958     __m128 d2_SSE;
1959 #ifndef GMX_X86_SSE4_1
1960     float d2_array[7],*d2_align;
1961
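    /* Compute a 16-byte aligned pointer into the 7-float buffer,
     * so the 4-float _mm_store_ps below does not fault.
     */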
1962     d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
1963 #else
1964     float d2;
1965 #endif
1966
1967     bb_i = bb_i_ci  +  si*NNBSBB_B;
1968     bb_j = bb_j_all + csj*NNBSBB_B;
1969
1970     bb_i_SSE0 = _mm_load_ps(bb_i);
1971     bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
1972     bb_j_SSE0 = _mm_load_ps(bb_j);
1973     bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
1974
1975     dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
1976     dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
1977
1978     dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
1979     dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
1980 #ifndef GMX_X86_SSE4_1
1981     d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
1982
1983     _mm_store_ps(d2_align,d2_SSE);
1984
1985     return d2_align[0] + d2_align[1] + d2_align[2];
1986 #else
1987     /* SSE4.1 dot product of components 0,1,2 */
1988     d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
1989
1990     _mm_store_ss(&d2,d2_SSE);
1991
1992     return d2;
1993 #endif
1994 }
1995
/* Calculate the squared bounding-box distances of bb_i[si,...,si+3] and store them in d2 */
1997 #define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
1998 {                                                \
1999     int    shi;                                  \
2000                                                  \
2001     __m128 dx_0,dy_0,dz_0;                       \
2002     __m128 dx_1,dy_1,dz_1;                       \
2003                                                  \
2004     __m128 mx,my,mz;                             \
2005     __m128 m0x,m0y,m0z;                          \
2006                                                  \
2007     __m128 d2x,d2y,d2z;                          \
2008     __m128 d2s,d2t;                              \
2009                                                  \
2010     shi = si*NNBSBB_D*DIM;                       \
2011                                                  \
2012     xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
2013     yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
2014     zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
2015     xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
2016     yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
2017     zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
2018                                                  \
2019     dx_0 = _mm_sub_ps(xi_l,xj_h);                \
2020     dy_0 = _mm_sub_ps(yi_l,yj_h);                \
2021     dz_0 = _mm_sub_ps(zi_l,zj_h);                \
2022                                                  \
2023     dx_1 = _mm_sub_ps(xj_l,xi_h);                \
2024     dy_1 = _mm_sub_ps(yj_l,yi_h);                \
2025     dz_1 = _mm_sub_ps(zj_l,zi_h);                \
2026                                                  \
2027     mx   = _mm_max_ps(dx_0,dx_1);                \
2028     my   = _mm_max_ps(dy_0,dy_1);                \
2029     mz   = _mm_max_ps(dz_0,dz_1);                \
2030                                                  \
2031     m0x  = _mm_max_ps(mx,zero);                  \
2032     m0y  = _mm_max_ps(my,zero);                  \
2033     m0z  = _mm_max_ps(mz,zero);                  \
2034                                                  \
2035     d2x  = _mm_mul_ps(m0x,m0x);                  \
2036     d2y  = _mm_mul_ps(m0y,m0y);                  \
2037     d2z  = _mm_mul_ps(m0z,m0z);                  \
2038                                                  \
2039     d2s  = _mm_add_ps(d2x,d2y);                  \
2040     d2t  = _mm_add_ps(d2s,d2z);                  \
2041                                                  \
2042     _mm_store_ps(d2+si,d2t);                     \
2043 }
2044
2045 /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
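/* In the xxxx format the lower x,y,z and then upper x,y,z corner
 * components of STRIDE_8BB boxes are stored in consecutive 4-float
 * groups, so each component set fills one SSE register.
 */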
2046 static void subc_bb_dist2_sse_xxxx(const float *bb_j,
2047                                    int nsi,const float *bb_i,
2048                                    float *d2)
2049 {
2050     __m128 xj_l,yj_l,zj_l;
2051     __m128 xj_h,yj_h,zj_h;
2052     __m128 xi_l,yi_l,zi_l;
2053     __m128 xi_h,yi_h,zi_h;
2054
2055     __m128 zero;
2056
2057     zero = _mm_setzero_ps();
2058
2059     xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
2060     yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
2061     zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
2062     xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
2063     yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
2064     zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
2065
    /* Here we "loop" over si with the values 0 and STRIDE_8BB.
     * As we know the number of iterations is 1 or 2, we unroll manually.
2068      */
2069     SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
2070     if (STRIDE_8BB < nsi)
2071     {
2072         SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
2073     }
2074 }
2075
2076 #endif /* NBNXN_SEARCH_SSE */
2077
2078 /* Plain C function which determines if any atom pair between two cells
2079  * is within distance sqrt(rl2).
2080  */
2081 static gmx_bool subc_in_range_x(int na_c,
2082                                 int si,const real *x_i,
2083                                 int csj,int stride,const real *x_j,
2084                                 real rl2)
2085 {
2086     int  i,j,i0,j0;
2087     real d2;
2088
2089     for(i=0; i<na_c; i++)
2090     {
2091         i0 = (si*na_c + i)*DIM;
2092         for(j=0; j<na_c; j++)
2093         {
2094             j0 = (csj*na_c + j)*stride;
2095
2096             d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
2097                  sqr(x_i[i0+1] - x_j[j0+1]) +
2098                  sqr(x_i[i0+2] - x_j[j0+2]);
2099
2100             if (d2 < rl2)
2101             {
2102                 return TRUE;
2103             }
2104         }
2105     }
2106
2107     return FALSE;
2108 }
2109
2110 /* SSE function which determines if any atom pair between two cells,
2111  * both with 8 atoms, is within distance sqrt(rl2).
2112  */
2113 static gmx_bool subc_in_range_sse8(int na_c,
2114                                    int si,const real *x_i,
2115                                    int csj,int stride,const real *x_j,
2116                                    real rl2)
2117 {
2118 #ifdef NBNXN_SEARCH_SSE_SINGLE
2119     __m128 ix_SSE0,iy_SSE0,iz_SSE0;
2120     __m128 ix_SSE1,iy_SSE1,iz_SSE1;
2121
2122     __m128 rc2_SSE;
2123
2124     int na_c_sse;
2125     int j0,j1;
2126
2127     rc2_SSE   = _mm_set1_ps(rl2);
2128
2129     na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
2130     ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
2131     iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
2132     iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
2133     ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
2134     iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
2135     iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
2136
2137     /* We loop from the outer to the inner particles to maximize
2138      * the chance that we find a pair in range quickly and return.
2139      */
2140     j0 = csj*na_c;
2141     j1 = j0 + na_c - 1;
2142     while (j0 < j1)
2143     {
2144         __m128 jx0_SSE,jy0_SSE,jz0_SSE;
2145         __m128 jx1_SSE,jy1_SSE,jz1_SSE;
2146
2147         __m128 dx_SSE0,dy_SSE0,dz_SSE0;
2148         __m128 dx_SSE1,dy_SSE1,dz_SSE1;
2149         __m128 dx_SSE2,dy_SSE2,dz_SSE2;
2150         __m128 dx_SSE3,dy_SSE3,dz_SSE3;
2151
2152         __m128 rsq_SSE0;
2153         __m128 rsq_SSE1;
2154         __m128 rsq_SSE2;
2155         __m128 rsq_SSE3;
2156
2157         __m128 wco_SSE0;
2158         __m128 wco_SSE1;
2159         __m128 wco_SSE2;
2160         __m128 wco_SSE3;
2161         __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
2162
2163         jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
2164         jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
2165         jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
2166
2167         jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
2168         jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
2169         jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
2170
2171         /* Calculate distance */
2172         dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
2173         dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
2174         dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
2175         dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
2176         dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
2177         dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
2178         dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
2179         dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
2180         dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
2181         dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
2182         dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
2183         dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
2184
2185         /* rsq = dx*dx+dy*dy+dz*dz */
2186         rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2187         rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2188         rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2189         rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2190
2191         wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
2192         wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
2193         wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
2194         wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
2195
2196         wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
2197         wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
2198         wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
2199
2200         if (_mm_movemask_ps(wco_any_SSE))
2201         {
2202             return TRUE;
2203         }
2204
2205         j0++;
2206         j1--;
2207     }
2208     return FALSE;
2209
2210 #else
2211     /* No SSE */
2212     gmx_incons("SSE function called without SSE support");
2213
2214     return TRUE;
2215 #endif
2216 }
2217
2218 /* Returns the j sub-cell for index cj_ind */
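/* Four j sub-cells are packed per nbnxn_cj4_t, hence the >>2 and & 3 */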
2219 static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
2220 {
2221     return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
2222 }
2223
2224 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2225 static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
2226 {
2227     return nbl->cj4[cj_ind>>2].imei[0].imask;
2228 }
2229
/* Ensures there is enough space for extra exclusion masks */
2231 static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
2232 {
2233     if (nbl->nexcl+extra > nbl->excl_nalloc)
2234     {
2235         nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
2236         nbnxn_realloc_void((void **)&nbl->excl,
2237                            nbl->nexcl*sizeof(*nbl->excl),
2238                            nbl->excl_nalloc*sizeof(*nbl->excl),
2239                            nbl->alloc,nbl->free);
2240     }
2241 }
2242
2243 /* Ensures there is enough space for ncell extra j-cells in the list */
2244 static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
2245                                             int ncell)
2246 {
2247     int cj_max;
2248
2249     cj_max = nbl->ncj + ncell;
2250
2251     if (cj_max > nbl->cj_nalloc)
2252     {
2253         nbl->cj_nalloc = over_alloc_small(cj_max);
2254         nbnxn_realloc_void((void **)&nbl->cj,
2255                            nbl->ncj*sizeof(*nbl->cj),
2256                            nbl->cj_nalloc*sizeof(*nbl->cj),
2257                            nbl->alloc,nbl->free);
2258     }
2259 }
2260
/* Ensures there is enough space for nsupercell*GPU_NSUBCELL extra j-subcells in the list */
2262 static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
2263                                               int nsupercell)
2264 {
    int ncj4_max,j4,w;
2266
2267 #define NWARP       2
2268 #define WARP_SIZE  32
2269
    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists.
     * We can store 4 j-subcell - i-supercell pairs in one struct;
     * since the >>2 rounds down, we add 4-1 to never be one entry short.
     */
2274     ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
2275
2276     if (ncj4_max > nbl->cj4_nalloc)
2277     {
2278         nbl->cj4_nalloc = over_alloc_small(ncj4_max);
2279         nbnxn_realloc_void((void **)&nbl->cj4,
2280                            nbl->work->cj4_init*sizeof(*nbl->cj4),
2281                            nbl->cj4_nalloc*sizeof(*nbl->cj4),
2282                            nbl->alloc,nbl->free);
2283     }
2284
2285     if (ncj4_max > nbl->work->cj4_init)
2286     {
2287         for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
2288         {
2289             /* No i-subcells and no excl's in the list initially */
2290             for(w=0; w<NWARP; w++)
2291             {
2292                 nbl->cj4[j4].imei[w].imask    = 0U;
2293                 nbl->cj4[j4].imei[w].excl_ind = 0;
2294
2295             }
2296         }
2297         nbl->work->cj4_init = ncj4_max;
2298     }
2299 }
2300
/* Set all excl masks for one GPU warp to no exclusions */
2302 static void set_no_excls(nbnxn_excl_t *excl)
2303 {
2304     int t;
2305
2306     for(t=0; t<WARP_SIZE; t++)
2307     {
2308         /* Turn all interaction bits on */
2309         excl->pair[t] = NBNXN_INT_MASK_ALL;
2310     }
2311 }
2312
2313 /* Initializes a single nbnxn_pairlist_t data structure */
2314 static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
2315                                 gmx_bool bSimple,
2316                                 nbnxn_alloc_t *alloc,
2317                                 nbnxn_free_t  *free)
2318 {
2319     if (alloc == NULL)
2320     {
2321         nbl->alloc = nbnxn_alloc_aligned;
2322     }
2323     else
2324     {
2325         nbl->alloc = alloc;
2326     }
2327     if (free == NULL)
2328     {
2329         nbl->free = nbnxn_free_aligned;
2330     }
2331     else
2332     {
2333         nbl->free = free;
2334     }
2335
2336     nbl->bSimple     = bSimple;
2337     nbl->na_sc       = 0;
2338     nbl->na_ci       = 0;
2339     nbl->na_cj       = 0;
2340     nbl->nci         = 0;
2341     nbl->ci          = NULL;
2342     nbl->ci_nalloc   = 0;
2343     nbl->ncj         = 0;
2344     nbl->cj          = NULL;
2345     nbl->cj_nalloc   = 0;
2346     nbl->ncj4        = 0;
    /* The cj4 list is allocated when needed in check_subcell_list_space_supersub */
2348     nbl->cj4_nalloc  = 0;
2349     nbl->cj4         = NULL;
2350     nbl->nci_tot     = 0;
2351
2352     if (!nbl->bSimple)
2353     {
2354         nbl->excl        = NULL;
2355         nbl->excl_nalloc = 0;
2356         nbl->nexcl       = 0;
2357         check_excl_space(nbl,1);
2358         nbl->nexcl       = 1;
2359         set_no_excls(&nbl->excl[0]);
2360     }
2361
2362     snew(nbl->work,1);
2363 #ifdef NBNXN_BBXXXX
2364     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
2365 #else
2366     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
2367 #endif
2368     snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
2369 #ifdef NBNXN_SEARCH_SSE
2370     snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
2371 #ifdef GMX_X86_AVX_256
2372     snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
2373 #endif
2374 #endif
2375     snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
2376 }
2377
2378 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
2379                              gmx_bool bSimple, gmx_bool bCombined,
2380                              nbnxn_alloc_t *alloc,
2381                              nbnxn_free_t  *free)
2382 {
2383     int i;
2384
2385     nbl_list->bSimple   = bSimple;
2386     nbl_list->bCombined = bCombined;
2387
2388     nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
2389
2390     if (!nbl_list->bCombined &&
2391         nbl_list->nnbl > NBNXN_CELLBLOCK_MAX_THREADS)
2392     {
2393         gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
2394                   nbl_list->nnbl,NBNXN_CELLBLOCK_MAX_THREADS,NBNXN_CELLBLOCK_MAX_THREADS);
2395     }
2396
2397     snew(nbl_list->nbl,nbl_list->nnbl);
2398     /* Execute in order to avoid memory interleaving between threads */
2399 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2400     for(i=0; i<nbl_list->nnbl; i++)
2401     {
2402         /* Allocate the nblist data structure locally on each thread
2403          * to optimize memory access for NUMA architectures.
2404          */
2405         snew(nbl_list->nbl[i],1);
2406
2407         /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2408         if (i == 0)
2409         {
2410             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
2411         }
2412         else
2413         {
2414             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
2415         }
2416     }
2417 }
2418
2419 /* Print statistics of a pair list, used for debug output */
2420 static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
2421                                            const nbnxn_search_t nbs,real rl)
2422 {
2423     const nbnxn_grid_t *grid;
2424     int cs[SHIFTS];
2425     int s,i,j;
2426     int npexcl;
2427
2428     /* This code only produces correct statistics with domain decomposition */
2429     grid = &nbs->grid[0];
2430
2431     fprintf(fp,"nbl nci %d ncj %d\n",
2432             nbl->nci,nbl->ncj);
2433     fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2434             nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
2435             nbl->ncj/(double)grid->nc*grid->na_sc,
2436             nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
2437
2438     fprintf(fp,"nbl average j cell list length %.1f\n",
2439             0.25*nbl->ncj/(double)nbl->nci);
2440
2441     for(s=0; s<SHIFTS; s++)
2442     {
2443         cs[s] = 0;
2444     }
2445     npexcl = 0;
2446     for(i=0; i<nbl->nci; i++)
2447     {
2448         cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
2449             nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
2450
2451         j = nbl->ci[i].cj_ind_start;
2452         while (j < nbl->ci[i].cj_ind_end &&
2453                nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
2454         {
2455             npexcl++;
2456             j++;
2457         }
2458     }
2459     fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
2460             nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
2461     for(s=0; s<SHIFTS; s++)
2462     {
2463         if (cs[s] > 0)
2464         {
2465             fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
2466         }
2467     }
2468 }
2469
/* Print statistics of a pair list, used for debug output */
2471 static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
2472                                              const nbnxn_search_t nbs,real rl)
2473 {
2474     const nbnxn_grid_t *grid;
2475     int i,j4,j,si,b;
2476     int c[GPU_NSUBCELL+1];
2477
2478     /* This code only produces correct statistics with domain decomposition */
2479     grid = &nbs->grid[0];
2480
2481     fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2482             nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
2483     fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2484             nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
2485             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
2486             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
2487
2488     fprintf(fp,"nbl average j super cell list length %.1f\n",
2489             0.25*nbl->ncj4/(double)nbl->nsci);
2490     fprintf(fp,"nbl average i sub cell list length %.1f\n",
2491             nbl->nci_tot/(0.25*nbl->ncj4));
2492
2493     for(si=0; si<=GPU_NSUBCELL; si++)
2494     {
2495         c[si] = 0;
2496     }
2497     for(i=0; i<nbl->nsci; i++)
2498     {
2499         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2500         {
2501             for(j=0; j<4; j++)
2502             {
2503                 b = 0;
2504                 for(si=0; si<GPU_NSUBCELL; si++)
2505                 {
2506                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
2507                     {
2508                         b++;
2509                     }
2510                 }
2511                 c[b]++;
2512             }
2513         }
2514     }
2515     for(b=0; b<=GPU_NSUBCELL; b++)
2516     {
2517         fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
2518                 b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
2519     }
2520 }
2521
/* Print the number of pairs per i super-cell entry of a pair list, used for debug output */
2523 static void print_supersub_nsp(const char *fn,
2524                                const nbnxn_pairlist_t *nbl,
2525                                int iloc)
2526 {
2527     char buf[STRLEN];
2528     FILE *fp;
2529     int i,nsp,j4,p;
2530
2531     sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
2532     fp = ffopen(buf,"w");
2533
    for(i=0; i<nbl->nsci; i++)
2535     {
2536         nsp = 0;
2537         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2538         {
2539             for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
2540             {
2541                 nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
2542             }
2543         }
2544         fprintf(fp,"%4d %3d %3d\n",
2545                 i,
2546                 nsp,
2547                 nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
2548     }
2549
2550     fclose(fp);
2551 }
2552
2553 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
2554 static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
2555                                    int warp,nbnxn_excl_t **excl)
2556 {
2557     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2558     {
2559         /* No exclusions set, make a new list entry */
2560         nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
2561         nbl->nexcl++;
2562         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2563         set_no_excls(*excl);
2564     }
2565     else
2566     {
2567         /* We already have some exclusions, new ones can be added to the list */
2568         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2569     }
2570 }
2571
2572 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
2573  * allocates extra memory, if necessary.
2574  */
2575 static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
2576                                  int warp,nbnxn_excl_t **excl)
2577 {
2578     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2579     {
2580         /* We need to make a new list entry, check if we have space */
2581         check_excl_space(nbl,1);
2582     }
2583     low_get_nbl_exclusions(nbl,cj4,warp,excl);
2584 }
2585
2586 /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
2587  * allocates extra memory, if necessary.
2588  */
2589 static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
2590                                  nbnxn_excl_t **excl_w0,
2591                                  nbnxn_excl_t **excl_w1)
2592 {
2593     /* Check for space we might need */
2594     check_excl_space(nbl,2);
2595
2596     low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
2597     low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
2598 }
2599
2600 /* Sets the self exclusions i=j and pair exclusions i>j */
2601 static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
2602                                                int cj4_ind,int sj_offset,
2603                                                int si)
2604 {
2605     nbnxn_excl_t *excl[2];
2606     int  ei,ej,w;
2607
    /* Here we only set the self and double pair exclusions */
2609
2610     get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
2611
    /* Clear the bits for all pairs with ei >= ej, so only i < j interactions remain */
2613     for(ej=0; ej<nbl->na_ci; ej++)
2614     {
2615         w = (ej>>2);
2616         for(ei=ej; ei<nbl->na_ci; ei++)
2617         {
2618             excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
2619                 ~(1U << (sj_offset*GPU_NSUBCELL+si));
2620         }
2621     }
2622 }
2623
2624 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
2625 static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
2626 {
2627     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2628 }
2629
2630 #ifdef NBNXN_SEARCH_SSE
2631 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
2632 static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
2633 {
2634 #ifndef GMX_DOUBLE /* cj-size = 4 */
2635     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2636 #else              /* cj-size = 2 */
2637     return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
2638             (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
2639              NBNXN_INT_MASK_ALL));
2640 #endif
2641 }
2642
2643 #ifdef GMX_X86_AVX_256
2644 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
2645 static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
2646 {
2647 #ifndef GMX_DOUBLE /* cj-size = 8 */
2648     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
2649             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
2650              NBNXN_INT_MASK_ALL));
2651 #else              /* cj-size = 2 */
2652     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2653 #endif
2654 }
2655 #endif
2656 #endif /* NBNXN_SEARCH_SSE */
2657
2658 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2659  * Checks bounding box distances and possibly atom pair distances.
2660  */
2661 static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2662                                      nbnxn_pairlist_t *nbl,
2663                                      int ci,int cjf,int cjl,
2664                                      gmx_bool remove_sub_diag,
2665                                      const real *x_j,
2666                                      real rl2,float rbb2,
2667                                      int *ndistc)
2668 {
2669     const nbnxn_list_work_t *work;
2670
2671     const float *bb_ci;
2672     const real  *x_ci;
2673
2674     gmx_bool   InRange;
2675     real       d2;
2676     int        cjf_gl,cjl_gl,cj;
2677
2678     work = nbl->work;
2679
2680     bb_ci = nbl->work->bb_ci;
2681     x_ci  = nbl->work->x_ci;
2682
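    /* Scan cjf forward to the first j-cluster in range; a second loop
     * below scans cjl backward, so only clusters in [cjf,cjl] can be
     * within the cut-off.
     */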
2683     InRange = FALSE;
2684     while (!InRange && cjf <= cjl)
2685     {
2686         d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
2687         *ndistc += 2;
2688
2689         /* Check if the distance is within the distance where
2690          * we use only the bounding box distance rbb,
2691          * or within the cut-off and there is at least one atom pair
2692          * within the cut-off.
2693          */
2694         if (d2 < rbb2)
2695         {
2696             InRange = TRUE;
2697         }
2698         else if (d2 < rl2)
2699         {
2700             int i,j;
2701
2702             cjf_gl = gridj->cell0 + cjf;
2703             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2704             {
2705                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2706                 {
2707                     InRange = InRange ||
2708                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2709                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2710                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2711                 }
2712             }
2713             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2714         }
2715         if (!InRange)
2716         {
2717             cjf++;
2718         }
2719     }
2720     if (!InRange)
2721     {
2722         return;
2723     }
2724
2725     InRange = FALSE;
2726     while (!InRange && cjl > cjf)
2727     {
2728         d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
2729         *ndistc += 2;
2730
2731         /* Check if the distance is within the distance where
2732          * we use only the bounding box distance rbb,
2733          * or within the cut-off and there is at least one atom pair
2734          * within the cut-off.
2735          */
2736         if (d2 < rbb2)
2737         {
2738             InRange = TRUE;
2739         }
2740         else if (d2 < rl2)
2741         {
2742             int i,j;
2743
2744             cjl_gl = gridj->cell0 + cjl;
2745             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2746             {
2747                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2748                 {
2749                     InRange = InRange ||
2750                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2751                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2752                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2753                 }
2754             }
2755             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2756         }
2757         if (!InRange)
2758         {
2759             cjl--;
2760         }
2761     }
2762
2763     if (cjf <= cjl)
2764     {
2765         for(cj=cjf; cj<=cjl; cj++)
2766         {
2767             /* Store cj and the interaction mask */
2768             nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
2769             nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
2770             nbl->ncj++;
2771         }
2772         /* Increase the closing index in i super-cell list */
2773         nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
2774     }
2775 }
2776
2777 #ifdef NBNXN_SEARCH_SSE
2778 /* Include make_cluster_list_x86_simd128/256 */
2779 #define GMX_MM128_HERE
2780 #include "gmx_x86_simd_macros.h"
2781 #define STRIDE_S  PACK_X4
2782 #include "nbnxn_search_x86_simd.h"
2783 #undef STRIDE_S
2784 #undef GMX_MM128_HERE
2785 #ifdef GMX_X86_AVX_256
2786 /* Include make_cluster_list_x86_simd128/256 */
2787 #define GMX_MM256_HERE
2788 #include "gmx_x86_simd_macros.h"
2789 #define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
2790 #include "nbnxn_search_x86_simd.h"
2791 #undef STRIDE_S
2792 #undef GMX_MM256_HERE
2793 #endif
2794 #endif
2795
2796 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
2797  * Checks bounding box distances and possibly atom pair distances.
2798  */
2799 static void make_cluster_list_supersub(const nbnxn_search_t nbs,
2800                                        const nbnxn_grid_t *gridi,
2801                                        const nbnxn_grid_t *gridj,
2802                                        nbnxn_pairlist_t *nbl,
2803                                        int sci,int scj,
2804                                        gmx_bool sci_equals_scj,
2805                                        int stride,const real *x,
2806                                        real rl2,float rbb2,
2807                                        int *ndistc)
2808 {
2809     int  na_c;
2810     int  npair;
2811     int  cjo,ci1,ci,cj,cj_gl;
2812     int  cj4_ind,cj_offset;
2813     unsigned imask;
2814     nbnxn_cj4_t *cj4;
2815     const float *bb_ci;
2816     const real *x_ci;
2817     float *d2l,d2;
2818     int  w;
2819 #define PRUNE_LIST_CPU_ONE
2820 #ifdef PRUNE_LIST_CPU_ONE
2821     int  ci_last=-1;
2822 #endif
2823
2824     d2l = nbl->work->d2;
2825
2826     bb_ci = nbl->work->bb_ci;
2827     x_ci  = nbl->work->x_ci;
2828
2829     na_c = gridj->na_c;
2830
2831     for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
2832     {
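        /* work->cj_ind counts j-subcells; each nbnxn_cj4_t packs 4 of
         * them, so cj4_ind is the struct index and cj_offset the slot.
         */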
2833         cj4_ind   = (nbl->work->cj_ind >> 2);
2834         cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
2835         cj4       = &nbl->cj4[cj4_ind];
2836
2837         cj = scj*GPU_NSUBCELL + cjo;
2838
2839         cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
2840
2841         /* Initialize this j-subcell i-subcell list */
2842         cj4->cj[cj_offset] = cj_gl;
2843         imask              = 0;
2844
2845         if (sci_equals_scj)
2846         {
2847             ci1 = cjo + 1;
2848         }
2849         else
2850         {
2851             ci1 = gridi->nsubc[sci];
2852         }
2853
2854 #ifdef NBNXN_BBXXXX
2855         /* Determine all ci1 bb distances in one call with SSE */
2856         subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
2857                                ci1,bb_ci,d2l);
2858         *ndistc += na_c*2;
2859 #endif
2860
2861         npair = 0;
2862         /* We use a fixed upper-bound instead of ci1 to help optimization */
2863         for(ci=0; ci<GPU_NSUBCELL; ci++)
2864         {
2865             if (ci == ci1)
2866             {
2867                 break;
2868             }
2869
2870 #ifndef NBNXN_BBXXXX
2871             /* Determine the bb distance between ci and cj */
2872             d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
2873             *ndistc += 2;
2874 #endif
2875             d2 = d2l[ci];
2876
2877 #ifdef PRUNE_LIST_CPU_ALL
2878             /* Check if the distance is within the distance where
2879              * we use only the bounding box distance rbb,
2880              * or within the cut-off and there is at least one atom pair
2881              * within the cut-off. This check is very costly.
2882              */
2883             *ndistc += na_c*na_c;
2884             if (d2 < rbb2 ||
2885                 (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
2886 #else
2887             /* Check if the distance between the two bounding boxes
             * is within the pair-list cut-off.
2889              */
2890             if (d2 < rl2)
2891 #endif
2892             {
2893                 /* Flag this i-subcell to be taken into account */
2894                 imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
2895
2896 #ifdef PRUNE_LIST_CPU_ONE
2897                 ci_last = ci;
2898 #endif
2899
2900                 npair++;
2901             }
2902         }
2903
2904 #ifdef PRUNE_LIST_CPU_ONE
2905         /* If we only found 1 pair, check if any atoms are actually
         * within the cut-off, so we can get rid of the entry if they are not.
2907          */
2908         if (npair == 1 && d2l[ci_last] >= rbb2)
2909         {
2910             /* Avoid using function pointers here, as it's slower */
2911             if (
2912 #ifdef NBNXN_8BB_SSE
2913                 !subc_in_range_sse8
2914 #else
2915                 !subc_in_range_x
2916 #endif
2917                                 (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
2918             {
2919                 imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
2920                 npair--;
2921             }
2922         }
2923 #endif
2924
2925         if (npair > 0)
2926         {
2927             /* We have a useful sj entry, close it now */
2928
            /* Set the exclusions for the ci == sj entry.
2930              * Here we don't bother to check if this entry is actually flagged,
2931              * as it will nearly always be in the list.
2932              */
2933             if (sci_equals_scj)
2934             {
2935                 set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
2936             }
2937
2938             /* Copy the cluster interaction mask to the list */
2939             for(w=0; w<NWARP; w++)
2940             {
2941                 cj4->imei[w].imask |= imask;
2942             }
2943
2944             nbl->work->cj_ind++;
2945
2946             /* Keep the count */
2947             nbl->nci_tot += npair;
2948
2949             /* Increase the closing index in i super-cell list */
2950             nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
2951         }
2952     }
2953 }
2954
2955 /* Set all atom-pair exclusions from the topology stored in excl
2956  * as masks in the pair-list for simple list i-entry nbl_ci
2957  */
2958 static void set_ci_top_excls(const nbnxn_search_t nbs,
2959                              nbnxn_pairlist_t *nbl,
2960                              gmx_bool diagRemoved,
2961                              int na_ci_2log,
2962                              int na_cj_2log,
2963                              const nbnxn_ci_t *nbl_ci,
2964                              const t_blocka *excl)
2965 {
2966     const int *cell;
2967     int ci;
2968     int cj_ind_first,cj_ind_last;
2969     int cj_first,cj_last;
2970     int ndirect;
2971     int i,ai,aj,si,eind,ge,se;
2972     int found,cj_ind_0,cj_ind_1,cj_ind_m;
2973     int cj_m;
2977     int inner_i,inner_e;
2978
2979     cell = nbs->cell;
2980
2981     if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
2982     {
2983         /* Empty list */
2984         return;
2985     }
2986
2987     ci = nbl_ci->ci;
2988
2989     cj_ind_first = nbl_ci->cj_ind_start;
2990     cj_ind_last  = nbl->ncj - 1;
2991
2992     cj_first = nbl->cj[cj_ind_first].cj;
2993     cj_last  = nbl->cj[cj_ind_last].cj;
2994
2995     /* Determine how many contiguous j-cells we have starting
2996      * from the first i-cell. This number can be used to directly
2997      * calculate j-cell indices for excluded atoms.
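 * For example, when the list starts with j-cells ci,ci+1,ci+2,ci+6,...
 * ndirect is 3 and those entries need no bisection search.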
2998      */
2999     ndirect = 0;
3000     if (na_ci_2log == na_cj_2log)
3001     {
3002         while (cj_ind_first + ndirect <= cj_ind_last &&
3003                nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
3004         {
3005             ndirect++;
3006         }
3007     }
3008 #ifdef NBNXN_SEARCH_SSE
3009     else
3010     {
3011         while (cj_ind_first + ndirect <= cj_ind_last &&
3012                nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
3013         {
3014             ndirect++;
3015         }
3016     }
3017 #endif
3018
3019     /* Loop over the atoms in the i super-cell */
3020     for(i=0; i<nbl->na_sc; i++)
3021     {
3022         ai = nbs->a[ci*nbl->na_sc+i];
3023         if (ai >= 0)
3024         {
3025             si  = (i>>na_ci_2log);
3026
3027             /* Loop over the topology-based exclusions for this i-atom */
3028             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3029             {
3030                 aj = excl->a[eind];
3031
3032                 if (aj == ai)
3033                 {
                    /* The self exclusions are already set, save some time */
3035                     continue;
3036                 }
3037
3038                 ge = cell[aj];
3039
3040                 /* Without shifts we only calculate interactions j>i
3041                  * for one-way pair-lists.
3042                  */
3043                 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3044                 {
3045                     continue;
3046                 }
3047
3048                 se = (ge >> na_cj_2log);
3049
3050                 /* Could the cluster se be in our list? */
3051                 if (se >= cj_first && se <= cj_last)
3052                 {
3053                     if (se < cj_first + ndirect)
3054                     {
3055                         /* We can calculate cj_ind directly from se */
3056                         found = cj_ind_first + se - cj_first;
3057                     }
3058                     else
3059                     {
3060                         /* Search for se using bisection */
3061                         found = -1;
3062                         cj_ind_0 = cj_ind_first + ndirect;
3063                         cj_ind_1 = cj_ind_last + 1;
3064                         while (found == -1 && cj_ind_0 < cj_ind_1)
3065                         {
3066                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3067
3068                             cj_m = nbl->cj[cj_ind_m].cj;
3069
3070                             if (se == cj_m)
3071                             {
3072                                 found = cj_ind_m;
3073                             }
3074                             else if (se < cj_m)
3075                             {
3076                                 cj_ind_1 = cj_ind_m;
3077                             }
3078                             else
3079                             {
3080                                 cj_ind_0 = cj_ind_m + 1;
3081                             }
3082                         }
3083                     }
3084
3085                     if (found >= 0)
3086                     {
3087                         inner_i = i  - (si << na_ci_2log);
3088                         inner_e = ge - (se << na_cj_2log);
3089
3090                         nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
3091                     }
3092                 }
3093             }
3094         }
3095     }
3096 }
3097
3098 /* Set all atom-pair exclusions from the topology stored in excl
3099  * as masks in the pair-list for i-super-cell entry nbl_sci
3100  */
3101 static void set_sci_top_excls(const nbnxn_search_t nbs,
3102                               nbnxn_pairlist_t *nbl,
3103                               gmx_bool diagRemoved,
3104                               int na_c_2log,
3105                               const nbnxn_sci_t *nbl_sci,
3106                               const t_blocka *excl)
3107 {
3108     const int *cell;
3109     int na_c;
3110     int sci;
3111     int cj_ind_first,cj_ind_last;
3112     int cj_first,cj_last;
3113     int ndirect;
3114     int i,ai,aj,si,eind,ge,se;
3115     int found,cj_ind_0,cj_ind_1,cj_ind_m;
3116     int cj_m;
3119     nbnxn_excl_t *nbl_excl;
3120     int inner_i,inner_e,w;
3121
3122     cell = nbs->cell;
3123
3124     na_c = nbl->na_ci;
3125
3126     if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3127     {
3128         /* Empty list */
3129         return;
3130     }
3131
3132     sci = nbl_sci->sci;
3133
3134     cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3135     cj_ind_last  = nbl->work->cj_ind - 1;
3136
3137     cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3138     cj_last  = nbl_cj(nbl,cj_ind_last);
3139
3140     /* Determine how many contiguous j-clusters we have starting
3141      * from the first i-cluster. This number can be used to directly
3142      * calculate j-cluster indices for excluded atoms.
3143      */
3144     ndirect = 0;
3145     while (cj_ind_first + ndirect <= cj_ind_last &&
3146            nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3147     {
3148         ndirect++;
3149     }
3150
3151     /* Loop over the atoms in the i super-cell */
3152     for(i=0; i<nbl->na_sc; i++)
3153     {
3154         ai = nbs->a[sci*nbl->na_sc+i];
3155         if (ai >= 0)
3156         {
3157             si  = (i>>na_c_2log);
3158
3159             /* Loop over the topology-based exclusions for this i-atom */
3160             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3161             {
3162                 aj = excl->a[eind];
3163
3164                 if (aj == ai)
3165                 {
                    /* The self exclusions are already set, save some time */
3167                     continue;
3168                 }
3169
3170                 ge = cell[aj];
3171
3172                 /* Without shifts we only calculate interactions j>i
3173                  * for one-way pair-lists.
3174                  */
3175                 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3176                 {
3177                     continue;
3178                 }
3179
3180                 se = ge>>na_c_2log;
3181                 /* Could the cluster se be in our list? */
3182                 if (se >= cj_first && se <= cj_last)
3183                 {
3184                     if (se < cj_first + ndirect)
3185                     {
3186                         /* We can calculate cj_ind directly from se */
3187                         found = cj_ind_first + se - cj_first;
3188                     }
3189                     else
3190                     {
3191                         /* Search for se using bisection */
3192                         found = -1;
3193                         cj_ind_0 = cj_ind_first + ndirect;
3194                         cj_ind_1 = cj_ind_last + 1;
3195                         while (found == -1 && cj_ind_0 < cj_ind_1)
3196                         {
3197                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3198
3199                             cj_m = nbl_cj(nbl,cj_ind_m);
3200
3201                             if (se == cj_m)
3202                             {
3203                                 found = cj_ind_m;
3204                             }
3205                             else if (se < cj_m)
3206                             {
3207                                 cj_ind_1 = cj_ind_m;
3208                             }
3209                             else
3210                             {
3211                                 cj_ind_0 = cj_ind_m + 1;
3212                             }
3213                         }
3214                     }
3215
3216                     if (found >= 0)
3217                     {
3218                         inner_i = i  - si*na_c;
3219                         inner_e = ge - se*na_c;
3220
3221 /* Macro for getting the index of atom a within a cluster */
3222 #define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3223 /* Macro for converting an atom number to a cluster number */
3224 #define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3225
3226                         if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
3227                         {
3228                             w       = (inner_e >> 2);
3229
3230                             get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
3231
3232                             nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
3233                                 ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
3234                         }
3235
3236 #undef AMODI
3237 #undef A2CI
3238                     }
3239                 }
3240             }
3241         }
3242     }
3243 }
3244
3245 /* Reallocate the simple ci list for at least n entries */
3246 static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
3247 {
3248     nbl->ci_nalloc = over_alloc_small(n);
3249     nbnxn_realloc_void((void **)&nbl->ci,
3250                        nbl->nci*sizeof(*nbl->ci),
3251                        nbl->ci_nalloc*sizeof(*nbl->ci),
3252                        nbl->alloc,nbl->free);
3253 }
3254
3255 /* Reallocate the super-cell sci list for at least n entries */
3256 static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
3257 {
3258     nbl->sci_nalloc = over_alloc_small(n);
3259     nbnxn_realloc_void((void **)&nbl->sci,
3260                        nbl->nsci*sizeof(*nbl->sci),
3261                        nbl->sci_nalloc*sizeof(*nbl->sci),
3262                        nbl->alloc,nbl->free);
3263 }
3264
3265 /* Make a new ci entry at index nbl->nci */
3266 static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
3267                          nbnxn_list_work_t *work)
3268 {
3269     if (nbl->nci + 1 > nbl->ci_nalloc)
3270     {
3271         nb_realloc_ci(nbl,nbl->nci+1);
3272     }
3273     nbl->ci[nbl->nci].ci            = ci;
3274     nbl->ci[nbl->nci].shift         = shift;
3275     /* Store the interaction flags along with the shift */
3276     nbl->ci[nbl->nci].shift        |= flags;
3277     nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
3278     nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
3279 }
3280
3281 /* Make a new sci entry at index nbl->nsci */
3282 static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
3283                           nbnxn_list_work_t *work)
3284 {
3285     if (nbl->nsci + 1 > nbl->sci_nalloc)
3286     {
3287         nb_realloc_sci(nbl,nbl->nsci+1);
3288     }
3289     nbl->sci[nbl->nsci].sci           = sci;
3290     nbl->sci[nbl->nsci].shift         = shift;
3291     nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3292     nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
3293 }
3294
3295 /* Sort the simple j-list cj on exclusions.
3296  * Entries with exclusions will all be sorted to the beginning of the list.
3297  */
3298 static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
3299                          nbnxn_list_work_t *work)
3300 {
3301     int jnew,j;
3302
3303     if (ncj > work->cj_nalloc)
3304     {
3305         work->cj_nalloc = over_alloc_large(ncj);
3306         srenew(work->cj,work->cj_nalloc);
3307     }
3308
3309     /* Make a list of the j-cells involving exclusions */
3310     jnew = 0;
3311     for(j=0; j<ncj; j++)
3312     {
3313         if (cj[j].excl != NBNXN_INT_MASK_ALL)
3314         {
3315             work->cj[jnew++] = cj[j];
3316         }
3317     }
3318     /* Only sort when there are exclusions and they are not already confined to the first entry */
3319     if (!((jnew == 0) ||
3320           (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
3321     {
3322         for(j=0; j<ncj; j++)
3323         {
3324             if (cj[j].excl == NBNXN_INT_MASK_ALL)
3325             {
3326                 work->cj[jnew++] = cj[j];
3327             }
3328         }
3329         for(j=0; j<ncj; j++)
3330         {
3331             cj[j] = work->cj[j];
3332         }
3333     }
3334 }
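/* Editorial illustration (not in the original source): the two passes above
 * form a stable partition. For excl flags [E0, all, E1, all], where "all" is
 * NBNXN_INT_MASK_ALL and the E* entries carry real exclusions, the result
 * is [E0, E1, all, all], preserving the relative order within each group.
 */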
3335
3336 /* Close this simple list i entry */
3337 static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3338 {
3339     int jlen;
3340
3341     /* All content of the new ci entry has already been filled correctly;
3342      * we only need to increase the count here (for non-empty lists).
3343      */
3344     jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3345     if (jlen > 0)
3346     {
3347         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
3348
3349         if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
3350         {
3351             nbl->work->ncj_hlj += jlen;
3352         }
3353         else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3354         {
3355             nbl->work->ncj_noq += jlen;
3356         }
3357
3358         nbl->nci++;
3359     }
3360 }
3361
3362 /* Split the sci entry for load balancing on the GPU.
3363  * As we only know the current count on our own thread,
3364  * we need to estimate the current total number of i-entries.
3365  * As the lists get concatenated later, this estimate depends
3366  * both on nthread and on our own thread index thread.
3367  */
3368 static void split_sci_entry(nbnxn_pairlist_t *nbl,
3369                             int nsp_max_av,gmx_bool progBal,int nc_bal,
3370                             int thread,int nthread)
3371 {
3372     int nsci_est;
3373     int nsp_max;
3374     int cj4_start,cj4_end,j4len,cj4;
3375     int sci;
3376     int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
3377     int p;
3378
3379     /* Estimate the total number of ci's of the nblist combined
3380      * over all threads, using the target number of ci's.
3381      */
3382     nsci_est = nc_bal*thread/nthread + nbl->nsci;
3383     if (progBal)
3384     {
3385         /* The first ci blocks should be larger, to avoid overhead.
3386          * The last ci blocks should be smaller, to improve load balancing.
3387          */
3388         nsp_max = max(1,
3389                       nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3390     }
3391     else
3392     {
3393         nsp_max = nsp_max_av;
3394     }
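    /* Editorial worked example (not in the original source): with
     * nsp_max_av = 40 and nc_bal = 100, the progressive formula above
     * gives nsp_max = 40*100*3/(2*(0+100)) = 60 for the first entry
     * (nsci_est = 1) and 40*100*3/(2*(100+100)) = 30 once nsci_est
     * reaches 101, i.e. the block size shrinks from 1.5x to 0.75x
     * the average target as the list fills up.
     */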
3395
3396     cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3397     cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
3398     j4len = cj4_end - cj4_start;
3399
3400     if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3401     {
3402         /* Remove the last ci entry and process the cj4's again */
3403         nbl->nsci -= 1;
3404
3405         sci        = nbl->nsci;
3406         cj4        = cj4_start;
3407         nsp        = 0;
3408         nsp_sci    = 0;
3409         nsp_cj4_e  = 0;
3410         nsp_cj4    = 0;
3411         while (cj4 < cj4_end)
3412         {
3413             nsp_cj4_p = nsp_cj4;
3414             nsp_cj4   = 0;
3415             for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3416             {
3417                 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3418             }
3419             nsp += nsp_cj4;
3420
3421             if (nsp > nsp_max && nsp > nsp_cj4)
3422             {
3423                 nbl->sci[sci].cj4_ind_end = cj4;
3424                 sci++;
3425                 nbl->nsci++;
3426                 if (nbl->nsci+1 > nbl->sci_nalloc)
3427                 {
3428                     nb_realloc_sci(nbl,nbl->nsci+1);
3429                 }
3430                 nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
3431                 nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
3432                 nbl->sci[sci].cj4_ind_start = cj4;
3433                 nsp_sci   = nsp - nsp_cj4;
3434                 nsp_cj4_e = nsp_cj4_p;
3435                 nsp       = nsp_cj4;
3436             }
3437
3438             cj4++;
3439         }
3440
3441         /* Put the remaining cj4's in a new ci entry */
3442         nbl->sci[sci].cj4_ind_end = cj4_end;
3443
3444         /* Possibly balance out the last two ci's
3445          * by moving the last cj4 of the second-to-last ci.
3446          */
3447         if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3448         {
3449             nbl->sci[sci-1].cj4_ind_end--;
3450             nbl->sci[sci].cj4_ind_start--;
3451         }
3452
3453         sci++;
3454         nbl->nsci++;
3455     }
3456 }
3457
3458 /* Close this super/sub list i entry */
3459 static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
3460                                     int nsp_max_av,
3461                                     gmx_bool progBal,int nc_bal,
3462                                     int thread,int nthread)
3463 {
3464     int j4len,tlen;
3465     int nb,b;
3466
3467     /* All content of the new ci entry has already been filled correctly;
3468      * we only need to increase the count here (for non-empty lists).
3469      */
3470     j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
3471     if (j4len > 0)
3472     {
3473         /* We can only have complete blocks of 4 j-entries in a list,
3474          * so round the count up before closing.
3475          */
3476         nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
3477         nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
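        /* Editorial example (not in the original source): with
         * work->cj_ind = 5 the rounding gives ncj4 = (5+3)>>2 = 2,
         * i.e. 8 j-slots of which the last 3 are padding.
         */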
3478
3479         nbl->nsci++;
3480
3481         if (nsp_max_av > 0)
3482         {
3483             split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
3484         }
3485     }
3486 }
3487
3488 /* Syncs the working array before adding another grid pair to the list */
3489 static void sync_work(nbnxn_pairlist_t *nbl)
3490 {
3491     if (!nbl->bSimple)
3492     {
3493         nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3494         nbl->work->cj4_init = nbl->ncj4;
3495     }
3496 }
3497
3498 /* Clears an nbnxn_pairlist_t data structure */
3499 static void clear_pairlist(nbnxn_pairlist_t *nbl)
3500 {
3501     nbl->nci           = 0;
3502     nbl->nsci          = 0;
3503     nbl->ncj           = 0;
3504     nbl->ncj4          = 0;
3505     nbl->nci_tot       = 0;
3506     nbl->nexcl         = 1;
3507
3508     nbl->work->ncj_noq = 0;
3509     nbl->work->ncj_hlj = 0;
3510 }
3511
3512 /* Sets a simple list i-cell bounding box, including PBC shift */
3513 static void set_icell_bb_simple(const float *bb,int ci,
3514                                 real shx,real shy,real shz,
3515                                 float *bb_ci)
3516 {
3517     int ia;
3518
3519     ia = ci*NNBSBB_B;
3520     bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
3521     bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
3522     bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
3523     bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
3524     bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
3525     bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
3526 }
3527
3528 /* Sets the super-cell and sub-cell bounding boxes, including the PBC shift */
3529 static void set_icell_bb_supersub(const float *bb,int ci,
3530                                   real shx,real shy,real shz,
3531                                   float *bb_ci)
3532 {
3533     int ia,m,i;
3534
3535 #ifdef NBNXN_BBXXXX
3536     ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
3537     for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
3538     {
3539         for(i=0; i<STRIDE_8BB; i++)
3540         {
3541             bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
3542             bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
3543             bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
3544             bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
3545             bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
3546             bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
3547         }
3548     }
3549 #else
3550     ia = ci*GPU_NSUBCELL*NNBSBB_B;
3551     for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
3552     {
3553         bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
3554         bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
3555         bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
3556         bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
3557         bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
3558         bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
3559     }
3560 #endif
3561 }
3562
3563 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
3564 static void icell_set_x_simple(int ci,
3565                                real shx,real shy,real shz,
3566                                int na_c,
3567                                int stride,const real *x,
3568                                nbnxn_list_work_t *work)
3569 {
3570     int  ia,i;
3571
3572     ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
3573
3574     for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
3575     {
3576         work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
3577         work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
3578         work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
3579     }
3580 }
3581
3582 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
3583 static void icell_set_x_supersub(int ci,
3584                                  real shx,real shy,real shz,
3585                                  int na_c,
3586                                  int stride,const real *x,
3587                                  nbnxn_list_work_t *work)
3588 {
3589     int  ia,i;
3590     real *x_ci;
3591
3592     x_ci = work->x_ci;
3593
3594     ia = ci*GPU_NSUBCELL*na_c;
3595     for(i=0; i<GPU_NSUBCELL*na_c; i++)
3596     {
3597         x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
3598         x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
3599         x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
3600     }
3601 }
3602
3603 #ifdef NBNXN_SEARCH_SSE
3604 /* Copies PBC shifted super-cell packed atom coordinates to working array */
3605 static void icell_set_x_supersub_sse8(int ci,
3606                                       real shx,real shy,real shz,
3607                                       int na_c,
3608                                       int stride,const real *x,
3609                                       nbnxn_list_work_t *work)
3610 {
3611     int  si,io,ia,i,j;
3612     real *x_ci;
3613
3614     x_ci = work->x_ci;
3615
3616     for(si=0; si<GPU_NSUBCELL; si++)
3617     {
3618         for(i=0; i<na_c; i+=STRIDE_8BB)
3619         {
3620             io = si*na_c + i;
3621             ia = ci*GPU_NSUBCELL*na_c + io;
3622             for(j=0; j<STRIDE_8BB; j++)
3623             {
3624                 x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
3625                 x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
3626                 x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
3627             }
3628         }
3629     }
3630 }
3631 #endif
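/* Editorial illustration (not in the original source): with STRIDE_8BB == 4
 * the packed layout written above groups the coordinates per 4 atoms as
 * x0 x1 x2 x3  y0 y1 y2 y3  z0 z1 z2 z3  x4 x5 x6 x7 ..., so one 4-float
 * SSE load fetches the same component for 4 consecutive atoms.
 */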
3632
3633 static real nbnxn_rlist_inc_nonloc_fac = 0.6;
3634
3635 /* Due to the cluster size, the effective pair-list cut-off is longer than
3636  * that of a simple atom pair-list. This function returns the extra distance.
3637  */
3638 real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
3639 {
3640     return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
3641 }
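/* Editorial worked example (not in the original source): for cluster_size = 4
 * and an atom density of ~100 atoms/nm^3 (roughly liquid water), the formula
 * above with nbnxn_rlist_inc_nonloc_fac = 0.6 gives
 * (0.5+0.6)*(3/4)^2*(4/100)^(1/3) ~= 0.21 nm of extra effective list range.
 */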
3642
3643 /* Estimates the interaction volume^2 for non-local interactions */
3644 static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
3645 {
3646     int  z,d;
3647     real cl,ca,za;
3648     real vold_est;
3649     real vol2_est_tot;
3650
3651     vol2_est_tot = 0;
3652
3653     /* Here we simply add up the volume^2 contributions of the 1, 2
3654      * or 3 non-home zones of a 1D decomposition. As these volumes
3655      * are not additive, this is an overestimate, but it is only
3656      * significant in the limit of small cells, where we anyhow need
3657      * to split the lists into parts as small as possible.
3658      */
3659
3660     for(z=0; z<zones->n; z++)
3661     {
3662         if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
3663         {
3664             cl = 0;
3665             ca = 1;
3666             za = 1;
3667             for(d=0; d<DIM; d++)
3668             {
3669                 if (zones->shift[z][d] == 0)
3670                 {
3671                     cl += 0.5*ls[d];
3672                     ca *= ls[d];
3673                     za *= zones->size[z].x1[d] - zones->size[z].x0[d];
3674                 }
3675             }
3676
3677             /* 4 octants of a sphere */
3678             vold_est  = 0.25*M_PI*r*r*r*r;
3679             /* 4 quarter pie slices on the edges */
3680             vold_est += 4*cl*M_PI/6.0*r*r*r;
3681             /* One rectangular volume on a face */
3682             vold_est += ca*0.5*r*r;
3683
3684             vol2_est_tot += vold_est*za;
3685         }
3686     }
3687
3688     return vol2_est_tot;
3689 }
3690
3691 /* Estimates the average size of a full j-list for super/sub setup */
3692 static int get_nsubpair_max(const nbnxn_search_t nbs,
3693                             int iloc,
3694                             real rlist,
3695                             int min_ci_balanced)
3696 {
3697     const nbnxn_grid_t *grid;
3698     rvec ls;
3699     real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
3700     int  nsubpair_max;
3701
3702     grid = &nbs->grid[0];
3703
3704     ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
3705     ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
3706     ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
3707
3708     /* The average squared length of the diagonal of a sub cell */
3709     xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
3710
3711     /* The formulas below are a heuristic estimate of the average nsj per si */
3712     r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
3713
3714     if (!nbs->DomDec || nbs->zones->n == 1)
3715     {
3716         nsp_est_nl = 0;
3717     }
3718     else
3719     {
3720         nsp_est_nl =
3721             sqr(grid->atom_density/grid->na_c)*
3722             nonlocal_vol2(nbs->zones,ls,r_eff_sup);
3723     }
3724
3725     if (LOCAL_I(iloc))
3726     {
3727         /* Sub-cell interacts with itself */
3728         vol_est  = ls[XX]*ls[YY]*ls[ZZ];
3729         /* 6/2 rectangular volume on the faces */
3730         vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
3731         /* 12/2 quarter pie slices on the edges */
3732         vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
3733         /* 4 octants of a sphere */
3734         vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
3735
3736         nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
3737
3738         /* Subtract the non-local pair count */
3739         nsp_est -= nsp_est_nl;
3740
3741         if (debug)
3742         {
3743             fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
3744                     nsp_est,nsp_est_nl);
3745         }
3746     }
3747     else
3748     {
3749         nsp_est = nsp_est_nl;
3750     }
3751
3752     if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
3753     {
3754         /* We don't need to worry */
3755         nsubpair_max = -1;
3756     }
3757     else
3758     {
3759         /* Thus the (average) maximum j-list size should be as follows */
3760         nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
3761
3762         /* Since the target value is a maximum (this avoids high outliers,
3763          * which lead to load imbalance), not an average, we get more lists
3764          * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
3765          * But more importantly, the optimal GPU performance moves
3766          * to a lower number of blocks for very small blocks.
3767          * To compensate we add the maximum pair count per cj4.
3768          */
3769         nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
3770     }
3771
3772     if (debug)
3773     {
3774         fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
3775                 nsp_est,nsubpair_max);
3776     }
3777
3778     return nsubpair_max;
3779 }
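/* Editorial worked example (not in the original source): with
 * nsp_est = 50000 and min_ci_balanced = 1000, the code above gives
 * nsubpair_max = 50 plus GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE; under the
 * usual sizes (8 sub-cells, i-clusters of 4) that is 50 + 32 = 82.
 */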
3780
3781 /* Debug list print function */
3782 static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3783 {
3784     int i,j;
3785
3786     for(i=0; i<nbl->nci; i++)
3787     {
3788         fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
3789                 nbl->ci[i].ci,nbl->ci[i].shift,
3790                 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
3791
3792         for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
3793         {
3794             fprintf(fp,"  cj %5d  imask %x\n",
3795                     nbl->cj[j].cj,
3796                     nbl->cj[j].excl);
3797         }
3798     }
3799 }
3800
3801 /* Debug list print function */
3802 static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3803 {
3804     int i,j4,j;
3805
3806     for(i=0; i<nbl->nsci; i++)
3807     {
3808         fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
3809                 nbl->sci[i].sci,nbl->sci[i].shift,
3810                 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
3811
3812         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
3813         {
3814             for(j=0; j<4; j++)
3815             {
3816                 fprintf(fp,"  sj %5d  imask %x\n",
3817                         nbl->cj4[j4].cj[j],
3818                         nbl->cj4[j4].imei[0].imask);
3819             }
3820         }
3821     }
3822 }
3823
3824 /* Combine the pair lists *nbl generated on multiple threads into nblc */
3825 static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
3826                             nbnxn_pairlist_t *nblc)
3827 {
3828     int nsci,ncj4,nexcl;
3829     int n,i;
3830
3831     if (nblc->bSimple)
3832     {
3833         gmx_incons("combine_nblists does not support simple lists");
3834     }
3835
3836     nsci  = nblc->nsci;
3837     ncj4  = nblc->ncj4;
3838     nexcl = nblc->nexcl;
3839     for(i=0; i<nnbl; i++)
3840     {
3841         nsci  += nbl[i]->nsci;
3842         ncj4  += nbl[i]->ncj4;
3843         nexcl += nbl[i]->nexcl;
3844     }
3845
3846     if (nsci > nblc->sci_nalloc)
3847     {
3848         nb_realloc_sci(nblc,nsci);
3849     }
3850     if (ncj4 > nblc->cj4_nalloc)
3851     {
3852         nblc->cj4_nalloc = over_alloc_small(ncj4);
3853         nbnxn_realloc_void((void **)&nblc->cj4,
3854                            nblc->ncj4*sizeof(*nblc->cj4),
3855                            nblc->cj4_nalloc*sizeof(*nblc->cj4),
3856                            nblc->alloc,nblc->free);
3857     }
3858     if (nexcl > nblc->excl_nalloc)
3859     {
3860         nblc->excl_nalloc = over_alloc_small(nexcl);
3861         nbnxn_realloc_void((void **)&nblc->excl,
3862                            nblc->nexcl*sizeof(*nblc->excl),
3863                            nblc->excl_nalloc*sizeof(*nblc->excl),
3864                            nblc->alloc,nblc->free);
3865     }
3866
3867     /* Each thread should copy its own data to the combined arrays,
3868      * as otherwise data will go back and forth between different caches.
3869      */
3870 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
3871     for(n=0; n<nnbl; n++)
3872     {
3873         int sci_offset;
3874         int cj4_offset;
3875         int ci_offset;
3876         int excl_offset;
3877         int i,j4;
3878         const nbnxn_pairlist_t *nbli;
3879
3880         /* Determine the offset in the combined data for our thread */
3881         sci_offset  = nblc->nsci;
3882         cj4_offset  = nblc->ncj4;
3883         ci_offset   = nblc->nci_tot;
3884         excl_offset = nblc->nexcl;
3885
3886         for(i=0; i<n; i++)
3887         {
3888             sci_offset  += nbl[i]->nsci;
3889             cj4_offset  += nbl[i]->ncj4;
3890             ci_offset   += nbl[i]->nci_tot;
3891             excl_offset += nbl[i]->nexcl;
3892         }
3893
3894         nbli = nbl[n];
3895
3896         for(i=0; i<nbli->nsci; i++)
3897         {
3898             nblc->sci[sci_offset+i]                = nbli->sci[i];
3899             nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
3900             nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
3901         }
3902
3903         for(j4=0; j4<nbli->ncj4; j4++)
3904         {
3905             nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
3906             nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
3907             nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
3908         }
3909
3910         for(j4=0; j4<nbli->nexcl; j4++)
3911         {
3912             nblc->excl[excl_offset+j4] = nbli->excl[j4];
3913         }
3914     }
3915
3916     for(n=0; n<nnbl; n++)
3917     {
3918         nblc->nsci    += nbl[n]->nsci;
3919         nblc->ncj4    += nbl[n]->ncj4;
3920         nblc->nci_tot += nbl[n]->nci_tot;
3921         nblc->nexcl   += nbl[n]->nexcl;
3922     }
3923 }
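/* Editorial illustration (not in the original source): the per-thread offsets
 * above are exclusive prefix sums over the preceding lists. E.g. with three
 * thread lists of nsci 3, 2 and 4, thread 2 copies its sci entries starting
 * at nblc->nsci + 3 + 2, so all threads can write concurrently without
 * overlapping.
 */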
3924
3925 /* Returns the next ci to be processed by our thread */
3926 static gmx_bool next_ci(const nbnxn_grid_t *grid,
3927                         int conv,
3928                         int nth,int ci_block,
3929                         int *ci_x,int *ci_y,
3930                         int *ci_b,int *ci)
3931 {
3932     (*ci_b)++;
3933     (*ci)++;
3934
3935     if (*ci_b == ci_block)
3936     {
3937         /* Jump to the next block assigned to this task */
3938         *ci   += (nth - 1)*ci_block;
3939         *ci_b  = 0;
3940     }
3941
3942     if (*ci >= grid->nc*conv)
3943     {
3944         return FALSE;
3945     }
3946
3947     while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
3948     {
3949         *ci_y += 1;
3950         if (*ci_y == grid->ncy)
3951         {
3952             *ci_x += 1;
3953             *ci_y  = 0;
3954         }
3955     }
3956
3957     return TRUE;
3958 }
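/* Editorial worked example (not in the original source): with nth = 3,
 * ci_block = 2 and conv = 1, thread 0 visits cells 0,1, 6,7, 12,13, ...
 * and thread 1 visits 2,3, 8,9, ...: each thread walks blocks of ci_block
 * cells, interleaved with the blocks of the other threads.
 */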
3959
3960 /* Returns the distance^2 for which we put cell pairs in the list
3961  * without checking atom pair distances. This is usually < rlist^2.
3962  */
3963 static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
3964                                         const nbnxn_grid_t *gridj,
3965                                         real rlist,
3966                                         gmx_bool simple)
3967 {
3968     /* If the distance between two sub-cell bounding boxes is less
3969      * than this distance, do not check the distance between
3970      * all particle pairs in the sub-cell, since then it is likely
3971      * that the box pair has atom pairs within the cut-off.
3972      * We use the nblist cut-off minus 0.5 times the average x/y diagonal
3973      * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
3974      * Using more than 0.5 gains at most 0.5%.
3975      * If forces are calculated more than twice, the performance gain
3976      * in the force calculation outweighs the cost of checking.
3977      * Note that with subcell lists, the atom-pair distance check
3978      * is only performed when only 1 out of 8 sub-cells is within range;
3979      * this is because the GPU is much faster than the CPU.
3980      */
3981     real bbx,bby;
3982     real rbb2;
3983
3984     bbx = 0.5*(gridi->sx + gridj->sx);
3985     bby = 0.5*(gridi->sy + gridj->sy);
3986     if (!simple)
3987     {
3988         bbx /= GPU_NSUBCELL_X;
3989         bby /= GPU_NSUBCELL_Y;
3990     }
3991
3992     rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
3993
3994 #ifndef GMX_DOUBLE
3995     return rbb2;
3996 #else
3997     return (float)((1+GMX_FLOAT_EPS)*rbb2);
3998 #endif
3999 }
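/* Editorial worked example (not in the original source): for rlist = 1.0 nm
 * and sub-cell sizes bbx = bby = 0.3 nm, rbb2 = (1.0 - 0.5*sqrt(0.18))^2
 * ~= 0.62 nm^2, so bounding-box pairs closer than ~0.79 nm are accepted
 * without checking the individual atom-pair distances.
 */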
4000
4001 static int get_ci_block_size(const nbnxn_grid_t *gridi,
4002                              gmx_bool bDomDec, int nth,
4003                              gmx_bool *bFBufferFlag)
4004 {
4005     const int ci_block_enum = 5;
4006     const int ci_block_denom = 11;
4007     const int ci_block_min_atoms = 16;
4008     int ci_block;
4009
4010     /* Here we decide how to distribute the blocks over the threads.
4011      * We use prime numbers to try to avoid the grid size becoming
4012      * a multiple of the number of threads, which would lead to some
4013      * threads getting "inner" pairs and others getting boundary pairs,
4014      * which in turn would lead to load imbalance between threads.
4015      * Set the block size as 5/11/ntask times the average number of cells
4016      * in a y,z slab. This should ensure a quite uniform distribution
4017      * of the grid parts over the different threads along all three grid
4018      * zone boundaries with 3D domain decomposition. At the same time
4019      * the blocks will not become too small.
4020      */
4021     ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
4022
4023     /* Ensure the blocks are not too small: avoids cache invalidation */
4024     if (ci_block*gridi->na_sc < ci_block_min_atoms)
4025     {
4026         ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
4027     }
4028
4029     /* Without domain decomposition,
4030      * or with fewer than 3 blocks per task, divide into nth blocks.
4031      */
4032     if (!bDomDec || ci_block*3*nth > gridi->nc)
4033     {
4034         ci_block = (gridi->nc + nth - 1)/nth;
4035         /* With non-interleaved blocks it makes sense to flag which
4036          * part of the force output thread buffer we access.
4037          * We use bit flags, so we have to check if it fits.
4038          */
4039         *bFBufferFlag = (nth > 1 && nth <= sizeof(unsigned int)*8);
4040     }
4041     else
4042     {
4043         *bFBufferFlag = FALSE;
4044     }
4045
4046     return ci_block;
4047 }
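/* Editorial worked example (not in the original source): for a grid with
 * nc = 1000 cells, ncx = 10 x-columns and nth = 4 threads,
 * ci_block = 1000*5/(11*10*4) = 11 cells per block, i.e. roughly 5/11 of
 * the 100-cell average y,z slab divided over the 4 threads.
 */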
4048
4049 /* Generates the part of pair-list nbl assigned to our thread */
4050 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
4051                                      const nbnxn_grid_t *gridi,
4052                                      const nbnxn_grid_t *gridj,
4053                                      nbnxn_search_work_t *work,
4054                                      const nbnxn_atomdata_t *nbat,
4055                                      const t_blocka *excl,
4056                                      real rlist,
4057                                      int nb_kernel_type,
4058                                      int ci_block,
4059                                      gmx_bool bFBufferFlag,
4060                                      int nsubpair_max,
4061                                      gmx_bool progBal,
4062                                      int min_ci_balanced,
4063                                      int th,int nth,
4064                                      nbnxn_pairlist_t *nbl)
4065 {
4066     int  na_cj_2log;
4067     matrix box;
4068     real rl2;
4069     float rbb2;
4070     int  d;
4071     int  ci_b,ci,ci_x,ci_y,ci_xy,cj;
4072     ivec shp;
4073     int  tx,ty,tz;
4074     int  shift;
4075     gmx_bool bMakeList;
4076     real shx,shy,shz;
4077     int  conv_i,cell0_i;
4078     const float *bb_i,*bbcz_i,*bbcz_j;
4079     const int *flags_i;
4080     real bx0,bx1,by0,by1,bz0,bz1;
4081     real bz1_frac;
4082     real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
4083     int  cxf,cxl,cyf,cyf_x,cyl;
4084     int  cx,cy;
4085     int  c0,c1,cs,cf,cl;
4086     int  ndistc;
4087     int  ncpcheck;
4088     int  gridj_flag_shift=0,cj_offset=0;
4089     unsigned *gridj_flag=NULL;
4090     int  ncj_old_i,ncj_old_j;
4091
4092     nbs_cycle_start(&work->cc[enbsCCsearch]);
4093
4094     if (gridj->bSimple != nbl->bSimple)
4095     {
4096         gmx_incons("Grid incompatible with pair-list");
4097     }
4098
4099     sync_work(nbl);
4100     nbl->na_sc = gridj->na_sc;
4101     nbl->na_ci = gridj->na_c;
4102     nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4103     na_cj_2log = get_2log(nbl->na_cj);
4104
4105     nbl->rlist  = rlist;
4106
4107     if (bFBufferFlag)
4108     {
4109         init_grid_flags(&work->gridi_flags,gridi);
4110         init_grid_flags(&work->gridj_flags,gridj);
4111
4112         /* To flag j-blocks for gridj, we need to convert j-clusters to flag blocks */
4113         gridj_flag_shift = 0;
4114         while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_CELLBLOCK_SIZE*nbl->na_ci)
4115         {
4116             gridj_flag_shift++;
4117         }
4118         /* We will subtract the cell offset, which is not a multiple of the block size */
4119         cj_offset = ci_to_cj(get_2log(nbl->na_cj),gridj->cell0);
4120
4121         gridj_flag = work->gridj_flags.flag;
4122     }
4123
4124     copy_mat(nbs->box,box);
4125
4126     rl2 = nbl->rlist*nbl->rlist;
4127
4128     rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
4129
4130     if (debug)
4131     {
4132         fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
4133     }
4134
4135     /* Set the shift range */
4136     for(d=0; d<DIM; d++)
4137     {
4138         /* Check if we need periodicity shifts: without PBC in a
4139          * dimension, or with domain decomposition along it, we don't.
4140          */
4141         if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
4142         {
4143             shp[d] = 0;
4144         }
4145         else
4146         {
4147             if (d == XX &&
4148                 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4149             {
4150                 shp[d] = 2;
4151             }
4152             else
4153             {
4154                 shp[d] = 1;
4155             }
4156         }
4157     }
4158
4159     if (nbl->bSimple && !gridi->bSimple)
4160     {
4161         conv_i  = gridi->na_sc/gridj->na_sc;
4162         bb_i    = gridi->bb_simple;
4163         bbcz_i  = gridi->bbcz_simple;
4164         flags_i = gridi->flags_simple;
4165     }
4166     else
4167     {
4168         conv_i  = 1;
4169         bb_i    = gridi->bb;
4170         bbcz_i  = gridi->bbcz;
4171         flags_i = gridi->flags;
4172     }
4173     cell0_i = gridi->cell0*conv_i;
4174
4175     bbcz_j = gridj->bbcz;
4176
4177     if (conv_i != 1)
4178     {
4179         /* A block size of the conversion factor minus one gives a large
4180          * repeat count combined with a small block size. This should
4181          * result in good load balancing for both small and large domains.
4182          */
4183         ci_block = conv_i - 1;
4184     }
4185     if (debug)
4186     {
4187         fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4188                 gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
4189     }
4190
4191     ndistc = 0;
4192     ncpcheck = 0;
4193
4194     /* Initialize ci_b and ci to 1 before where we want them to start,
4195      * as they will both be incremented in next_ci.
4196      */
4197     ci_b = -1;
4198     ci   = th*ci_block - 1;
4199     ci_x = 0;
4200     ci_y = 0;
4201     while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
4202     {
4203         if (nbl->bSimple && flags_i[ci] == 0)
4204         {
4205             continue;
4206         }
4207
4208         ncj_old_i = nbl->ncj;
4209
4210         d2cx = 0;
4211         if (gridj != gridi && shp[XX] == 0)
4212         {
4213             if (nbl->bSimple)
4214             {
4215                 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
4216             }
4217             else
4218             {
4219                 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4220             }
4221             if (bx1 < gridj->c0[XX])
4222             {
4223                 d2cx = sqr(gridj->c0[XX] - bx1);
4224
4225                 if (d2cx >= rl2)
4226                 {
4227                     continue;
4228                 }
4229             }
4230         }
4231
4232         ci_xy = ci_x*gridi->ncy + ci_y;
4233
4234         /* Loop over shift vectors in three dimensions */
4235         for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
4236         {
4237             shz = tz*box[ZZ][ZZ];
4238
4239             bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
4240             bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4241
4242             if (tz == 0)
4243             {
4244                 d2z = 0;
4245             }
4246             else if (tz < 0)
4247             {
4248                 d2z = sqr(bz1);
4249             }
4250             else
4251             {
4252                 d2z = sqr(bz0 - box[ZZ][ZZ]);
4253             }
4254
4255             d2z_cx = d2z + d2cx;
4256
4257             if (d2z_cx >= rl2)
4258             {
4259                 continue;
4260             }
4261
4262             bz1_frac =
4263                 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
4264             if (bz1_frac < 0)
4265             {
4266                 bz1_frac = 0;
4267             }
4268             /* The check with bz1_frac close to or larger than 1 comes later */
4269
4270             for (ty=-shp[YY]; ty<=shp[YY]; ty++)
4271             {
4272                 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
4273
4274                 if (nbl->bSimple)
4275                 {
4276                     by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
4277                     by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
4278                 }
4279                 else
4280                 {
4281                     by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
4282                     by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
4283                 }
4284
4285                 get_cell_range(by0,by1,
4286                                gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
4287                                d2z_cx,rl2,
4288                                &cyf,&cyl);
4289
4290                 if (cyf > cyl)
4291                 {
4292                     continue;
4293                 }
4294
4295                 d2z_cy = d2z;
4296                 if (by1 < gridj->c0[YY])
4297                 {
4298                     d2z_cy += sqr(gridj->c0[YY] - by1);
4299                 }
4300                 else if (by0 > gridj->c1[YY])
4301                 {
4302                     d2z_cy += sqr(by0 - gridj->c1[YY]);
4303                 }
4304
4305                 for (tx=-shp[XX]; tx<=shp[XX]; tx++)
4306                 {
4307                     shift = XYZ2IS(tx,ty,tz);
4308
4309 #ifdef NBNXN_SHIFT_BACKWARD
4310                     if (gridi == gridj && shift > CENTRAL)
4311                     {
4312                         continue;
4313                     }
4314 #endif
4315
4316                     shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
4317
4318                     if (nbl->bSimple)
4319                     {
4320                         bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
4321                         bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
4322                     }
4323                     else
4324                     {
4325                         bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
4326                         bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
4327                     }
4328
4329                     get_cell_range(bx0,bx1,
4330                                    gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
4331                                    d2z_cy,rl2,
4332                                    &cxf,&cxl);
4333
4334                     if (cxf > cxl)
4335                     {
4336                         continue;
4337                     }
4338
4339                     if (nbl->bSimple)
4340                     {
4341                         new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4342                                      nbl->work);
4343                     }
4344                     else
4345                     {
4346                         new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4347                                       nbl->work);
4348                     }
4349
4350 #ifndef NBNXN_SHIFT_BACKWARD
4351                     if (cxf < ci_x)
4352 #else
4353                     if (shift == CENTRAL && gridi == gridj &&
4354                         cxf < ci_x)
4355 #endif
4356                     {
4357                         /* Leave the pairs with i > j.
4358                          * x is the major index, so skip half of it.
4359                          */
4360                         cxf = ci_x;
4361                     }
4362
4363                     if (nbl->bSimple)
4364                     {
4365                         set_icell_bb_simple(bb_i,ci,shx,shy,shz,
4366                                             nbl->work->bb_ci);
4367                     }
4368                     else
4369                     {
4370                         set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
4371                                               nbl->work->bb_ci);
4372                     }
4373
4374                     nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
4375                                      gridi->na_c,nbat->xstride,nbat->x,
4376                                      nbl->work);
4377
4378                     for(cx=cxf; cx<=cxl; cx++)
4379                     {
4380                         d2zx = d2z;
4381                         if (gridj->c0[XX] + cx*gridj->sx > bx1)
4382                         {
4383                             d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
4384                         }
4385                         else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
4386                         {
4387                             d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
4388                         }
4389
4390 #ifndef NBNXN_SHIFT_BACKWARD
4391                         if (gridi == gridj &&
4392                             cx == 0 && cyf < ci_y)
4393 #else
4394                         if (gridi == gridj &&
4395                             cx == 0 && shift == CENTRAL && cyf < ci_y)
4396 #endif
4397                         {
4398                             /* Leave the pairs with i > j.
4399                              * Skip half of y when i and j have the same x.
4400                              */
4401                             cyf_x = ci_y;
4402                         }
4403                         else
4404                         {
4405                             cyf_x = cyf;
4406                         }
4407
4408                         for(cy=cyf_x; cy<=cyl; cy++)
4409                         {
4410                             c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
4411                             c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
4412 #ifdef NBNXN_SHIFT_BACKWARD
4413                             if (gridi == gridj &&
4414                                 shift == CENTRAL && c0 < ci)
4415                             {
4416                                 c0 = ci;
4417                             }
4418 #endif
4419
4420                             d2zxy = d2zx;
4421                             if (gridj->c0[YY] + cy*gridj->sy > by1)
4422                             {
4423                                 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
4424                             }
4425                             else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
4426                             {
4427                                 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
4428                             }
4429                             if (c1 > c0 && d2zxy < rl2)
4430                             {
4431                                 cs = c0 + (int)(bz1_frac*(c1 - c0));
4432                                 if (cs >= c1)
4433                                 {
4434                                     cs = c1 - 1;
4435                                 }
4436
4437                                 d2xy = d2zxy - d2z;
4438
4439                                 /* Find the lowest cell that can possibly
4440                                  * be within range.
4441                                  */
4442                                 cf = cs;
4443                                 while(cf > c0 &&
4444                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
4445                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
4446                                 {
4447                                     cf--;
4448                                 }
4449
4450                                 /* Find the highest cell that can possibly
4451                                  * be within range.
4452                                  */
4453                                 cl = cs;
4454                                 while(cl < c1-1 &&
4455                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
4456                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
4457                                 {
4458                                     cl++;
4459                                 }
4460
4461 #ifdef NBNXN_REFCODE
4462                                 {
4463                                     /* Simple reference code, for debugging,
4464                                      * overrides the more complex code above.
4465                                      */
4466                                     int k;
4467                                     cf = c1;
4468                                     cl = -1;
4469                                     for(k=c0; k<c1; k++)
4470                                     {
4471                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4472                                                       bb+k*NNBSBB_B) < rl2 &&
4473                                             k < cf)
4474                                         {
4475                                             cf = k;
4476                                         }
4477                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4478                                                       bb+k*NNBSBB_B) < rl2 &&
4479                                             k > cl)
4480                                         {
4481                                             cl = k;
4482                                         }
4483                                     }
4484                                 }
4485 #endif
4486
4487                                 if (gridi == gridj)
4488                                 {
4489                                     /* We want each atom/cell pair only once,
4490                                      * only use cj >= ci.
4491                                      */
4492 #ifndef NBNXN_SHIFT_BACKWARD
4493                                     cf = max(cf,ci);
4494 #else
4495                                     if (shift == CENTRAL)
4496                                     {
4497                                         cf = max(cf,ci);
4498                                     }
4499 #endif
4500                                 }
4501
4502                                 if (cf <= cl)
4503                                 {
4504                                     /* For f buffer flags with simple lists */
4505                                     ncj_old_j = nbl->ncj;
4506
4507                                     switch (nb_kernel_type)
4508                                     {
4509                                     case nbk4x4_PlainC:
4510                                         check_subcell_list_space_simple(nbl,cl-cf+1);
4511
4512                                         make_cluster_list_simple(gridj,
4513                                                                  nbl,ci,cf,cl,
4514                                                                  (gridi == gridj && shift == CENTRAL),
4515                                                                  nbat->x,
4516                                                                  rl2,rbb2,
4517                                                                  &ndistc);
4518                                         break;
4519 #ifdef NBNXN_SEARCH_SSE
4520                                     case nbk4xN_X86_SIMD128:
4521                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4522                                         make_cluster_list_x86_simd128(gridj,
4523                                                                       nbl,ci,cf,cl,
4524                                                                       (gridi == gridj && shift == CENTRAL),
4525                                                                       nbat->x,
4526                                                                       rl2,rbb2,
4527                                                                       &ndistc);
4528                                         break;
4529 #ifdef GMX_X86_AVX_256
4530                                     case nbk4xN_X86_SIMD256:
4531                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4532                                         make_cluster_list_x86_simd256(gridj,
4533                                                                       nbl,ci,cf,cl,
4534                                                                       (gridi == gridj && shift == CENTRAL),
4535                                                                       nbat->x,
4536                                                                       rl2,rbb2,
4537                                                                       &ndistc);
4538                                         break;
4539 #endif
4540 #endif
4541                                     case nbk8x8x8_PlainC:
4542                                     case nbk8x8x8_CUDA:
4543                                         check_subcell_list_space_supersub(nbl,cl-cf+1);
4544                                         for(cj=cf; cj<=cl; cj++)
4545                                         {
4546                                             make_cluster_list_supersub(nbs,gridi,gridj,
4547                                                                        nbl,ci,cj,
4548                                                                        (gridi == gridj && shift == CENTRAL && ci == cj),
4549                                                                        nbat->xstride,nbat->x,
4550                                                                        rl2,rbb2,
4551                                                                        &ndistc);
4552                                         }
4553                                         break;
4554                                     }
4555                                     ncpcheck += cl - cf + 1;
4556
4557                                     if (bFBufferFlag && nbl->ncj > ncj_old_j)
4558                                     {
4559                                         int cbf,cbl,cb;
4560
4561                                         cbf = (nbl->cj[ncj_old_j].cj - cj_offset) >> gridj_flag_shift;
4562                                         cbl = (nbl->cj[nbl->ncj-1].cj - cj_offset) >> gridj_flag_shift;
4563                                         for(cb=cbf; cb<=cbl; cb++)
4564                                         {
4565                                             gridj_flag[cb] = 1U<<th;
4566                                         }
4567                                     }
4568                                 }
4569                             }
4570                         }
4571                     }
4572
4573                     /* Set the exclusions for this ci list */
4574                     if (nbl->bSimple)
4575                     {
4576                         set_ci_top_excls(nbs,
4577                                          nbl,
4578                                          shift == CENTRAL && gridi == gridj,
4579                                          gridj->na_c_2log,
4580                                          na_cj_2log,
4581                                          &(nbl->ci[nbl->nci]),
4582                                          excl);
4583                     }
4584                     else
4585                     {
4586                         set_sci_top_excls(nbs,
4587                                           nbl,
4588                                           shift == CENTRAL && gridi == gridj,
4589                                           gridj->na_c_2log,
4590                                           &(nbl->sci[nbl->nsci]),
4591                                           excl);
4592                     }
4593
4594                     /* Close this ci list */
4595                     if (nbl->bSimple)
4596                     {
4597                         close_ci_entry_simple(nbl);
4598                     }
4599                     else
4600                     {
4601                         close_ci_entry_supersub(nbl,
4602                                                 nsubpair_max,
4603                                                 progBal,min_ci_balanced,
4604                                                 th,nth);
4605                     }
4606                 }
4607             }
4608         }
4609
4610         if (bFBufferFlag && nbl->ncj > ncj_old_i)
4611         {
4612             work->gridi_flags.flag[ci>>NBNXN_CELLBLOCK_SIZE_2LOG] = 1U<<th;
4613         }
4614     }
4615
4616     work->ndistc = ndistc;
4617
4618     nbs_cycle_stop(&work->cc[enbsCCsearch]);
4619
4620     if (debug)
4621     {
4622         fprintf(debug,"number of distance checks %d\n",ndistc);
4623         fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
4624                 ncpcheck);
4625
4626         if (nbl->bSimple)
4627         {
4628             print_nblist_statistics_simple(debug,nbl,nbs,rlist);
4629         }
4630         else
4631         {
4632             print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
4633         }
4634
4635     }
4636 }
4637
4638 static void reduce_cellblock_flags(const nbnxn_search_t nbs,
4639                                    int nnbl,
4640                                    const nbnxn_grid_t *gridi,
4641                                    const nbnxn_grid_t *gridj)
4642 {
4643     int nbl,cb;
4644     const unsigned *flag;
4645
4646     if (gridi->cellblock_flags.bUse)
4647     {
4648         for(nbl=0; nbl<nnbl; nbl++)
4649         {
4650             flag = nbs->work[nbl].gridi_flags.flag;
4651
4652             for(cb=0; cb<gridi->cellblock_flags.ncb; cb++)
4653             {
4654                 gridi->cellblock_flags.flag[cb] |= flag[cb];
4655             }
4656         }
4657     }
4658     if (gridj->cellblock_flags.bUse)
4659     {
4660         for(nbl=0; nbl<nnbl; nbl++)
4661         {
4662             flag = nbs->work[nbl].gridj_flags.flag;
4663
4664             for(cb=0; cb<gridj->cellblock_flags.ncb; cb++)
4665             {
4666                 gridj->cellblock_flags.flag[cb] |= flag[cb];
4667             }
4668         }
4669     }
4670 }
4671
4672 static void print_reduction_cost(const nbnxn_grid_t *grids,int ngrid,int nnbl)
4673 {
4674     int g,c0,c,cb,nbl;
4675     const nbnxn_grid_t *grid;
4676
4677     for(g=0; g<ngrid; g++)
4678     {
4679         grid = &grids[g];
4680
4681         c0 = 0;
4682         if (grid->cellblock_flags.bUse)
4683         {
4684             c  = 0;
4685             for(cb=0; cb<grid->cellblock_flags.ncb; cb++)
4686             {
4687                 for(nbl=0; nbl<nnbl; nbl++)
4688                 {
4689                     if (grid->cellblock_flags.flag[cb] == 1)
4690                     {
4691                         c0++;
4692                     }
4693                     else if (grid->cellblock_flags.flag[cb] & (1U<<nbl))
4694                     {
4695                         c++;
4696                     }
4697                 }
4698             }
4699         }
4700         else
4701         {
4702             c = nnbl*grid->cellblock_flags.ncb;
4703         }
4704         fprintf(debug,"nbnxn reduction buffers, grid %d: %d flag %d only buf. 0: %4.2f av. reduction: %4.2f\n",
4705                 g,nnbl,grid->cellblock_flags.bUse,
4706                 c0/(double)(grid->cellblock_flags.ncb),
4707                 c/(double)(grid->cellblock_flags.ncb));
4708     }
4709 }
4710
4711 /* Make a local or non-local pair-list, depending on iloc */
4712 void nbnxn_make_pairlist(const nbnxn_search_t nbs,
4713                          const nbnxn_atomdata_t *nbat,
4714                          const t_blocka *excl,
4715                          real rlist,
4716                          int min_ci_balanced,
4717                          nbnxn_pairlist_set_t *nbl_list,
4718                          int iloc,
4719                          int nb_kernel_type,
4720                          t_nrnb *nrnb)
4721 {
4722     nbnxn_grid_t *gridi,*gridj;
4723     int nzi,zi,zj0,zj1,zj;
4724     int nsubpair_max;
4725     int th;
4726     int nnbl;
4727     nbnxn_pairlist_t **nbl;
4728     int ci_block;
4729     gmx_bool CombineNBLists,bFBufferFlag;
4730     int np_tot,np_noq,np_hlj,nap;
4731
4732     nnbl            = nbl_list->nnbl;
4733     nbl             = nbl_list->nbl;
4734     CombineNBLists  = nbl_list->bCombined;
4735
4736     if (debug)
4737     {
4738         fprintf(debug,"ns making %d nblists\n", nnbl);
4739     }
4740
4741     if (nbl_list->bSimple)
4742     {
4743         switch (nb_kernel_type)
4744         {
4745 #ifdef NBNXN_SEARCH_SSE
4746         case nbk4xN_X86_SIMD128:
4747             nbs->icell_set_x = icell_set_x_x86_simd128;
4748             break;
4749 #ifdef GMX_X86_AVX_256
4750         case nbk4xN_X86_SIMD256:
4751             nbs->icell_set_x = icell_set_x_x86_simd256;
4752             break;
4753 #endif
4754 #endif
4755         default:
4756             nbs->icell_set_x = icell_set_x_simple;
4757             break;
4758         }
4759     }
4760     else
4761     {
4762 #ifdef NBNXN_SEARCH_SSE
4763         nbs->icell_set_x = icell_set_x_supersub_sse8;
4764 #else
4765         nbs->icell_set_x = icell_set_x_supersub;
4766 #endif
4767     }
4768
4769     if (LOCAL_I(iloc))
4770     {
4771         /* Only zone (grid) 0 vs 0 */
4772         nzi = 1;
4773         zj0 = 0;
4774         zj1 = 1;
4775     }
4776     else
4777     {
4778         nzi = nbs->zones->nizone;
4779     }
4780
4781     if (!nbl_list->bSimple && min_ci_balanced > 0)
4782     {
4783         nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
4784     }
4785     else
4786     {
4787         nsubpair_max = 0;
4788     }
4789
    /* Clear all pair-lists */
    for(th=0; th<nnbl; th++)
    {
        clear_pairlist(nbl[th]);
    }

    for(zi=0; zi<nzi; zi++)
    {
        gridi = &nbs->grid[zi];

        if (NONLOCAL_I(iloc))
        {
            zj0 = nbs->zones->izone[zi].j0;
            zj1 = nbs->zones->izone[zi].j1;
            if (zi == 0)
            {
                zj0++;
            }
        }
        for(zj=zj0; zj<zj1; zj++)
        {
            gridj = &nbs->grid[zj];

            if (debug)
            {
                fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
            }

            nbs_cycle_start(&nbs->cc[enbsCCsearch]);

            if (nbl[0]->bSimple && !gridi->bSimple)
            {
                /* Hybrid list, determine blocking later */
                ci_block = 0;
                bFBufferFlag = FALSE;
            }
            else
            {
                ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl,
                                             &bFBufferFlag);
                if (CombineNBLists)
                {
                    bFBufferFlag = FALSE;
                }
            }
            if (debug)
            {
                fprintf(debug,"grid %d %d F buffer flags %d\n",
                        zi,zj,bFBufferFlag);
            }

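            /* Build the pair-lists in parallel; each thread searches
             * its own share of the i-cells, in blocks of ci_block,
             * and stores the result in nbl[th].
             */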
#pragma omp parallel for num_threads(nnbl) schedule(static)
            for(th=0; th<nnbl; th++)
            {
                if (CombineNBLists && th > 0)
                {
                    clear_pairlist(nbl[th]);
                }

                /* Divide the i super cells equally over the nblists */
                nbnxn_make_pairlist_part(nbs,gridi,gridj,
                                         &nbs->work[th],nbat,excl,
                                         rlist,
                                         nb_kernel_type,
                                         ci_block,
                                         bFBufferFlag,
                                         nsubpair_max,
                                         (LOCAL_I(iloc) || nbs->zones->n <= 2),
                                         min_ci_balanced,
                                         th,nnbl,
                                         nbl[th]);
            }
            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);

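            /* Sum the thread-local distance-check counts for the flop
             * accounting and gather the cluster-pair counts.
             */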
            np_tot = 0;
            np_noq = 0;
            np_hlj = 0;
            for(th=0; th<nnbl; th++)
            {
                inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);

                if (nbl_list->bSimple)
                {
                    np_tot += nbl[th]->ncj;
                    np_noq += nbl[th]->work->ncj_noq;
                    np_hlj += nbl[th]->work->ncj_hlj;
                }
                else
                {
                    /* This count ignores potential subsequent pair pruning */
                    np_tot += nbl[th]->nci_tot;
                }
            }
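            /* Convert cluster-pair counts to atom-pair counts: nap is
             * the number of atom pairs per cluster pair. Pairs without
             * charge (ncj_noq) are LJ-only; in half-LJ pairs (ncj_hlj)
             * half of the atom pairs are Coulomb-only.
             */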
            nap = nbl[0]->na_ci*nbl[0]->na_cj;
            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
            nbl_list->natpair_lj  = np_noq*nap;
            nbl_list->natpair_q   = np_hlj*nap/2;

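            /* When requested, merge the thread-local lists 1..nnbl-1
             * into list 0, so a single list is passed on.
             */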
            if (CombineNBLists && nnbl > 1)
            {
                nbs_cycle_start(&nbs->cc[enbsCCcombine]);

                combine_nblists(nnbl-1,nbl+1,nbl[0]);

                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
            }

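            /* Reduce the per-cell-block force-buffer flags over the
             * lists, or mark them unused when buffer flags were not
             * collected for this grid pair.
             */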
            if (bFBufferFlag)
            {
                reduce_cellblock_flags(nbs,nnbl,gridi,gridj);
            }
            else
            {
                gridi->cellblock_flags.bUse = FALSE;
                gridj->cellblock_flags.bUse = FALSE;
            }
        }
    }

    /*
    print_supersub_nsp("nsubpair",nbl[0],iloc);
    */

    /* Special performance logging (enabled via env. var. GMX_NBNXN_CYCLE) */
    if (LOCAL_I(iloc))
    {
        nbs->search_count++;
    }
    if (nbs->print_cycles &&
        (!nbs->DomDec || !LOCAL_I(iloc)) &&
        nbs->search_count % 100 == 0)
    {
        nbs_cycle_print(stderr,nbs);
    }

    if (debug && CombineNBLists && nnbl > 1)
    {
        if (nbl[0]->bSimple)
        {
            print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
        }
        else
        {
            print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
        }
    }

    if (gmx_debug_at)
    {
        if (nbl[0]->bSimple)
        {
            print_nblist_ci_cj(debug,nbl[0]);
        }
        else
        {
            print_nblist_sci_cj(debug,nbl[0]);
        }

        print_reduction_cost(nbs->grid,nbs->ngrid,nnbl);
    }
}