Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_search.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  *          GROningen MAchine for Chemical Simulations
9  *
10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12  * Copyright (c) 2001-2012, The GROMACS development team,
13  * check out http://www.gromacs.org for more information.
14
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  *
20  * If you want to redistribute modifications, please consider that
21  * scientific software is very special. Version control is crucial -
22  * bugs must be traceable. We will be happy to consider code for
23  * inclusion in the official distribution, but derived work must not
24  * be called official GROMACS. Details are found in the README & COPYING
25  * files - if they are missing, get the official version at www.gromacs.org.
26  *
27  * To help us fund GROMACS development, we humbly ask that you cite
28  * the papers on the package - you can find them in the top README file.
29  *
30  * For more info, check our website at http://www.gromacs.org
31  */
32
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif
36
37 #include <math.h>
38 #include <string.h>
39 #include "sysstuff.h"
40 #include "smalloc.h"
41 #include "macros.h"
42 #include "maths.h"
43 #include "vec.h"
44 #include "pbc.h"
45 #include "nbnxn_consts.h"
46 #include "nbnxn_internal.h"
47 #include "nbnxn_atomdata.h"
48 #include "nbnxn_search.h"
49 #include "gmx_cyclecounter.h"
50 #include "gmxfio.h"
51 #include "gmx_omp_nthreads.h"
52 #include "nrnb.h"
53
54
55 /* Pair search box lower and upper corner in x,y,z.
56  * Store this in 4 iso 3 reals, which is useful with SSE.
57  * To avoid complicating the code we also use 4 without SSE.
58  */
59 #define NNBSBB_C         4
60 #define NNBSBB_B         (2*NNBSBB_C)
61 /* Pair search box lower and upper bound in z only. */
62 #define NNBSBB_D         2
63 /* Pair search box lower and upper corner x,y,z indices */
64 #define BBL_X  0
65 #define BBL_Y  1
66 #define BBL_Z  2
67 #define BBU_X  4
68 #define BBU_Y  5
69 #define BBU_Z  6
70
71
72 #ifdef NBNXN_SEARCH_SSE
73
74 #ifndef GMX_DOUBLE
75 #define NBNXN_SEARCH_SSE_SINGLE
76 #endif
77
78 /* Include basic SSE2 stuff */
79 #include <emmintrin.h>
80
81 #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
82 #define NBNXN_8BB_SSE
83 #endif
84
85 /* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
86  * Here AVX-256 turns out to be slightly slower than AVX-128.
87  */
88 #define STRIDE_8BB        4
89 #define STRIDE_8BB_2LOG   2
90
91 #endif /* NBNXN_SEARCH_SSE */
92
93 #ifdef GMX_NBNXN_SIMD
94
95 /* The functions below are macros as they are performance sensitive */
96
97 /* 4x4 list, pack=4: no complex conversion required */
98 /* i-cluster to j-cluster conversion */
99 #define CI_TO_CJ_J4(ci)   (ci)
100 /* cluster index to coordinate array index conversion */
101 #define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
102 #define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
103
104 /* 4x2 list, pack=4: j-cluster size is half the packing width */
105 /* i-cluster to j-cluster conversion */
106 #define CI_TO_CJ_J2(ci)  ((ci)<<1)
107 /* cluster index to coordinate array index conversion */
108 #define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
109 #define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
110
111 /* 4x8 list, pack=8: i-cluster size is half the packing width */
112 /* i-cluster to j-cluster conversion */
113 #define CI_TO_CJ_J8(ci)  ((ci)>>1)
114 /* cluster index to coordinate array index conversion */
115 #define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
116 #define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
117
118 /* The j-cluster size is matched to the SIMD width */
119 #if GMX_NBNXN_SIMD_BITWIDTH == 128
120 #ifdef GMX_DOUBLE
121 #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
122 #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
123 #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
124 #else
125 #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
126 #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
127 #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
128 #endif
129 #else
130 #if GMX_NBNXN_SIMD_BITWIDTH == 256
131 #ifdef GMX_DOUBLE
132 #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
133 #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
134 #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
135 #else
136 #define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
137 #define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
138 #define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
139 /* Half SIMD with j-cluster size */
140 #define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
141 #define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
142 #define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
143 #endif
144 #else
145 #error "unsupported GMX_NBNXN_SIMD_WIDTH"
146 #endif
147 #endif
148
149 #endif /* GMX_NBNXN_SIMD */
150
151
152 /* Interaction masks for 4xN atom interactions.
153  * Bit i*CJ_SIZE + j tells if atom i and j interact.
154  */
155 /* All interaction mask is the same for all kernels */
156 #define NBNXN_INT_MASK_ALL        0xffffffff
157 /* 4x4 kernel diagonal mask */
158 #define NBNXN_INT_MASK_DIAG       0x08ce
159 /* 4x2 kernel diagonal masks */
160 #define NBNXN_INT_MASK_DIAG_J2_0  0x0002
161 #define NBNXN_INT_MASK_DIAG_J2_1  0x002F
162 /* 4x8 kernel diagonal masks */
163 #define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
164 #define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
165
166
167 #ifdef NBNXN_SEARCH_SSE
168 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
169 #define NBNXN_BBXXXX
170 /* Size of bounding box corners quadruplet */
171 #define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
172 #endif
173
174 /* We shift the i-particles backward for PBC.
175  * This leads to more conditionals than shifting forward.
176  * We do this to get more balanced pair lists.
177  */
178 #define NBNXN_SHIFT_BACKWARD
179
180
181 /* This define is a lazy way to avoid interdependence of the grid
182  * and searching data structures.
183  */
184 #define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
185
186
187 static void nbs_cycle_clear(nbnxn_cycle_t *cc)
188 {
189     int i;
190
191     for(i=0; i<enbsCCnr; i++)
192     {
193         cc[i].count = 0;
194         cc[i].c     = 0;
195     }
196 }
197
198 static double Mcyc_av(const nbnxn_cycle_t *cc)
199 {
200     return (double)cc->c*1e-6/cc->count;
201 }
202
203 static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
204 {
205     int n;
206     int t;
207
208     fprintf(fp,"\n");
209     fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
210             nbs->cc[enbsCCgrid].count,
211             Mcyc_av(&nbs->cc[enbsCCgrid]),
212             Mcyc_av(&nbs->cc[enbsCCsearch]),
213             Mcyc_av(&nbs->cc[enbsCCreducef]));
214
215     if (nbs->nthread_max > 1)
216     {
217         if (nbs->cc[enbsCCcombine].count > 0)
218         {
219             fprintf(fp," comb %5.2f",
220                     Mcyc_av(&nbs->cc[enbsCCcombine]));
221         }
222         fprintf(fp," s. th");
223         for(t=0; t<nbs->nthread_max; t++)
224         {
225             fprintf(fp," %4.1f",
226                     Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
227         }
228     }
229     fprintf(fp,"\n");
230 }
231
232 static void nbnxn_grid_init(nbnxn_grid_t * grid)
233 {
234     grid->cxy_na      = NULL;
235     grid->cxy_ind     = NULL;
236     grid->cxy_nalloc  = 0;
237     grid->bb          = NULL;
238     grid->bbj         = NULL;
239     grid->nc_nalloc   = 0;
240 }
241
242 static int get_2log(int n)
243 {
244     int log2;
245
246     log2 = 0;
247     while ((1<<log2) < n)
248     {
249         log2++;
250     }
251     if ((1<<log2) != n)
252     {
253         gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
254     }
255
256     return log2;
257 }
258
259 static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
260 {
261     switch (nb_kernel_type)
262     {
263     case nbnxnk4x4_PlainC:
264     case nbnxnk4xN_SIMD_4xN:
265     case nbnxnk4xN_SIMD_2xNN:
266         return NBNXN_CPU_CLUSTER_I_SIZE;
267     case nbnxnk8x8x8_CUDA:
268     case nbnxnk8x8x8_PlainC:
269         /* The cluster size for super/sub lists is only set here.
270          * Any value should work for the pair-search and atomdata code.
271          * The kernels, of course, might require a particular value.
272          */
273         return NBNXN_GPU_CLUSTER_SIZE;
274     default:
275         gmx_incons("unknown kernel type");
276     }
277
278     return 0;
279 }
280
281 int nbnxn_kernel_to_cj_size(int nb_kernel_type)
282 {
283     int nbnxn_simd_width=0;
284     int cj_size=0;
285
286 #ifdef GMX_NBNXN_SIMD
287     nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
288 #endif
289
290     switch (nb_kernel_type)
291     {
292     case nbnxnk4x4_PlainC:
293         cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
294         break;
295     case nbnxnk4xN_SIMD_4xN:
296         cj_size = nbnxn_simd_width;
297         break;
298     case nbnxnk4xN_SIMD_2xNN:
299         cj_size = nbnxn_simd_width/2;
300         break;
301     case nbnxnk8x8x8_CUDA:
302     case nbnxnk8x8x8_PlainC:
303         cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
304         break;
305     default:
306         gmx_incons("unknown kernel type");
307     }
308
309     return cj_size;
310 }
311
312 static int ci_to_cj(int na_cj_2log,int ci)
313 {
314     switch (na_cj_2log)
315     {
316     case 2: return  ci;     break;
317     case 1: return (ci<<1); break;
318     case 3: return (ci>>1); break;
319     }
320
321     return 0;
322 }
323
324 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
325 {
326     if (nb_kernel_type == nbnxnkNotSet)
327     {
328         gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
329     }
330
331     switch (nb_kernel_type)
332     {
333     case nbnxnk8x8x8_CUDA:
334     case nbnxnk8x8x8_PlainC:
335         return FALSE;
336
337     case nbnxnk4x4_PlainC:
338     case nbnxnk4xN_SIMD_4xN:
339     case nbnxnk4xN_SIMD_2xNN:
340         return TRUE;
341
342     default:
343         gmx_incons("Invalid nonbonded kernel type passed!");
344         return FALSE;
345     }
346 }
347
348 void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
349                        ivec *n_dd_cells,
350                        gmx_domdec_zones_t *zones,
351                        int nthread_max)
352 {
353     nbnxn_search_t nbs;
354     int d,g,t;
355
356     snew(nbs,1);
357     *nbs_ptr = nbs;
358
359     nbs->DomDec = (n_dd_cells != NULL);
360
361     clear_ivec(nbs->dd_dim);
362     nbs->ngrid = 1;
363     if (nbs->DomDec)
364     {
365         nbs->zones = zones;
366
367         for(d=0; d<DIM; d++)
368         {
369             if ((*n_dd_cells)[d] > 1)
370             {
371                 nbs->dd_dim[d] = 1;
372                 /* Each grid matches a DD zone */
373                 nbs->ngrid *= 2;
374             }
375         }
376     }
377
378     snew(nbs->grid,nbs->ngrid);
379     for(g=0; g<nbs->ngrid; g++)
380     {
381         nbnxn_grid_init(&nbs->grid[g]);
382     }
383     nbs->cell        = NULL;
384     nbs->cell_nalloc = 0;
385     nbs->a           = NULL;
386     nbs->a_nalloc    = 0;
387
388     nbs->nthread_max = nthread_max;
389
390     /* Initialize the work data structures for each thread */
391     snew(nbs->work,nbs->nthread_max);
392     for(t=0; t<nbs->nthread_max; t++)
393     {
394         nbs->work[t].cxy_na           = NULL;
395         nbs->work[t].cxy_na_nalloc    = 0;
396         nbs->work[t].sort_work        = NULL;
397         nbs->work[t].sort_work_nalloc = 0;
398     }
399
400     /* Initialize detailed nbsearch cycle counting */
401     nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
402     nbs->search_count = 0;
403     nbs_cycle_clear(nbs->cc);
404     for(t=0; t<nbs->nthread_max; t++)
405     {
406         nbs_cycle_clear(nbs->work[t].cc);
407     }
408 }
409
410 static real grid_atom_density(int n,rvec corner0,rvec corner1)
411 {
412     rvec size;
413
414     rvec_sub(corner1,corner0,size);
415
416     return n/(size[XX]*size[YY]*size[ZZ]);
417 }
418
419 static int set_grid_size_xy(const nbnxn_search_t nbs,
420                             nbnxn_grid_t *grid,
421                             int n,rvec corner0,rvec corner1,
422                             real atom_density,
423                             int XFormat)
424 {
425     rvec size;
426     int  na_c;
427     real adens,tlen,tlen_x,tlen_y,nc_max;
428     int  t;
429
430     rvec_sub(corner1,corner0,size);
431
432     if (n > grid->na_sc)
433     {
434         /* target cell length */
435         if (grid->bSimple)
436         {
437             /* To minimize the zero interactions, we should make
438              * the largest of the i/j cell cubic.
439              */
440             na_c = max(grid->na_c,grid->na_cj);
441
442             /* Approximately cubic cells */
443             tlen   = pow(na_c/atom_density,1.0/3.0);
444             tlen_x = tlen;
445             tlen_y = tlen;
446         }
447         else
448         {
449             /* Approximately cubic sub cells */
450             tlen   = pow(grid->na_c/atom_density,1.0/3.0);
451             tlen_x = tlen*GPU_NSUBCELL_X;
452             tlen_y = tlen*GPU_NSUBCELL_Y;
453         }
454         /* We round ncx and ncy down, because we get less cell pairs
455          * in the nbsist when the fixed cell dimensions (x,y) are
456          * larger than the variable one (z) than the other way around.
457          */
458         grid->ncx = max(1,(int)(size[XX]/tlen_x));
459         grid->ncy = max(1,(int)(size[YY]/tlen_y));
460     }
461     else
462     {
463         grid->ncx = 1;
464         grid->ncy = 1;
465     }
466
467     /* We need one additional cell entry for particles moved by DD */
468     if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
469     {
470         grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
471         srenew(grid->cxy_na,grid->cxy_nalloc);
472         srenew(grid->cxy_ind,grid->cxy_nalloc+1);
473     }
474     for(t=0; t<nbs->nthread_max; t++)
475     {
476         if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
477         {
478             nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
479             srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
480         }
481     }
482
483     /* Worst case scenario of 1 atom in each last cell */
484     if (grid->na_cj <= grid->na_c)
485     {
486         nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
487     }
488     else
489     {
490         nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
491     }
492
493     if (nc_max > grid->nc_nalloc)
494     {
495         int bb_nalloc;
496
497         grid->nc_nalloc = over_alloc_large(nc_max);
498         srenew(grid->nsubc,grid->nc_nalloc);
499         srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
500 #ifdef NBNXN_8BB_SSE
501         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
502 #else
503         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
504 #endif
505         sfree_aligned(grid->bb);
506         /* This snew also zeros the contents, this avoid possible
507          * floating exceptions in SSE with the unused bb elements.
508          */
509         snew_aligned(grid->bb,bb_nalloc,16);
510
511         if (grid->bSimple)
512         {
513             if (grid->na_cj == grid->na_c)
514             {
515                 grid->bbj = grid->bb;
516             }
517             else
518             {
519                 sfree_aligned(grid->bbj);
520                 snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
521             }
522         }
523
524         srenew(grid->flags,grid->nc_nalloc);
525     }
526
527     copy_rvec(corner0,grid->c0);
528     copy_rvec(corner1,grid->c1);
529     grid->sx = size[XX]/grid->ncx;
530     grid->sy = size[YY]/grid->ncy;
531     grid->inv_sx = 1/grid->sx;
532     grid->inv_sy = 1/grid->sy;
533
534     return nc_max;
535 }
536
537 #define SORT_GRID_OVERSIZE 2
538 #define SGSF (SORT_GRID_OVERSIZE + 1)
539
540 static void sort_atoms(int dim,gmx_bool Backwards,
541                        int *a,int n,rvec *x,
542                        real h0,real invh,int nsort,int *sort)
543 {
544     int i,c;
545     int zi,zim;
546     int cp,tmp;
547
548     if (n <= 1)
549     {
550         /* Nothing to do */
551         return;
552     }
553
554     /* For small oversize factors clearing the whole area is fastest.
555      * For large oversize we should clear the used elements after use.
556      */
557     for(i=0; i<nsort; i++)
558     {
559         sort[i] = -1;
560     }
561     /* Sort the particles using a simple index sort */
562     for(i=0; i<n; i++)
563     {
564         /* The cast takes care of float-point rounding effects below zero.
565          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
566          * times the box height out of the box.
567          */
568         zi = (int)((x[a[i]][dim] - h0)*invh);
569
570 #ifdef DEBUG_NBNXN_GRIDDING
571         if (zi < 0 || zi >= nsort)
572         {
573             gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
574                       a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
575         }
576 #endif
577
578         /* Ideally this particle should go in sort cell zi,
579          * but that might already be in use,
580          * in that case find the first empty cell higher up
581          */
582         if (sort[zi] < 0)
583         {
584             sort[zi] = a[i];
585         }
586         else
587         {
588             /* We have multiple atoms in the same sorting slot.
589              * Sort on real z for minimal bounding box size.
590              * There is an extra check for identical z to ensure
591              * well-defined output order, independent of input order
592              * to ensure binary reproducibility after restarts.
593              */
594             while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
595                                     (x[a[i]][dim] == x[sort[zi]][dim] &&
596                                      a[i] > sort[zi])))
597             {
598                 zi++;
599             }
600
601             if (sort[zi] >= 0)
602             {
603                 /* Shift all elements by one slot until we find an empty slot */
604                 cp = sort[zi];
605                 zim = zi + 1;
606                 while (sort[zim] >= 0)
607                 {
608                     tmp = sort[zim];
609                     sort[zim] = cp;
610                     cp  = tmp;
611                     zim++;
612                 }
613                 sort[zim] = cp;
614             }
615             sort[zi] = a[i];
616         }
617     }
618
619     c = 0;
620     if (!Backwards)
621     {
622         for(zi=0; zi<nsort; zi++)
623         {
624             if (sort[zi] >= 0)
625             {
626                 a[c++] = sort[zi];
627             }
628         }
629     }
630     else
631     {
632         for(zi=nsort-1; zi>=0; zi--)
633         {
634             if (sort[zi] >= 0)
635             {
636                 a[c++] = sort[zi];
637             }
638         }
639     }
640     if (c < n)
641     {
642         gmx_incons("Lost particles while sorting");
643     }
644 }
645
646 #ifdef GMX_DOUBLE
647 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
648 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
649 #else
650 #define R2F_D(x) (x)
651 #define R2F_U(x) (x)
652 #endif
653
654 /* Coordinate order x,y,z, bb order xyz0 */
655 static void calc_bounding_box(int na,int stride,const real *x,float *bb)
656 {
657     int  i,j;
658     real xl,xh,yl,yh,zl,zh;
659
660     i = 0;
661     xl = x[i+XX];
662     xh = x[i+XX];
663     yl = x[i+YY];
664     yh = x[i+YY];
665     zl = x[i+ZZ];
666     zh = x[i+ZZ];
667     i += stride;
668     for(j=1; j<na; j++)
669     {
670         xl = min(xl,x[i+XX]);
671         xh = max(xh,x[i+XX]);
672         yl = min(yl,x[i+YY]);
673         yh = max(yh,x[i+YY]);
674         zl = min(zl,x[i+ZZ]);
675         zh = max(zh,x[i+ZZ]);
676         i += stride;
677     }
678     /* Note: possible double to float conversion here */
679     bb[BBL_X] = R2F_D(xl);
680     bb[BBL_Y] = R2F_D(yl);
681     bb[BBL_Z] = R2F_D(zl);
682     bb[BBU_X] = R2F_U(xh);
683     bb[BBU_Y] = R2F_U(yh);
684     bb[BBU_Z] = R2F_U(zh);
685 }
686
687 /* Packed coordinates, bb order xyz0 */
688 static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
689 {
690     int  j;
691     real xl,xh,yl,yh,zl,zh;
692
693     xl = x[XX*PACK_X4];
694     xh = x[XX*PACK_X4];
695     yl = x[YY*PACK_X4];
696     yh = x[YY*PACK_X4];
697     zl = x[ZZ*PACK_X4];
698     zh = x[ZZ*PACK_X4];
699     for(j=1; j<na; j++)
700     {
701         xl = min(xl,x[j+XX*PACK_X4]);
702         xh = max(xh,x[j+XX*PACK_X4]);
703         yl = min(yl,x[j+YY*PACK_X4]);
704         yh = max(yh,x[j+YY*PACK_X4]);
705         zl = min(zl,x[j+ZZ*PACK_X4]);
706         zh = max(zh,x[j+ZZ*PACK_X4]);
707     }
708     /* Note: possible double to float conversion here */
709     bb[BBL_X] = R2F_D(xl);
710     bb[BBL_Y] = R2F_D(yl);
711     bb[BBL_Z] = R2F_D(zl);
712     bb[BBU_X] = R2F_U(xh);
713     bb[BBU_Y] = R2F_U(yh);
714     bb[BBU_Z] = R2F_U(zh);
715 }
716
717 /* Packed coordinates, bb order xyz0 */
718 static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
719 {
720     int  j;
721     real xl,xh,yl,yh,zl,zh;
722
723     xl = x[XX*PACK_X8];
724     xh = x[XX*PACK_X8];
725     yl = x[YY*PACK_X8];
726     yh = x[YY*PACK_X8];
727     zl = x[ZZ*PACK_X8];
728     zh = x[ZZ*PACK_X8];
729     for(j=1; j<na; j++)
730     {
731         xl = min(xl,x[j+XX*PACK_X8]);
732         xh = max(xh,x[j+XX*PACK_X8]);
733         yl = min(yl,x[j+YY*PACK_X8]);
734         yh = max(yh,x[j+YY*PACK_X8]);
735         zl = min(zl,x[j+ZZ*PACK_X8]);
736         zh = max(zh,x[j+ZZ*PACK_X8]);
737     }
738     /* Note: possible double to float conversion here */
739     bb[BBL_X] = R2F_D(xl);
740     bb[BBL_Y] = R2F_D(yl);
741     bb[BBL_Z] = R2F_D(zl);
742     bb[BBU_X] = R2F_U(xh);
743     bb[BBU_Y] = R2F_U(yh);
744     bb[BBU_Z] = R2F_U(zh);
745 }
746
747 #ifdef NBNXN_SEARCH_SSE
748
749 /* Packed coordinates, bb order xyz0 */
750 static void calc_bounding_box_x_x4_halves(int na,const real *x,
751                                           float *bb,float *bbj)
752 {
753     calc_bounding_box_x_x4(min(na,2),x,bbj);
754
755     if (na > 2)
756     {
757         calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
758     }
759     else
760     {
761         /* Set the "empty" bounding box to the same as the first one,
762          * so we don't need to treat special cases in the rest of the code.
763          */
764         _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
765         _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
766     }
767
768     _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
769                                         _mm_load_ps(bbj+NNBSBB_B)));
770     _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
771                                         _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
772 }
773
774 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
775 static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
776 {
777     int  i,j;
778     real xl,xh,yl,yh,zl,zh;
779
780     i = 0;
781     xl = x[i+XX];
782     xh = x[i+XX];
783     yl = x[i+YY];
784     yh = x[i+YY];
785     zl = x[i+ZZ];
786     zh = x[i+ZZ];
787     i += stride;
788     for(j=1; j<na; j++)
789     {
790         xl = min(xl,x[i+XX]);
791         xh = max(xh,x[i+XX]);
792         yl = min(yl,x[i+YY]);
793         yh = max(yh,x[i+YY]);
794         zl = min(zl,x[i+ZZ]);
795         zh = max(zh,x[i+ZZ]);
796         i += stride;
797     }
798     /* Note: possible double to float conversion here */
799     bb[0*STRIDE_8BB] = R2F_D(xl);
800     bb[1*STRIDE_8BB] = R2F_D(yl);
801     bb[2*STRIDE_8BB] = R2F_D(zl);
802     bb[3*STRIDE_8BB] = R2F_U(xh);
803     bb[4*STRIDE_8BB] = R2F_U(yh);
804     bb[5*STRIDE_8BB] = R2F_U(zh);
805 }
806
807 #endif /* NBNXN_SEARCH_SSE */
808
809 #ifdef NBNXN_SEARCH_SSE_SINGLE
810
811 /* Coordinate order xyz?, bb order xyz0 */
812 static void calc_bounding_box_sse(int na,const float *x,float *bb)
813 {
814     __m128 bb_0_SSE,bb_1_SSE;
815     __m128 x_SSE;
816
817     int  i;
818
819     bb_0_SSE = _mm_load_ps(x);
820     bb_1_SSE = bb_0_SSE;
821
822     for(i=1; i<na; i++)
823     {
824         x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
825         bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
826         bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
827     }
828
829     _mm_store_ps(bb  ,bb_0_SSE);
830     _mm_store_ps(bb+4,bb_1_SSE);
831 }
832
833 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
834 static void calc_bounding_box_xxxx_sse(int na,const float *x,
835                                        float *bb_work,
836                                        real *bb)
837 {
838     calc_bounding_box_sse(na,x,bb_work);
839
840     bb[0*STRIDE_8BB] = bb_work[BBL_X];
841     bb[1*STRIDE_8BB] = bb_work[BBL_Y];
842     bb[2*STRIDE_8BB] = bb_work[BBL_Z];
843     bb[3*STRIDE_8BB] = bb_work[BBU_X];
844     bb[4*STRIDE_8BB] = bb_work[BBU_Y];
845     bb[5*STRIDE_8BB] = bb_work[BBU_Z];
846 }
847
848 #endif /* NBNXN_SEARCH_SSE_SINGLE */
849
850 #ifdef NBNXN_SEARCH_SSE
851
852 /* Combines pairs of consecutive bounding boxes */
853 static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
854 {
855     int    i,j,sc2,nc2,c2;
856     __m128 min_SSE,max_SSE;
857
858     for(i=0; i<grid->ncx*grid->ncy; i++)
859     {
860         /* Starting bb in a column is expected to be 2-aligned */
861         sc2 = grid->cxy_ind[i]>>1;
862         /* For odd numbers skip the last bb here */
863         nc2 = (grid->cxy_na[i]+3)>>(2+1);
864         for(c2=sc2; c2<sc2+nc2; c2++)
865         {
866             min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
867                                  _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
868             max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
869                                  _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
870             _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
871             _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
872         }
873         if (((grid->cxy_na[i]+3)>>2) & 1)
874         {
875             /* Copy the last bb for odd bb count in this column */
876             for(j=0; j<NNBSBB_C; j++)
877             {
878                 grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
879                 grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
880             }
881         }
882     }
883 }
884
885 #endif
886
887
888 /* Prints the average bb size, used for debug output */
889 static void print_bbsizes_simple(FILE *fp,
890                                  const nbnxn_search_t nbs,
891                                  const nbnxn_grid_t *grid)
892 {
893     int  c,d;
894     dvec ba;
895
896     clear_dvec(ba);
897     for(c=0; c<grid->nc; c++)
898     {
899         for(d=0; d<DIM; d++)
900         {
901             ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
902         }
903     }
904     dsvmul(1.0/grid->nc,ba,ba);
905
906     fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
907             nbs->box[XX][XX]/grid->ncx,
908             nbs->box[YY][YY]/grid->ncy,
909             nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
910             ba[XX],ba[YY],ba[ZZ],
911             ba[XX]*grid->ncx/nbs->box[XX][XX],
912             ba[YY]*grid->ncy/nbs->box[YY][YY],
913             ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
914 }
915
916 /* Prints the average bb size, used for debug output */
917 static void print_bbsizes_supersub(FILE *fp,
918                                    const nbnxn_search_t nbs,
919                                    const nbnxn_grid_t *grid)
920 {
921     int  ns,c,s;
922     dvec ba;
923
924     clear_dvec(ba);
925     ns = 0;
926     for(c=0; c<grid->nc; c++)
927     {
928 #ifdef NBNXN_BBXXXX
929         for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
930         {
931             int cs_w,i,d;
932
933             cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
934             for(i=0; i<STRIDE_8BB; i++)
935             {
936                 for(d=0; d<DIM; d++)
937                 {
938                     ba[d] +=
939                         grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
940                         grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
941                 }
942             }
943         }
944 #else
945         for(s=0; s<grid->nsubc[c]; s++)
946         {
947             int cs,d;
948
949             cs = c*GPU_NSUBCELL + s;
950             for(d=0; d<DIM; d++)
951             {
952                 ba[d] +=
953                     grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
954                     grid->bb[cs*NNBSBB_B         +d];
955             }
956         }
957 #endif
958         ns += grid->nsubc[c];
959     }
960     dsvmul(1.0/ns,ba,ba);
961
962     fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
963             nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
964             nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
965             nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
966             ba[XX],ba[YY],ba[ZZ],
967             ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
968             ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
969             ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
970 }
971
972 /* Potentially sorts atoms on LJ coefficients !=0 and ==0.
973  * Also sets interaction flags.
974  */
975 void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
976                 int a0,int a1,const int *atinfo,
977                 int *order,
978                 int *flags)
979 {
980     int subc,s,a,n1,n2,a_lj_max,i,j;
981     int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
982     int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
983     gmx_bool haveQ;
984
985     *flags = 0;
986
987     subc = 0;
988     for(s=a0; s<a1; s+=na_c)
989     {
990         /* Make lists for this (sub-)cell on atoms with and without LJ */
991         n1 = 0;
992         n2 = 0;
993         haveQ = FALSE;
994         a_lj_max = -1;
995         for(a=s; a<min(s+na_c,a1); a++)
996         {
997             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
998
999             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
1000             {
1001                 sort1[n1++] = order[a];
1002                 a_lj_max = a;
1003             }
1004             else
1005             {
1006                 sort2[n2++] = order[a];
1007             }
1008         }
1009
1010         /* If we don't have atom with LJ, there's nothing to sort */
1011         if (n1 > 0)
1012         {
1013             *flags |= NBNXN_CI_DO_LJ(subc);
1014
1015             if (2*n1 <= na_c)
1016             {
1017                 /* Only sort when strictly necessary. Ordering particles
1018                  * Ordering particles can lead to less accurate summation
1019                  * due to rounding, both for LJ and Coulomb interactions.
1020                  */
1021                 if (2*(a_lj_max - s) >= na_c)
1022                 {
1023                     for(i=0; i<n1; i++)
1024                     {
1025                         order[a0+i] = sort1[i];
1026                     }
1027                     for(j=0; j<n2; j++)
1028                     {
1029                         order[a0+n1+j] = sort2[j];
1030                     }
1031                 }
1032
1033                 *flags |= NBNXN_CI_HALF_LJ(subc);
1034             }
1035         }
1036         if (haveQ)
1037         {
1038             *flags |= NBNXN_CI_DO_COUL(subc);
1039         }
1040         subc++;
1041     }
1042 }
1043
1044 /* Fill a pair search cell with atoms.
1045  * Potentially sorts atoms and sets the interaction flags.
1046  */
1047 void fill_cell(const nbnxn_search_t nbs,
1048                nbnxn_grid_t *grid,
1049                nbnxn_atomdata_t *nbat,
1050                int a0,int a1,
1051                const int *atinfo,
1052                rvec *x,
1053                int sx,int sy, int sz,
1054                float *bb_work)
1055 {
1056     int    na,a;
1057     size_t offset;
1058     float  *bb_ptr;
1059
1060     na = a1 - a0;
1061
1062     if (grid->bSimple)
1063     {
1064         sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
1065                    grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
1066     }
1067
1068     /* Now we have sorted the atoms, set the cell indices */
1069     for(a=a0; a<a1; a++)
1070     {
1071         nbs->cell[nbs->a[a]] = a;
1072     }
1073
1074     copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
1075                            nbat->XFormat,nbat->x,a0,
1076                            sx,sy,sz);
1077
1078     if (nbat->XFormat == nbatX4)
1079     {
1080         /* Store the bounding boxes as xyz.xyz. */
1081         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1082         bb_ptr = grid->bb + offset;
1083
1084 #if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
1085         if (2*grid->na_cj == grid->na_c)
1086         {
1087             calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
1088                                           grid->bbj+offset*2);
1089         }
1090         else
1091 #endif
1092         {
1093             calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
1094         }
1095     }
1096     else if (nbat->XFormat == nbatX8)
1097     {
1098         /* Store the bounding boxes as xyz.xyz. */
1099         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1100         bb_ptr = grid->bb + offset;
1101
1102         calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
1103     }
1104 #ifdef NBNXN_BBXXXX
1105     else if (!grid->bSimple)
1106     {
1107         /* Store the bounding boxes in a format convenient
1108          * for SSE calculations: xxxxyyyyzzzz...
1109                              */
1110         bb_ptr =
1111             grid->bb +
1112             ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
1113             (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
1114
1115 #ifdef NBNXN_SEARCH_SSE_SINGLE
1116         if (nbat->XFormat == nbatXYZQ)
1117         {
1118             calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
1119                                        bb_work,bb_ptr);
1120         }
1121         else
1122 #endif
1123         {
1124             calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
1125                                    bb_ptr);
1126         }
1127         if (gmx_debug_at)
1128         {
1129             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1130                     sx,sy,sz,
1131                     bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
1132                     bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
1133                     bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
1134         }
1135     }
1136 #endif
1137     else
1138     {
1139         /* Store the bounding boxes as xyz.xyz. */
1140         bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
1141
1142         calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
1143                           bb_ptr);
1144
1145         if (gmx_debug_at)
1146         {
1147             int bbo;
1148             bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
1149             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1150                     sx,sy,sz,
1151                     (grid->bb+bbo*NNBSBB_B)[BBL_X],
1152                     (grid->bb+bbo*NNBSBB_B)[BBU_X],
1153                     (grid->bb+bbo*NNBSBB_B)[BBL_Y],
1154                     (grid->bb+bbo*NNBSBB_B)[BBU_Y],
1155                     (grid->bb+bbo*NNBSBB_B)[BBL_Z],
1156                     (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
1157         }
1158     }
1159 }
1160
1161 /* Spatially sort the atoms within one grid column */
1162 static void sort_columns_simple(const nbnxn_search_t nbs,
1163                                 int dd_zone,
1164                                 nbnxn_grid_t *grid,
1165                                 int a0,int a1,
1166                                 const int *atinfo,
1167                                 rvec *x,
1168                                 nbnxn_atomdata_t *nbat,
1169                                 int cxy_start,int cxy_end,
1170                                 int *sort_work)
1171 {
1172     int  cxy;
1173     int  cx,cy,cz,ncz,cfilled,c;
1174     int  na,ash,ind,a;
1175     int  na_c,ash_c;
1176
1177     if (debug)
1178     {
1179         fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1180                 grid->cell0,cxy_start,cxy_end,a0,a1);
1181     }
1182
1183     /* Sort the atoms within each x,y column in 3 dimensions */
1184     for(cxy=cxy_start; cxy<cxy_end; cxy++)
1185     {
1186         cx = cxy/grid->ncy;
1187         cy = cxy - cx*grid->ncy;
1188
1189         na  = grid->cxy_na[cxy];
1190         ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1191         ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1192
1193         /* Sort the atoms within each x,y column on z coordinate */
1194         sort_atoms(ZZ,FALSE,
1195                    nbs->a+ash,na,x,
1196                    grid->c0[ZZ],
1197                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
1198                    ncz*grid->na_sc*SGSF,sort_work);
1199
1200         /* Fill the ncz cells in this column */
1201         cfilled = grid->cxy_ind[cxy];
1202         for(cz=0; cz<ncz; cz++)
1203         {
1204             c  = grid->cxy_ind[cxy] + cz ;
1205
1206             ash_c = ash + cz*grid->na_sc;
1207             na_c  = min(grid->na_sc,na-(ash_c-ash));
1208
1209             fill_cell(nbs,grid,nbat,
1210                       ash_c,ash_c+na_c,atinfo,x,
1211                       grid->na_sc*cx + (dd_zone >> 2),
1212                       grid->na_sc*cy + (dd_zone & 3),
1213                       grid->na_sc*cz,
1214                       NULL);
1215
1216             /* This copy to bbcz is not really necessary.
1217              * But it allows to use the same grid search code
1218              * for the simple and supersub cell setups.
1219              */
1220             if (na_c > 0)
1221             {
1222                 cfilled = c;
1223             }
1224             grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
1225             grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
1226         }
1227
1228         /* Set the unused atom indices to -1 */
1229         for(ind=na; ind<ncz*grid->na_sc; ind++)
1230         {
1231             nbs->a[ash+ind] = -1;
1232         }
1233     }
1234 }
1235
1236 /* Spatially sort the atoms within one grid column */
1237 static void sort_columns_supersub(const nbnxn_search_t nbs,
1238                                   int dd_zone,
1239                                   nbnxn_grid_t *grid,
1240                                   int a0,int a1,
1241                                   const int *atinfo,
1242                                   rvec *x,
1243                                   nbnxn_atomdata_t *nbat,
1244                                   int cxy_start,int cxy_end,
1245                                   int *sort_work)
1246 {
1247     int  cxy;
1248     int  cx,cy,cz=-1,c=-1,ncz;
1249     int  na,ash,na_c,ind,a;
1250     int  subdiv_z,sub_z,na_z,ash_z;
1251     int  subdiv_y,sub_y,na_y,ash_y;
1252     int  subdiv_x,sub_x,na_x,ash_x;
1253
1254     /* cppcheck-suppress unassignedVariable */
1255     float bb_work_array[NNBSBB_B+3],*bb_work_align;
1256
1257     bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
1258
1259     if (debug)
1260     {
1261         fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1262                 grid->cell0,cxy_start,cxy_end,a0,a1);
1263     }
1264
1265     subdiv_x = grid->na_c;
1266     subdiv_y = GPU_NSUBCELL_X*subdiv_x;
1267     subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
1268
1269     /* Sort the atoms within each x,y column in 3 dimensions */
1270     for(cxy=cxy_start; cxy<cxy_end; cxy++)
1271     {
1272         cx = cxy/grid->ncy;
1273         cy = cxy - cx*grid->ncy;
1274
1275         na  = grid->cxy_na[cxy];
1276         ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1277         ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1278
1279         /* Sort the atoms within each x,y column on z coordinate */
1280         sort_atoms(ZZ,FALSE,
1281                    nbs->a+ash,na,x,
1282                    grid->c0[ZZ],
1283                    ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
1284                    ncz*grid->na_sc*SGSF,sort_work);
1285
1286         /* This loop goes over the supercells and subcells along z at once */
1287         for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
1288         {
1289             ash_z = ash + sub_z*subdiv_z;
1290             na_z  = min(subdiv_z,na-(ash_z-ash));
1291
1292             /* We have already sorted on z */
1293
1294             if (sub_z % GPU_NSUBCELL_Z == 0)
1295             {
1296                 cz = sub_z/GPU_NSUBCELL_Z;
1297                 c  = grid->cxy_ind[cxy] + cz ;
1298
1299                 /* The number of atoms in this supercell */
1300                 na_c = min(grid->na_sc,na-(ash_z-ash));
1301
1302                 grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);
1303
1304                 /* Store the z-boundaries of the super cell */
1305                 grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1306                 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
1307             }
1308
1309 #if GPU_NSUBCELL_Y > 1
1310             /* Sort the atoms along y */
1311             sort_atoms(YY,(sub_z & 1),
1312                        nbs->a+ash_z,na_z,x,
1313                        grid->c0[YY]+cy*grid->sy,grid->inv_sy,
1314                        subdiv_y*SGSF,sort_work);
1315 #endif
1316
1317             for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
1318             {
1319                 ash_y = ash_z + sub_y*subdiv_y;
1320                 na_y  = min(subdiv_y,na-(ash_y-ash));
1321
1322 #if GPU_NSUBCELL_X > 1
1323                 /* Sort the atoms along x */
1324                 sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
1325                            nbs->a+ash_y,na_y,x,
1326                            grid->c0[XX]+cx*grid->sx,grid->inv_sx,
1327                            subdiv_x*SGSF,sort_work);
1328 #endif
1329
1330                 for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
1331                 {
1332                     ash_x = ash_y + sub_x*subdiv_x;
1333                     na_x  = min(subdiv_x,na-(ash_x-ash));
1334
1335                     fill_cell(nbs,grid,nbat,
1336                               ash_x,ash_x+na_x,atinfo,x,
1337                               grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
1338                               grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
1339                               grid->na_c*sub_z,
1340                               bb_work_align);
1341                 }
1342             }
1343         }
1344
1345         /* Set the unused atom indices to -1 */
1346         for(ind=na; ind<ncz*grid->na_sc; ind++)
1347         {
1348             nbs->a[ash+ind] = -1;
1349         }
1350     }
1351 }
1352
1353 /* Determine in which grid column atoms should go */
1354 static void calc_column_indices(nbnxn_grid_t *grid,
1355                                 int a0,int a1,
1356                                 rvec *x,const int *move,
1357                                 int thread,int nthread,
1358                                 int *cell,
1359                                 int *cxy_na)
1360 {
1361     int  n0,n1,i;
1362     int  cx,cy;
1363
1364     /* We add one extra cell for particles which moved during DD */
1365     for(i=0; i<grid->ncx*grid->ncy+1; i++)
1366     {
1367         cxy_na[i] = 0;
1368     }
1369
1370     n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
1371     n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
1372     for(i=n0; i<n1; i++)
1373     {
1374         if (move == NULL || move[i] >= 0)
1375         {
1376             /* We need to be careful with rounding,
1377              * particles might be a few bits outside the local box.
1378              * The int cast takes care of the lower bound,
1379              * we need to explicitly take care of the upper bound.
1380              */
1381             cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1382             if (cx == grid->ncx)
1383             {
1384                 cx = grid->ncx - 1;
1385             }
1386             cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1387             if (cy == grid->ncy)
1388             {
1389                 cy = grid->ncy - 1;
1390             }
1391             /* For the moment cell contains only the, grid local,
1392              * x and y indices, not z.
1393              */
1394             cell[i] = cx*grid->ncy + cy;
1395
1396 #ifdef DEBUG_NBNXN_GRIDDING
1397             if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
1398             {
1399                 gmx_fatal(FARGS,
1400                           "grid cell cx %d cy %d out of range (max %d %d)\n"
1401                           "atom %f %f %f, grid->c0 %f %f",
1402                           cx,cy,grid->ncx,grid->ncy,
1403                           x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
1404             }
1405 #endif
1406         }
1407         else
1408         {
1409             /* Put this moved particle after the end of the grid,
1410              * so we can process it later without using conditionals.
1411              */
1412             cell[i] = grid->ncx*grid->ncy;
1413         }
1414
1415         cxy_na[cell[i]]++;
1416     }
1417 }
1418
1419 /* Determine in which grid cells the atoms should go */
1420 static void calc_cell_indices(const nbnxn_search_t nbs,
1421                               int dd_zone,
1422                               nbnxn_grid_t *grid,
1423                               int a0,int a1,
1424                               const int *atinfo,
1425                               rvec *x,
1426                               const int *move,
1427                               nbnxn_atomdata_t *nbat)
1428 {
1429     int  n0,n1,i;
1430     int  cx,cy,cxy,ncz_max,ncz;
1431     int  nthread,thread;
1432     int  *cxy_na,cxy_na_i;
1433
1434     nthread = gmx_omp_nthreads_get(emntPairsearch);
1435
1436 #pragma omp parallel for num_threads(nthread) schedule(static)
1437     for(thread=0; thread<nthread; thread++)
1438     {
1439         calc_column_indices(grid,a0,a1,x,move,thread,nthread,
1440                             nbs->cell,nbs->work[thread].cxy_na);
1441     }
1442
1443     /* Make the cell index as a function of x and y */
1444     ncz_max = 0;
1445     ncz = 0;
1446     grid->cxy_ind[0] = 0;
1447     for(i=0; i<grid->ncx*grid->ncy+1; i++)
1448     {
1449         /* We set ncz_max at the beginning of the loop iso at the end
1450          * to skip i=grid->ncx*grid->ncy which are moved particles
1451          * that do not need to be ordered on the grid.
1452          */
1453         if (ncz > ncz_max)
1454         {
1455             ncz_max = ncz;
1456         }
1457         cxy_na_i = nbs->work[0].cxy_na[i];
1458         for(thread=1; thread<nthread; thread++)
1459         {
1460             cxy_na_i += nbs->work[thread].cxy_na[i];
1461         }
1462         ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1463         if (nbat->XFormat == nbatX8)
1464         {
1465             /* Make the number of cell a multiple of 2 */
1466             ncz = (ncz + 1) & ~1;
1467         }
1468         grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1469         /* Clear cxy_na, so we can reuse the array below */
1470         grid->cxy_na[i] = 0;
1471     }
1472     grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1473
1474     nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1475
1476     if (debug)
1477     {
1478         fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1479                 grid->na_sc,grid->na_c,grid->nc,
1480                 grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
1481                 ncz_max);
1482         if (gmx_debug_at)
1483         {
1484             i = 0;
1485             for(cy=0; cy<grid->ncy; cy++)
1486             {
1487                 for(cx=0; cx<grid->ncx; cx++)
1488                 {
1489                     fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1490                     i++;
1491                 }
1492                 fprintf(debug,"\n");
1493             }
1494         }
1495     }
1496
1497     /* Make sure the work array for sorting is large enough */
1498     if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1499     {
1500         for(thread=0; thread<nbs->nthread_max; thread++)
1501         {
1502             nbs->work[thread].sort_work_nalloc =
1503                 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1504             srenew(nbs->work[thread].sort_work,
1505                    nbs->work[thread].sort_work_nalloc);
1506         }
1507     }
1508
1509     /* Now we know the dimensions we can fill the grid.
1510      * This is the first, unsorted fill. We sort the columns after this.
1511      */
1512     for(i=a0; i<a1; i++)
1513     {
1514         /* At this point nbs->cell contains the local grid x,y indices */
1515         cxy = nbs->cell[i];
1516         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1517     }
1518
1519     /* Set the cell indices for the moved particles */
1520     n0 = grid->nc*grid->na_sc;
1521     n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1522     for(i=n0; i<n1; i++)
1523     {
1524         nbs->cell[nbs->a[i]] = i;
1525     }
1526
1527     /* Sort the super-cell columns along z into the sub-cells. */
1528 #pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
1529     for(thread=0; thread<nbs->nthread_max; thread++)
1530     {
1531         if (grid->bSimple)
1532         {
1533             sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
1534                                 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1535                                 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1536                                 nbs->work[thread].sort_work);
1537         }
1538         else
1539         {
1540             sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
1541                                   ((thread+0)*grid->ncx*grid->ncy)/nthread,
1542                                   ((thread+1)*grid->ncx*grid->ncy)/nthread,
1543                                   nbs->work[thread].sort_work);
1544         }
1545     }
1546
1547 #ifdef NBNXN_SEARCH_SSE
1548     if (grid->bSimple && nbat->XFormat == nbatX8)
1549     {
1550         combine_bounding_box_pairs(grid,grid->bb);
1551     }
1552 #endif
1553
1554     if (!grid->bSimple)
1555     {
1556         grid->nsubc_tot = 0;
1557         for(i=0; i<grid->nc; i++)
1558         {
1559             grid->nsubc_tot += grid->nsubc[i];
1560         }
1561     }
1562
1563     if (debug)
1564     {
1565         if (grid->bSimple)
1566         {
1567             print_bbsizes_simple(debug,nbs,grid);
1568         }
1569         else
1570         {
1571             fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
1572                     grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);
1573
1574             print_bbsizes_supersub(debug,nbs,grid);
1575         }
1576     }
1577 }
1578
1579 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
1580                               int natoms)
1581 {
1582     int b;
1583
1584     flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
1585     if (flags->nflag > flags->flag_nalloc)
1586     {
1587         flags->flag_nalloc = over_alloc_large(flags->nflag);
1588         srenew(flags->flag,flags->flag_nalloc);
1589     }
1590     for(b=0; b<flags->nflag; b++)
1591     {
1592         flags->flag[b] = 0;
1593     }
1594 }
1595
1596 /* Sets up a grid and puts the atoms on the grid.
1597  * This function only operates on one domain of the domain decompostion.
1598  * Note that without domain decomposition there is only one domain.
1599  */
1600 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1601                        int ePBC,matrix box,
1602                        int dd_zone,
1603                        rvec corner0,rvec corner1,
1604                        int a0,int a1,
1605                        real atom_density,
1606                        const int *atinfo,
1607                        rvec *x,
1608                        int nmoved,int *move,
1609                        int nb_kernel_type,
1610                        nbnxn_atomdata_t *nbat)
1611 {
1612     nbnxn_grid_t *grid;
1613     int n;
1614     int nc_max_grid,nc_max;
1615
1616     grid = &nbs->grid[dd_zone];
1617
1618     nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1619
1620     grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1621
1622     grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
1623     grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
1624     grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
1625     grid->na_c_2log = get_2log(grid->na_c);
1626
1627     nbat->na_c = grid->na_c;
1628
1629     if (dd_zone == 0)
1630     {
1631         grid->cell0 = 0;
1632     }
1633     else
1634     {
1635         grid->cell0 =
1636             (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1637             nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1638     }
1639
1640     n = a1 - a0;
1641
1642     if (dd_zone == 0)
1643     {
1644         nbs->ePBC = ePBC;
1645         copy_mat(box,nbs->box);
1646
1647         if (atom_density >= 0)
1648         {
1649             grid->atom_density = atom_density;
1650         }
1651         else
1652         {
1653             grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
1654         }
1655
1656         grid->cell0 = 0;
1657
1658         nbs->natoms_local    = a1 - nmoved;
1659         /* We assume that nbnxn_put_on_grid is called first
1660          * for the local atoms (dd_zone=0).
1661          */
1662         nbs->natoms_nonlocal = a1 - nmoved;
1663     }
1664     else
1665     {
1666         nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
1667     }
1668
1669     nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
1670                                    nbs->grid[0].atom_density,
1671                                    nbat->XFormat);
1672
1673     nc_max = grid->cell0 + nc_max_grid;
1674
1675     if (a1 > nbs->cell_nalloc)
1676     {
1677         nbs->cell_nalloc = over_alloc_large(a1);
1678         srenew(nbs->cell,nbs->cell_nalloc);
1679     }
1680
1681     /* To avoid conditionals we store the moved particles at the end of a,
1682      * make sure we have enough space.
1683      */
1684     if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1685     {
1686         nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1687         srenew(nbs->a,nbs->a_nalloc);
1688     }
1689
1690     /* We need padding up to a multiple of the buffer flag size: simply add */
1691     if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1692     {
1693         nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
1694     }
1695
1696     calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);
1697
1698     if (dd_zone == 0)
1699     {
1700         nbat->natoms_local = nbat->natoms;
1701     }
1702
1703     nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1704 }
1705
1706 /* Calls nbnxn_put_on_grid for all non-local domains */
1707 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1708                                 const gmx_domdec_zones_t *zones,
1709                                 const int *atinfo,
1710                                 rvec *x,
1711                                 int nb_kernel_type,
1712                                 nbnxn_atomdata_t *nbat)
1713 {
1714     int  zone,d;
1715     rvec c0,c1;
1716
1717     for(zone=1; zone<zones->n; zone++)
1718     {
1719         for(d=0; d<DIM; d++)
1720         {
1721             c0[d] = zones->size[zone].bb_x0[d];
1722             c1[d] = zones->size[zone].bb_x1[d];
1723         }
1724
1725         nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
1726                           zone,c0,c1,
1727                           zones->cg_range[zone],
1728                           zones->cg_range[zone+1],
1729                           -1,
1730                           atinfo,
1731                           x,
1732                           0,NULL,
1733                           nb_kernel_type,
1734                           nbat);
1735     }
1736 }
1737
1738 /* Add simple grid type information to the local super/sub grid */
1739 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1740                            nbnxn_atomdata_t *nbat)
1741 {
1742     nbnxn_grid_t *grid;
1743     float *bbcz,*bb;
1744     int ncd,sc;
1745
1746     grid = &nbs->grid[0];
1747
1748     if (grid->bSimple)
1749     {
1750         gmx_incons("nbnxn_grid_simple called with a simple grid");
1751     }
1752
1753     ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1754
1755     if (grid->nc*ncd > grid->nc_nalloc_simple)
1756     {
1757         grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1758         srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
1759         srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
1760         srenew(grid->flags_simple,grid->nc_nalloc_simple);
1761         if (nbat->XFormat)
1762         {
1763             sfree_aligned(grid->bbj);
1764             snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
1765         }
1766     }
1767
1768     bbcz = grid->bbcz_simple;
1769     bb   = grid->bb_simple;
1770
1771 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
1772     for(sc=0; sc<grid->nc; sc++)
1773     {
1774         int c,tx,na;
1775
1776         for(c=0; c<ncd; c++)
1777         {
1778             tx = sc*ncd + c;
1779
1780             na = NBNXN_CPU_CLUSTER_I_SIZE;
1781             while (na > 0 &&
1782                    nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1783             {
1784                 na--;
1785             }
1786
1787             if (na > 0)
1788             {
1789                 switch (nbat->XFormat)
1790                 {
1791                 case nbatX4:
1792                     /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1793                     calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
1794                                            bb+tx*NNBSBB_B);
1795                     break;
1796                 case nbatX8:
1797                     /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1798                     calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1799                                            bb+tx*NNBSBB_B);
1800                     break;
1801                 default:
1802                     calc_bounding_box(na,nbat->xstride,
1803                                       nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1804                                       bb+tx*NNBSBB_B);
1805                     break;
1806                 }
1807                 bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
1808                 bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
1809
1810                 /* No interaction optimization yet here */
1811                 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1812             }
1813             else
1814             {
1815                 grid->flags_simple[tx] = 0;
1816             }
1817         }
1818     }
1819
1820 #ifdef NBNXN_SEARCH_SSE
1821     if (grid->bSimple && nbat->XFormat == nbatX8)
1822     {
1823         combine_bounding_box_pairs(grid,grid->bb_simple);
1824     }
1825 #endif
1826 }
1827
1828 void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
1829 {
1830     *ncx = nbs->grid[0].ncx;
1831     *ncy = nbs->grid[0].ncy;
1832 }
1833
1834 void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
1835 {
1836     const nbnxn_grid_t *grid;
1837
1838     grid = &nbs->grid[0];
1839
1840     /* Return the atom order for the home cell (index 0) */
1841     *a  = nbs->a;
1842
1843     *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1844 }
1845
1846 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1847 {
1848     nbnxn_grid_t *grid;
1849     int ao,cx,cy,cxy,cz,j;
1850
1851     /* Set the atom order for the home cell (index 0) */
1852     grid = &nbs->grid[0];
1853
1854     ao = 0;
1855     for(cx=0; cx<grid->ncx; cx++)
1856     {
1857         for(cy=0; cy<grid->ncy; cy++)
1858         {
1859             cxy = cx*grid->ncy + cy;
1860             j   = grid->cxy_ind[cxy]*grid->na_sc;
1861             for(cz=0; cz<grid->cxy_na[cxy]; cz++)
1862             {
1863                 nbs->a[j]     = ao;
1864                 nbs->cell[ao] = j;
1865                 ao++;
1866                 j++;
1867             }
1868         }
1869     }
1870 }
1871
1872 /* Determines the cell range along one dimension that
1873  * the bounding box b0 - b1 sees.
1874  */
1875 static void get_cell_range(real b0,real b1,
1876                            int nc,real c0,real s,real invs,
1877                            real d2,real r2,int *cf,int *cl)
1878 {
1879     *cf = max((int)((b0 - c0)*invs),0);
1880
1881     while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
1882     {
1883         (*cf)--;
1884     }
1885
1886     *cl = min((int)((b1 - c0)*invs),nc-1);
1887     while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
1888     {
1889         (*cl)++;
1890     }
1891 }
1892
1893 /* Reference code calculating the distance^2 between two bounding boxes */
1894 static float box_dist2(float bx0,float bx1,float by0,
1895                        float by1,float bz0,float bz1,
1896                        const float *bb)
1897 {
1898     float d2;
1899     float dl,dh,dm,dm0;
1900
1901     d2 = 0;
1902
1903     dl  = bx0 - bb[BBU_X];
1904     dh  = bb[BBL_X] - bx1;
1905     dm  = max(dl,dh);
1906     dm0 = max(dm,0);
1907     d2 += dm0*dm0;
1908
1909     dl  = by0 - bb[BBU_Y];
1910     dh  = bb[BBL_Y] - by1;
1911     dm  = max(dl,dh);
1912     dm0 = max(dm,0);
1913     d2 += dm0*dm0;
1914
1915     dl  = bz0 - bb[BBU_Z];
1916     dh  = bb[BBL_Z] - bz1;
1917     dm  = max(dl,dh);
1918     dm0 = max(dm,0);
1919     d2 += dm0*dm0;
1920
1921     return d2;
1922 }
1923
1924 /* Plain C code calculating the distance^2 between two bounding boxes */
1925 static float subc_bb_dist2(int si,const float *bb_i_ci,
1926                            int csj,const float *bb_j_all)
1927 {
1928     const float *bb_i,*bb_j;
1929     float d2;
1930     float dl,dh,dm,dm0;
1931
1932     bb_i = bb_i_ci  +  si*NNBSBB_B;
1933     bb_j = bb_j_all + csj*NNBSBB_B;
1934
1935     d2 = 0;
1936
1937     dl  = bb_i[BBL_X] - bb_j[BBU_X];
1938     dh  = bb_j[BBL_X] - bb_i[BBU_X];
1939     dm  = max(dl,dh);
1940     dm0 = max(dm,0);
1941     d2 += dm0*dm0;
1942
1943     dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
1944     dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
1945     dm  = max(dl,dh);
1946     dm0 = max(dm,0);
1947     d2 += dm0*dm0;
1948
1949     dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
1950     dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
1951     dm  = max(dl,dh);
1952     dm0 = max(dm,0);
1953     d2 += dm0*dm0;
1954
1955     return d2;
1956 }
1957
1958 #ifdef NBNXN_SEARCH_SSE
1959
1960 /* SSE code for bb distance for bb format xyz0 */
1961 static float subc_bb_dist2_sse(int na_c,
1962                               int si,const float *bb_i_ci,
1963                               int csj,const float *bb_j_all)
1964 {
1965     const float *bb_i,*bb_j;
1966
1967     __m128 bb_i_SSE0,bb_i_SSE1;
1968     __m128 bb_j_SSE0,bb_j_SSE1;
1969     __m128 dl_SSE;
1970     __m128 dh_SSE;
1971     __m128 dm_SSE;
1972     __m128 dm0_SSE;
1973     __m128 d2_SSE;
1974 #ifndef GMX_X86_SSE4_1
1975     float d2_array[7],*d2_align;
1976
1977     d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
1978 #else
1979     float d2;
1980 #endif
1981
1982     bb_i = bb_i_ci  +  si*NNBSBB_B;
1983     bb_j = bb_j_all + csj*NNBSBB_B;
1984
1985     bb_i_SSE0 = _mm_load_ps(bb_i);
1986     bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
1987     bb_j_SSE0 = _mm_load_ps(bb_j);
1988     bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
1989
1990     dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
1991     dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
1992
1993     dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
1994     dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
1995 #ifndef GMX_X86_SSE4_1
1996     d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
1997
1998     _mm_store_ps(d2_align,d2_SSE);
1999
2000     return d2_align[0] + d2_align[1] + d2_align[2];
2001 #else
2002     /* SSE4.1 dot product of components 0,1,2 */
2003     d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
2004
2005     _mm_store_ss(&d2,d2_SSE);
2006
2007     return d2;
2008 #endif
2009 }
2010
2011 /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
2012 #define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
2013 {                                                \
2014     int    shi;                                  \
2015                                                  \
2016     __m128 dx_0,dy_0,dz_0;                       \
2017     __m128 dx_1,dy_1,dz_1;                       \
2018                                                  \
2019     __m128 mx,my,mz;                             \
2020     __m128 m0x,m0y,m0z;                          \
2021                                                  \
2022     __m128 d2x,d2y,d2z;                          \
2023     __m128 d2s,d2t;                              \
2024                                                  \
2025     shi = si*NNBSBB_D*DIM;                       \
2026                                                  \
2027     xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
2028     yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
2029     zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
2030     xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
2031     yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
2032     zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
2033                                                  \
2034     dx_0 = _mm_sub_ps(xi_l,xj_h);                \
2035     dy_0 = _mm_sub_ps(yi_l,yj_h);                \
2036     dz_0 = _mm_sub_ps(zi_l,zj_h);                \
2037                                                  \
2038     dx_1 = _mm_sub_ps(xj_l,xi_h);                \
2039     dy_1 = _mm_sub_ps(yj_l,yi_h);                \
2040     dz_1 = _mm_sub_ps(zj_l,zi_h);                \
2041                                                  \
2042     mx   = _mm_max_ps(dx_0,dx_1);                \
2043     my   = _mm_max_ps(dy_0,dy_1);                \
2044     mz   = _mm_max_ps(dz_0,dz_1);                \
2045                                                  \
2046     m0x  = _mm_max_ps(mx,zero);                  \
2047     m0y  = _mm_max_ps(my,zero);                  \
2048     m0z  = _mm_max_ps(mz,zero);                  \
2049                                                  \
2050     d2x  = _mm_mul_ps(m0x,m0x);                  \
2051     d2y  = _mm_mul_ps(m0y,m0y);                  \
2052     d2z  = _mm_mul_ps(m0z,m0z);                  \
2053                                                  \
2054     d2s  = _mm_add_ps(d2x,d2y);                  \
2055     d2t  = _mm_add_ps(d2s,d2z);                  \
2056                                                  \
2057     _mm_store_ps(d2+si,d2t);                     \
2058 }
2059
2060 /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
2061 static void subc_bb_dist2_sse_xxxx(const float *bb_j,
2062                                    int nsi,const float *bb_i,
2063                                    float *d2)
2064 {
2065     __m128 xj_l,yj_l,zj_l;
2066     __m128 xj_h,yj_h,zj_h;
2067     __m128 xi_l,yi_l,zi_l;
2068     __m128 xi_h,yi_h,zi_h;
2069
2070     __m128 zero;
2071
2072     zero = _mm_setzero_ps();
2073
2074     xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
2075     yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
2076     zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
2077     xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
2078     yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
2079     zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
2080
2081     /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
2082      * But as we know the number of iterations is 1 or 2, we unroll manually.
2083      */
2084     SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
2085     if (STRIDE_8BB < nsi)
2086     {
2087         SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
2088     }
2089 }
2090
2091 #endif /* NBNXN_SEARCH_SSE */
2092
2093 /* Plain C function which determines if any atom pair between two cells
2094  * is within distance sqrt(rl2).
2095  */
2096 static gmx_bool subc_in_range_x(int na_c,
2097                                 int si,const real *x_i,
2098                                 int csj,int stride,const real *x_j,
2099                                 real rl2)
2100 {
2101     int  i,j,i0,j0;
2102     real d2;
2103
2104     for(i=0; i<na_c; i++)
2105     {
2106         i0 = (si*na_c + i)*DIM;
2107         for(j=0; j<na_c; j++)
2108         {
2109             j0 = (csj*na_c + j)*stride;
2110
2111             d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
2112                  sqr(x_i[i0+1] - x_j[j0+1]) +
2113                  sqr(x_i[i0+2] - x_j[j0+2]);
2114
2115             if (d2 < rl2)
2116             {
2117                 return TRUE;
2118             }
2119         }
2120     }
2121
2122     return FALSE;
2123 }
2124
2125 /* SSE function which determines if any atom pair between two cells,
2126  * both with 8 atoms, is within distance sqrt(rl2).
2127  */
2128 static gmx_bool subc_in_range_sse8(int na_c,
2129                                    int si,const real *x_i,
2130                                    int csj,int stride,const real *x_j,
2131                                    real rl2)
2132 {
2133 #ifdef NBNXN_SEARCH_SSE_SINGLE
2134     __m128 ix_SSE0,iy_SSE0,iz_SSE0;
2135     __m128 ix_SSE1,iy_SSE1,iz_SSE1;
2136
2137     __m128 rc2_SSE;
2138
2139     int na_c_sse;
2140     int j0,j1;
2141
2142     rc2_SSE   = _mm_set1_ps(rl2);
2143
2144     na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
2145     ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
2146     iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
2147     iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
2148     ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
2149     iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
2150     iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
2151
2152     /* We loop from the outer to the inner particles to maximize
2153      * the chance that we find a pair in range quickly and return.
2154      */
2155     j0 = csj*na_c;
2156     j1 = j0 + na_c - 1;
2157     while (j0 < j1)
2158     {
2159         __m128 jx0_SSE,jy0_SSE,jz0_SSE;
2160         __m128 jx1_SSE,jy1_SSE,jz1_SSE;
2161
2162         __m128 dx_SSE0,dy_SSE0,dz_SSE0;
2163         __m128 dx_SSE1,dy_SSE1,dz_SSE1;
2164         __m128 dx_SSE2,dy_SSE2,dz_SSE2;
2165         __m128 dx_SSE3,dy_SSE3,dz_SSE3;
2166
2167         __m128 rsq_SSE0;
2168         __m128 rsq_SSE1;
2169         __m128 rsq_SSE2;
2170         __m128 rsq_SSE3;
2171
2172         __m128 wco_SSE0;
2173         __m128 wco_SSE1;
2174         __m128 wco_SSE2;
2175         __m128 wco_SSE3;
2176         __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
2177
2178         jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
2179         jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
2180         jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
2181
2182         jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
2183         jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
2184         jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
2185
2186         /* Calculate distance */
2187         dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
2188         dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
2189         dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
2190         dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
2191         dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
2192         dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
2193         dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
2194         dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
2195         dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
2196         dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
2197         dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
2198         dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
2199
2200         /* rsq = dx*dx+dy*dy+dz*dz */
2201         rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2202         rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2203         rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2204         rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2205
2206         wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
2207         wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
2208         wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
2209         wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
2210
2211         wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
2212         wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
2213         wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
2214
2215         if (_mm_movemask_ps(wco_any_SSE))
2216         {
2217             return TRUE;
2218         }
2219
2220         j0++;
2221         j1--;
2222     }
2223     return FALSE;
2224
2225 #else
2226     /* No SSE */
2227     gmx_incons("SSE function called without SSE support");
2228
2229     return TRUE;
2230 #endif
2231 }
2232
2233 /* Returns the j sub-cell for index cj_ind */
2234 static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
2235 {
2236     return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
2237 }
2238
2239 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2240 static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
2241 {
2242     return nbl->cj4[cj_ind>>2].imei[0].imask;
2243 }
2244
2245 /* Ensures there is enough space for extra extra exclusion masks */
2246 static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
2247 {
2248     if (nbl->nexcl+extra > nbl->excl_nalloc)
2249     {
2250         nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
2251         nbnxn_realloc_void((void **)&nbl->excl,
2252                            nbl->nexcl*sizeof(*nbl->excl),
2253                            nbl->excl_nalloc*sizeof(*nbl->excl),
2254                            nbl->alloc,nbl->free);
2255     }
2256 }
2257
2258 /* Ensures there is enough space for ncell extra j-cells in the list */
2259 static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
2260                                             int ncell)
2261 {
2262     int cj_max;
2263
2264     cj_max = nbl->ncj + ncell;
2265
2266     if (cj_max > nbl->cj_nalloc)
2267     {
2268         nbl->cj_nalloc = over_alloc_small(cj_max);
2269         nbnxn_realloc_void((void **)&nbl->cj,
2270                            nbl->ncj*sizeof(*nbl->cj),
2271                            nbl->cj_nalloc*sizeof(*nbl->cj),
2272                            nbl->alloc,nbl->free);
2273     }
2274 }
2275
2276 /* Ensures there is enough space for ncell extra j-subcells in the list */
2277 static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
2278                                               int nsupercell)
2279 {
2280     int ncj4_max,j4,j,w,t;
2281
2282 #define NWARP       2
2283 #define WARP_SIZE  32
2284
2285     /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
2286     /* We can store 4 j-subcell - i-supercell pairs in one struct.
2287      * since we round down, we need one extra entry.
2288      */
2289     ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
2290
2291     if (ncj4_max > nbl->cj4_nalloc)
2292     {
2293         nbl->cj4_nalloc = over_alloc_small(ncj4_max);
2294         nbnxn_realloc_void((void **)&nbl->cj4,
2295                            nbl->work->cj4_init*sizeof(*nbl->cj4),
2296                            nbl->cj4_nalloc*sizeof(*nbl->cj4),
2297                            nbl->alloc,nbl->free);
2298     }
2299
2300     if (ncj4_max > nbl->work->cj4_init)
2301     {
2302         for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
2303         {
2304             /* No i-subcells and no excl's in the list initially */
2305             for(w=0; w<NWARP; w++)
2306             {
2307                 nbl->cj4[j4].imei[w].imask    = 0U;
2308                 nbl->cj4[j4].imei[w].excl_ind = 0;
2309
2310             }
2311         }
2312         nbl->work->cj4_init = ncj4_max;
2313     }
2314 }
2315
2316 /* Set all excl masks for one GPU warp no exclusions */
2317 static void set_no_excls(nbnxn_excl_t *excl)
2318 {
2319     int t;
2320
2321     for(t=0; t<WARP_SIZE; t++)
2322     {
2323         /* Turn all interaction bits on */
2324         excl->pair[t] = NBNXN_INT_MASK_ALL;
2325     }
2326 }
2327
2328 /* Initializes a single nbnxn_pairlist_t data structure */
2329 static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
2330                                 gmx_bool bSimple,
2331                                 nbnxn_alloc_t *alloc,
2332                                 nbnxn_free_t  *free)
2333 {
2334     if (alloc == NULL)
2335     {
2336         nbl->alloc = nbnxn_alloc_aligned;
2337     }
2338     else
2339     {
2340         nbl->alloc = alloc;
2341     }
2342     if (free == NULL)
2343     {
2344         nbl->free = nbnxn_free_aligned;
2345     }
2346     else
2347     {
2348         nbl->free = free;
2349     }
2350
2351     nbl->bSimple     = bSimple;
2352     nbl->na_sc       = 0;
2353     nbl->na_ci       = 0;
2354     nbl->na_cj       = 0;
2355     nbl->nci         = 0;
2356     nbl->ci          = NULL;
2357     nbl->ci_nalloc   = 0;
2358     nbl->ncj         = 0;
2359     nbl->cj          = NULL;
2360     nbl->cj_nalloc   = 0;
2361     nbl->ncj4        = 0;
2362     /* We need one element extra in sj, so alloc initially with 1 */
2363     nbl->cj4_nalloc  = 0;
2364     nbl->cj4         = NULL;
2365     nbl->nci_tot     = 0;
2366
2367     if (!nbl->bSimple)
2368     {
2369         nbl->excl        = NULL;
2370         nbl->excl_nalloc = 0;
2371         nbl->nexcl       = 0;
2372         check_excl_space(nbl,1);
2373         nbl->nexcl       = 1;
2374         set_no_excls(&nbl->excl[0]);
2375     }
2376
2377     snew(nbl->work,1);
2378 #ifdef NBNXN_BBXXXX
2379     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
2380 #else
2381     snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
2382 #endif
2383     snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
2384 #ifdef GMX_NBNXN_SIMD
2385     snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
2386     snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
2387 #endif
2388     snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
2389 }
2390
2391 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
2392                              gmx_bool bSimple, gmx_bool bCombined,
2393                              nbnxn_alloc_t *alloc,
2394                              nbnxn_free_t  *free)
2395 {
2396     int i;
2397
2398     nbl_list->bSimple   = bSimple;
2399     nbl_list->bCombined = bCombined;
2400
2401     nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
2402
2403     if (!nbl_list->bCombined &&
2404         nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
2405     {
2406         gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
2407                   nbl_list->nnbl,NBNXN_BUFFERFLAG_MAX_THREADS,NBNXN_BUFFERFLAG_MAX_THREADS);
2408     }
2409
2410     snew(nbl_list->nbl,nbl_list->nnbl);
2411     /* Execute in order to avoid memory interleaving between threads */
2412 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2413     for(i=0; i<nbl_list->nnbl; i++)
2414     {
2415         /* Allocate the nblist data structure locally on each thread
2416          * to optimize memory access for NUMA architectures.
2417          */
2418         snew(nbl_list->nbl[i],1);
2419
2420         /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2421         if (i == 0)
2422         {
2423             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
2424         }
2425         else
2426         {
2427             nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
2428         }
2429     }
2430 }
2431
2432 /* Print statistics of a pair list, used for debug output */
2433 static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
2434                                            const nbnxn_search_t nbs,real rl)
2435 {
2436     const nbnxn_grid_t *grid;
2437     int cs[SHIFTS];
2438     int s,i,j;
2439     int npexcl;
2440
2441     /* This code only produces correct statistics with domain decomposition */
2442     grid = &nbs->grid[0];
2443
2444     fprintf(fp,"nbl nci %d ncj %d\n",
2445             nbl->nci,nbl->ncj);
2446     fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2447             nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
2448             nbl->ncj/(double)grid->nc*grid->na_sc,
2449             nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
2450
2451     fprintf(fp,"nbl average j cell list length %.1f\n",
2452             0.25*nbl->ncj/(double)nbl->nci);
2453
2454     for(s=0; s<SHIFTS; s++)
2455     {
2456         cs[s] = 0;
2457     }
2458     npexcl = 0;
2459     for(i=0; i<nbl->nci; i++)
2460     {
2461         cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
2462             nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
2463
2464         j = nbl->ci[i].cj_ind_start;
2465         while (j < nbl->ci[i].cj_ind_end &&
2466                nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
2467         {
2468             npexcl++;
2469             j++;
2470         }
2471     }
2472     fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
2473             nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
2474     for(s=0; s<SHIFTS; s++)
2475     {
2476         if (cs[s] > 0)
2477         {
2478             fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
2479         }
2480     }
2481 }
2482
2483 /* Print statistics of a pair lists, used for debug output */
2484 static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
2485                                              const nbnxn_search_t nbs,real rl)
2486 {
2487     const nbnxn_grid_t *grid;
2488     int i,j4,j,si,b;
2489     int c[GPU_NSUBCELL+1];
2490
2491     /* This code only produces correct statistics with domain decomposition */
2492     grid = &nbs->grid[0];
2493
2494     fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2495             nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
2496     fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2497             nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
2498             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
2499             nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
2500
2501     fprintf(fp,"nbl average j super cell list length %.1f\n",
2502             0.25*nbl->ncj4/(double)nbl->nsci);
2503     fprintf(fp,"nbl average i sub cell list length %.1f\n",
2504             nbl->nci_tot/(0.25*nbl->ncj4));
2505
2506     for(si=0; si<=GPU_NSUBCELL; si++)
2507     {
2508         c[si] = 0;
2509     }
2510     for(i=0; i<nbl->nsci; i++)
2511     {
2512         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2513         {
2514             for(j=0; j<4; j++)
2515             {
2516                 b = 0;
2517                 for(si=0; si<GPU_NSUBCELL; si++)
2518                 {
2519                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
2520                     {
2521                         b++;
2522                     }
2523                 }
2524                 c[b]++;
2525             }
2526         }
2527     }
2528     for(b=0; b<=GPU_NSUBCELL; b++)
2529     {
2530         fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
2531                 b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
2532     }
2533 }
2534
2535 /* Print the full pair list, used for debug output */
2536 static void print_supersub_nsp(const char *fn,
2537                                const nbnxn_pairlist_t *nbl,
2538                                int iloc)
2539 {
2540     char buf[STRLEN];
2541     FILE *fp;
2542     int i,nsp,j4,p;
2543
2544     sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
2545     fp = ffopen(buf,"w");
2546
2547     for(i=0; i<nbl->nci; i++)
2548     {
2549         nsp = 0;
2550         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
2551         {
2552             for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
2553             {
2554                 nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
2555             }
2556         }
2557         fprintf(fp,"%4d %3d %3d\n",
2558                 i,
2559                 nsp,
2560                 nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
2561     }
2562
2563     fclose(fp);
2564 }
2565
2566 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
2567 static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
2568                                    int warp,nbnxn_excl_t **excl)
2569 {
2570     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2571     {
2572         /* No exclusions set, make a new list entry */
2573         nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
2574         nbl->nexcl++;
2575         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2576         set_no_excls(*excl);
2577     }
2578     else
2579     {
2580         /* We already have some exclusions, new ones can be added to the list */
2581         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2582     }
2583 }
2584
2585 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
2586  * allocates extra memory, if necessary.
2587  */
2588 static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
2589                                  int warp,nbnxn_excl_t **excl)
2590 {
2591     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2592     {
2593         /* We need to make a new list entry, check if we have space */
2594         check_excl_space(nbl,1);
2595     }
2596     low_get_nbl_exclusions(nbl,cj4,warp,excl);
2597 }
2598
2599 /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
2600  * allocates extra memory, if necessary.
2601  */
2602 static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
2603                                  nbnxn_excl_t **excl_w0,
2604                                  nbnxn_excl_t **excl_w1)
2605 {
2606     /* Check for space we might need */
2607     check_excl_space(nbl,2);
2608
2609     low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
2610     low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
2611 }
2612
2613 /* Sets the self exclusions i=j and pair exclusions i>j */
2614 static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
2615                                                int cj4_ind,int sj_offset,
2616                                                int si)
2617 {
2618     nbnxn_excl_t *excl[2];
2619     int  ei,ej,w;
2620
2621     /* Here we only set the set self and double pair exclusions */
2622
2623     get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
2624
2625     /* Only minor < major bits set */
2626     for(ej=0; ej<nbl->na_ci; ej++)
2627     {
2628         w = (ej>>2);
2629         for(ei=ej; ei<nbl->na_ci; ei++)
2630         {
2631             excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
2632                 ~(1U << (sj_offset*GPU_NSUBCELL+si));
2633         }
2634     }
2635 }
2636
2637 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
2638 static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
2639 {
2640     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2641 }
2642
2643 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
2644 static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
2645 {
2646 #ifndef GMX_DOUBLE /* cj-size = 4 */
2647     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2648 #else              /* cj-size = 2 */
2649     return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
2650             (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
2651              NBNXN_INT_MASK_ALL));
2652 #endif
2653 }
2654
2655 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
2656 static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
2657 {
2658 #ifndef GMX_DOUBLE /* cj-size = 8 */
2659     return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
2660             (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
2661              NBNXN_INT_MASK_ALL));
2662 #else              /* cj-size = 4 */
2663     return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
2664 #endif
2665 }
2666
2667 #ifdef GMX_NBNXN_SIMD
2668 #if GMX_NBNXN_SIMD_BITWIDTH == 128
2669 #define get_imask_x86_simd_4xn  get_imask_x86_simd128
2670 #else
2671 #if GMX_NBNXN_SIMD_BITWIDTH == 256
2672 #define get_imask_x86_simd_4xn  get_imask_x86_simd256
2673 #define get_imask_x86_simd_2xnn get_imask_x86_simd128
2674 #else
2675 #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
2676 #endif
2677 #endif
2678 #endif
2679
2680 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2681  * Checks bounding box distances and possibly atom pair distances.
2682  */
2683 static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2684                                      nbnxn_pairlist_t *nbl,
2685                                      int ci,int cjf,int cjl,
2686                                      gmx_bool remove_sub_diag,
2687                                      const real *x_j,
2688                                      real rl2,float rbb2,
2689                                      int *ndistc)
2690 {
2691     const nbnxn_list_work_t *work;
2692
2693     const float *bb_ci;
2694     const real  *x_ci;
2695
2696     gmx_bool   InRange;
2697     real       d2;
2698     int        cjf_gl,cjl_gl,cj;
2699
2700     work = nbl->work;
2701
2702     bb_ci = nbl->work->bb_ci;
2703     x_ci  = nbl->work->x_ci;
2704
2705     InRange = FALSE;
2706     while (!InRange && cjf <= cjl)
2707     {
2708         d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
2709         *ndistc += 2;
2710
2711         /* Check if the distance is within the distance where
2712          * we use only the bounding box distance rbb,
2713          * or within the cut-off and there is at least one atom pair
2714          * within the cut-off.
2715          */
2716         if (d2 < rbb2)
2717         {
2718             InRange = TRUE;
2719         }
2720         else if (d2 < rl2)
2721         {
2722             int i,j;
2723
2724             cjf_gl = gridj->cell0 + cjf;
2725             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2726             {
2727                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2728                 {
2729                     InRange = InRange ||
2730                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2731                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2732                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2733                 }
2734             }
2735             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2736         }
2737         if (!InRange)
2738         {
2739             cjf++;
2740         }
2741     }
2742     if (!InRange)
2743     {
2744         return;
2745     }
2746
2747     InRange = FALSE;
2748     while (!InRange && cjl > cjf)
2749     {
2750         d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
2751         *ndistc += 2;
2752
2753         /* Check if the distance is within the distance where
2754          * we use only the bounding box distance rbb,
2755          * or within the cut-off and there is at least one atom pair
2756          * within the cut-off.
2757          */
2758         if (d2 < rbb2)
2759         {
2760             InRange = TRUE;
2761         }
2762         else if (d2 < rl2)
2763         {
2764             int i,j;
2765
2766             cjl_gl = gridj->cell0 + cjl;
2767             for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
2768             {
2769                 for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++)
2770                 {
2771                     InRange = InRange ||
2772                         (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
2773                          sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
2774                          sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
2775                 }
2776             }
2777             *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
2778         }
2779         if (!InRange)
2780         {
2781             cjl--;
2782         }
2783     }
2784
2785     if (cjf <= cjl)
2786     {
2787         for(cj=cjf; cj<=cjl; cj++)
2788         {
2789             /* Store cj and the interaction mask */
2790             nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
2791             nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
2792             nbl->ncj++;
2793         }
2794         /* Increase the closing index in i super-cell list */
2795         nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
2796     }
2797 }
2798
2799 #ifdef GMX_NBNXN_SIMD_4XN
2800 #include "nbnxn_search_simd_4xn.h"
2801 #endif
2802 #ifdef GMX_NBNXN_SIMD_2XNN
2803 #include "nbnxn_search_simd_2xnn.h"
2804 #endif
2805
2806 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
2807  * Checks bounding box distances and possibly atom pair distances.
2808  */
2809 static void make_cluster_list_supersub(const nbnxn_search_t nbs,
2810                                        const nbnxn_grid_t *gridi,
2811                                        const nbnxn_grid_t *gridj,
2812                                        nbnxn_pairlist_t *nbl,
2813                                        int sci,int scj,
2814                                        gmx_bool sci_equals_scj,
2815                                        int stride,const real *x,
2816                                        real rl2,float rbb2,
2817                                        int *ndistc)
2818 {
2819     int  na_c;
2820     int  npair;
2821     int  cjo,ci1,ci,cj,cj_gl;
2822     int  cj4_ind,cj_offset;
2823     unsigned imask;
2824     nbnxn_cj4_t *cj4;
2825     const float *bb_ci;
2826     const real *x_ci;
2827     float *d2l,d2;
2828     int  w;
2829 #define PRUNE_LIST_CPU_ONE
2830 #ifdef PRUNE_LIST_CPU_ONE
2831     int  ci_last=-1;
2832 #endif
2833
2834     d2l = nbl->work->d2;
2835
2836     bb_ci = nbl->work->bb_ci;
2837     x_ci  = nbl->work->x_ci;
2838
2839     na_c = gridj->na_c;
2840
2841     for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
2842     {
2843         cj4_ind   = (nbl->work->cj_ind >> 2);
2844         cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
2845         cj4       = &nbl->cj4[cj4_ind];
2846
2847         cj = scj*GPU_NSUBCELL + cjo;
2848
2849         cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
2850
2851         /* Initialize this j-subcell i-subcell list */
2852         cj4->cj[cj_offset] = cj_gl;
2853         imask              = 0;
2854
2855         if (sci_equals_scj)
2856         {
2857             ci1 = cjo + 1;
2858         }
2859         else
2860         {
2861             ci1 = gridi->nsubc[sci];
2862         }
2863
2864 #ifdef NBNXN_BBXXXX
2865         /* Determine all ci1 bb distances in one call with SSE */
2866         subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
2867                                ci1,bb_ci,d2l);
2868         *ndistc += na_c*2;
2869 #endif
2870
2871         npair = 0;
2872         /* We use a fixed upper-bound instead of ci1 to help optimization */
2873         for(ci=0; ci<GPU_NSUBCELL; ci++)
2874         {
2875             if (ci == ci1)
2876             {
2877                 break;
2878             }
2879
2880 #ifndef NBNXN_BBXXXX
2881             /* Determine the bb distance between ci and cj */
2882             d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
2883             *ndistc += 2;
2884 #endif
2885             d2 = d2l[ci];
2886
2887 #ifdef PRUNE_LIST_CPU_ALL
2888             /* Check if the distance is within the distance where
2889              * we use only the bounding box distance rbb,
2890              * or within the cut-off and there is at least one atom pair
2891              * within the cut-off. This check is very costly.
2892              */
2893             *ndistc += na_c*na_c;
2894             if (d2 < rbb2 ||
2895                 (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
2896 #else
2897             /* Check if the distance between the two bounding boxes
2898              * in within the pair-list cut-off.
2899              */
2900             if (d2 < rl2)
2901 #endif
2902             {
2903                 /* Flag this i-subcell to be taken into account */
2904                 imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
2905
2906 #ifdef PRUNE_LIST_CPU_ONE
2907                 ci_last = ci;
2908 #endif
2909
2910                 npair++;
2911             }
2912         }
2913
2914 #ifdef PRUNE_LIST_CPU_ONE
2915         /* If we only found 1 pair, check if any atoms are actually
2916          * within the cut-off, so we could get rid of it.
2917          */
2918         if (npair == 1 && d2l[ci_last] >= rbb2)
2919         {
2920             /* Avoid using function pointers here, as it's slower */
2921             if (
2922 #ifdef NBNXN_8BB_SSE
2923                 !subc_in_range_sse8
2924 #else
2925                 !subc_in_range_x
2926 #endif
2927                                 (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
2928             {
2929                 imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
2930                 npair--;
2931             }
2932         }
2933 #endif
2934
2935         if (npair > 0)
2936         {
2937             /* We have a useful sj entry, close it now */
2938
2939             /* Set the exclucions for the ci== sj entry.
2940              * Here we don't bother to check if this entry is actually flagged,
2941              * as it will nearly always be in the list.
2942              */
2943             if (sci_equals_scj)
2944             {
2945                 set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
2946             }
2947
2948             /* Copy the cluster interaction mask to the list */
2949             for(w=0; w<NWARP; w++)
2950             {
2951                 cj4->imei[w].imask |= imask;
2952             }
2953
2954             nbl->work->cj_ind++;
2955
2956             /* Keep the count */
2957             nbl->nci_tot += npair;
2958
2959             /* Increase the closing index in i super-cell list */
2960             nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
2961         }
2962     }
2963 }
2964
2965 /* Set all atom-pair exclusions from the topology stored in excl
2966  * as masks in the pair-list for simple list i-entry nbl_ci
2967  */
2968 static void set_ci_top_excls(const nbnxn_search_t nbs,
2969                              nbnxn_pairlist_t *nbl,
2970                              gmx_bool diagRemoved,
2971                              int na_ci_2log,
2972                              int na_cj_2log,
2973                              const nbnxn_ci_t *nbl_ci,
2974                              const t_blocka *excl)
2975 {
2976     const int *cell;
2977     int ci;
2978     int cj_ind_first,cj_ind_last;
2979     int cj_first,cj_last;
2980     int ndirect;
2981     int i,ai,aj,si,eind,ge,se;
2982     int found,cj_ind_0,cj_ind_1,cj_ind_m;
2983     int cj_m;
2984     gmx_bool Found_si;
2985     int si_ind;
2986     nbnxn_excl_t *nbl_excl;
2987     int inner_i,inner_e;
2988
2989     cell = nbs->cell;
2990
2991     if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
2992     {
2993         /* Empty list */
2994         return;
2995     }
2996
2997     ci = nbl_ci->ci;
2998
2999     cj_ind_first = nbl_ci->cj_ind_start;
3000     cj_ind_last  = nbl->ncj - 1;
3001
3002     cj_first = nbl->cj[cj_ind_first].cj;
3003     cj_last  = nbl->cj[cj_ind_last].cj;
3004
3005     /* Determine how many contiguous j-cells we have starting
3006      * from the first i-cell. This number can be used to directly
3007      * calculate j-cell indices for excluded atoms.
3008      */
3009     ndirect = 0;
3010     if (na_ci_2log == na_cj_2log)
3011     {
3012         while (cj_ind_first + ndirect <= cj_ind_last &&
3013                nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
3014         {
3015             ndirect++;
3016         }
3017     }
3018 #ifdef NBNXN_SEARCH_SSE
3019     else
3020     {
3021         while (cj_ind_first + ndirect <= cj_ind_last &&
3022                nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
3023         {
3024             ndirect++;
3025         }
3026     }
3027 #endif
3028
3029     /* Loop over the atoms in the i super-cell */
3030     for(i=0; i<nbl->na_sc; i++)
3031     {
3032         ai = nbs->a[ci*nbl->na_sc+i];
3033         if (ai >= 0)
3034         {
3035             si  = (i>>na_ci_2log);
3036
3037             /* Loop over the topology-based exclusions for this i-atom */
3038             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3039             {
3040                 aj = excl->a[eind];
3041
3042                 if (aj == ai)
3043                 {
3044                     /* The self exclusion are already set, save some time */
3045                     continue;
3046                 }
3047
3048                 ge = cell[aj];
3049
3050                 /* Without shifts we only calculate interactions j>i
3051                  * for one-way pair-lists.
3052                  */
3053                 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3054                 {
3055                     continue;
3056                 }
3057
3058                 se = (ge >> na_cj_2log);
3059
3060                 /* Could the cluster se be in our list? */
3061                 if (se >= cj_first && se <= cj_last)
3062                 {
3063                     if (se < cj_first + ndirect)
3064                     {
3065                         /* We can calculate cj_ind directly from se */
3066                         found = cj_ind_first + se - cj_first;
3067                     }
3068                     else
3069                     {
3070                         /* Search for se using bisection */
3071                         found = -1;
3072                         cj_ind_0 = cj_ind_first + ndirect;
3073                         cj_ind_1 = cj_ind_last + 1;
3074                         while (found == -1 && cj_ind_0 < cj_ind_1)
3075                         {
3076                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3077
3078                             cj_m = nbl->cj[cj_ind_m].cj;
3079
3080                             if (se == cj_m)
3081                             {
3082                                 found = cj_ind_m;
3083                             }
3084                             else if (se < cj_m)
3085                             {
3086                                 cj_ind_1 = cj_ind_m;
3087                             }
3088                             else
3089                             {
3090                                 cj_ind_0 = cj_ind_m + 1;
3091                             }
3092                         }
3093                     }
3094
3095                     if (found >= 0)
3096                     {
3097                         inner_i = i  - (si << na_ci_2log);
3098                         inner_e = ge - (se << na_cj_2log);
3099
3100                         nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
3101                     }
3102                 }
3103             }
3104         }
3105     }
3106 }
3107
3108 /* Set all atom-pair exclusions from the topology stored in excl
3109  * as masks in the pair-list for i-super-cell entry nbl_sci
3110  */
3111 static void set_sci_top_excls(const nbnxn_search_t nbs,
3112                               nbnxn_pairlist_t *nbl,
3113                               gmx_bool diagRemoved,
3114                               int na_c_2log,
3115                               const nbnxn_sci_t *nbl_sci,
3116                               const t_blocka *excl)
3117 {
3118     const int *cell;
3119     int na_c;
3120     int sci;
3121     int cj_ind_first,cj_ind_last;
3122     int cj_first,cj_last;
3123     int ndirect;
3124     int i,ai,aj,si,eind,ge,se;
3125     int found,cj_ind_0,cj_ind_1,cj_ind_m;
3126     int cj_m;
3127     gmx_bool Found_si;
3128     int si_ind;
3129     nbnxn_excl_t *nbl_excl;
3130     int inner_i,inner_e,w;
3131
3132     cell = nbs->cell;
3133
3134     na_c = nbl->na_ci;
3135
3136     if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3137     {
3138         /* Empty list */
3139         return;
3140     }
3141
3142     sci = nbl_sci->sci;
3143
3144     cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3145     cj_ind_last  = nbl->work->cj_ind - 1;
3146
3147     cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3148     cj_last  = nbl_cj(nbl,cj_ind_last);
3149
3150     /* Determine how many contiguous j-clusters we have starting
3151      * from the first i-cluster. This number can be used to directly
3152      * calculate j-cluster indices for excluded atoms.
3153      */
3154     ndirect = 0;
3155     while (cj_ind_first + ndirect <= cj_ind_last &&
3156            nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3157     {
3158         ndirect++;
3159     }
3160
3161     /* Loop over the atoms in the i super-cell */
3162     for(i=0; i<nbl->na_sc; i++)
3163     {
3164         ai = nbs->a[sci*nbl->na_sc+i];
3165         if (ai >= 0)
3166         {
3167             si  = (i>>na_c_2log);
3168
3169             /* Loop over the topology-based exclusions for this i-atom */
3170             for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
3171             {
3172                 aj = excl->a[eind];
3173
3174                 if (aj == ai)
3175                 {
3176                     /* The self exclusion are already set, save some time */
3177                     continue;
3178                 }
3179
3180                 ge = cell[aj];
3181
3182                 /* Without shifts we only calculate interactions j>i
3183                  * for one-way pair-lists.
3184                  */
3185                 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3186                 {
3187                     continue;
3188                 }
3189
3190                 se = ge>>na_c_2log;
3191                 /* Could the cluster se be in our list? */
3192                 if (se >= cj_first && se <= cj_last)
3193                 {
3194                     if (se < cj_first + ndirect)
3195                     {
3196                         /* We can calculate cj_ind directly from se */
3197                         found = cj_ind_first + se - cj_first;
3198                     }
3199                     else
3200                     {
3201                         /* Search for se using bisection */
3202                         found = -1;
3203                         cj_ind_0 = cj_ind_first + ndirect;
3204                         cj_ind_1 = cj_ind_last + 1;
3205                         while (found == -1 && cj_ind_0 < cj_ind_1)
3206                         {
3207                             cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3208
3209                             cj_m = nbl_cj(nbl,cj_ind_m);
3210
3211                             if (se == cj_m)
3212                             {
3213                                 found = cj_ind_m;
3214                             }
3215                             else if (se < cj_m)
3216                             {
3217                                 cj_ind_1 = cj_ind_m;
3218                             }
3219                             else
3220                             {
3221                                 cj_ind_0 = cj_ind_m + 1;
3222                             }
3223                         }
3224                     }
3225
3226                     if (found >= 0)
3227                     {
3228                         inner_i = i  - si*na_c;
3229                         inner_e = ge - se*na_c;
3230
3231 /* Macro for getting the index of atom a within a cluster */
3232 #define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3233 /* Macro for converting an atom number to a cluster number */
3234 #define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3235
3236                         if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
3237                         {
3238                             w       = (inner_e >> 2);
3239
3240                             get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
3241
3242                             nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
3243                                 ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
3244                         }
3245
3246 #undef AMODI
3247 #undef A2CI
3248                     }
3249                 }
3250             }
3251         }
3252     }
3253 }
3254
3255 /* Reallocate the simple ci list for at least n entries */
3256 static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
3257 {
3258     nbl->ci_nalloc = over_alloc_small(n);
3259     nbnxn_realloc_void((void **)&nbl->ci,
3260                        nbl->nci*sizeof(*nbl->ci),
3261                        nbl->ci_nalloc*sizeof(*nbl->ci),
3262                        nbl->alloc,nbl->free);
3263 }
3264
3265 /* Reallocate the super-cell sci list for at least n entries */
3266 static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
3267 {
3268     nbl->sci_nalloc = over_alloc_small(n);
3269     nbnxn_realloc_void((void **)&nbl->sci,
3270                        nbl->nsci*sizeof(*nbl->sci),
3271                        nbl->sci_nalloc*sizeof(*nbl->sci),
3272                        nbl->alloc,nbl->free);
3273 }
3274
3275 /* Make a new ci entry at index nbl->nci */
3276 static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
3277                          nbnxn_list_work_t *work)
3278 {
3279     if (nbl->nci + 1 > nbl->ci_nalloc)
3280     {
3281         nb_realloc_ci(nbl,nbl->nci+1);
3282     }
3283     nbl->ci[nbl->nci].ci            = ci;
3284     nbl->ci[nbl->nci].shift         = shift;
3285     /* Store the interaction flags along with the shift */
3286     nbl->ci[nbl->nci].shift        |= flags;
3287     nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
3288     nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
3289 }
3290
3291 /* Make a new sci entry at index nbl->nsci */
3292 static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
3293                           nbnxn_list_work_t *work)
3294 {
3295     if (nbl->nsci + 1 > nbl->sci_nalloc)
3296     {
3297         nb_realloc_sci(nbl,nbl->nsci+1);
3298     }
3299     nbl->sci[nbl->nsci].sci           = sci;
3300     nbl->sci[nbl->nsci].shift         = shift;
3301     nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3302     nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
3303 }
3304
3305 /* Sort the simple j-list cj on exclusions.
3306  * Entries with exclusions will all be sorted to the beginning of the list.
3307  */
3308 static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
3309                          nbnxn_list_work_t *work)
3310 {
3311     int jnew,j;
3312
3313     if (ncj > work->cj_nalloc)
3314     {
3315         work->cj_nalloc = over_alloc_large(ncj);
3316         srenew(work->cj,work->cj_nalloc);
3317     }
3318
3319     /* Make a list of the j-cells involving exclusions */
3320     jnew = 0;
3321     for(j=0; j<ncj; j++)
3322     {
3323         if (cj[j].excl != NBNXN_INT_MASK_ALL)
3324         {
3325             work->cj[jnew++] = cj[j];
3326         }
3327     }
3328     /* Check if there are exclusions at all or not just the first entry */
3329     if (!((jnew == 0) ||
3330           (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
3331     {
3332         for(j=0; j<ncj; j++)
3333         {
3334             if (cj[j].excl == NBNXN_INT_MASK_ALL)
3335             {
3336                 work->cj[jnew++] = cj[j];
3337             }
3338         }
3339         for(j=0; j<ncj; j++)
3340         {
3341             cj[j] = work->cj[j];
3342         }
3343     }
3344 }
3345
3346 /* Close this simple list i entry */
3347 static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3348 {
3349     int jlen;
3350
3351     /* All content of the new ci entry have already been filled correctly,
3352      * we only need to increase the count here (for non empty lists).
3353      */
3354     jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3355     if (jlen > 0)
3356     {
3357         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
3358
3359         if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
3360         {
3361             nbl->work->ncj_hlj += jlen;
3362         }
3363         else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3364         {
3365             nbl->work->ncj_noq += jlen;
3366         }
3367
3368         nbl->nci++;
3369     }
3370 }
3371
3372 /* Split sci entry for load balancing on the GPU.
3373  * As we only now the current count on our own thread,
3374  * we will need to estimate the current total amount of i-entries.
3375  * As the lists get concatenated later, this estimate depends
3376  * both on nthread and our own thread index thread.
3377  */
3378 static void split_sci_entry(nbnxn_pairlist_t *nbl,
3379                             int nsp_max_av,gmx_bool progBal,int nc_bal,
3380                             int thread,int nthread)
3381 {
3382     int nsci_est;
3383     int nsp_max;
3384     int cj4_start,cj4_end,j4len,cj4;
3385     int sci;
3386     int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
3387     int p;
3388
3389     /* Estimate the total numbers of ci's of the nblist combined
3390      * over all threads using the target number of ci's.
3391      */
3392     nsci_est = nc_bal*thread/nthread + nbl->nsci;
3393     if (progBal)
3394     {
3395         /* The first ci blocks should be larger, to avoid overhead.
3396          * The last ci blocks should be smaller, to improve load balancing.
3397          */
3398         nsp_max = max(1,
3399                       nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3400     }
3401     else
3402     {
3403         nsp_max = nsp_max_av;
3404     }
3405
3406     cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3407     cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
3408     j4len = cj4_end - cj4_start;
3409
3410     if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3411     {
3412         /* Remove the last ci entry and process the cj4's again */
3413         nbl->nsci -= 1;
3414
3415         sci        = nbl->nsci;
3416         cj4        = cj4_start;
3417         nsp        = 0;
3418         nsp_sci    = 0;
3419         nsp_cj4_e  = 0;
3420         nsp_cj4    = 0;
3421         while (cj4 < cj4_end)
3422         {
3423             nsp_cj4_p = nsp_cj4;
3424             nsp_cj4   = 0;
3425             for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3426             {
3427                 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3428             }
3429             nsp += nsp_cj4;
3430
3431             if (nsp > nsp_max && nsp > nsp_cj4)
3432             {
3433                 nbl->sci[sci].cj4_ind_end = cj4;
3434                 sci++;
3435                 nbl->nsci++;
3436                 if (nbl->nsci+1 > nbl->sci_nalloc)
3437                 {
3438                     nb_realloc_sci(nbl,nbl->nsci+1);
3439                 }
3440                 nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
3441                 nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
3442                 nbl->sci[sci].cj4_ind_start = cj4;
3443                 nsp_sci   = nsp - nsp_cj4;
3444                 nsp_cj4_e = nsp_cj4_p;
3445                 nsp       = nsp_cj4;
3446             }
3447
3448             cj4++;
3449         }
3450
3451         /* Put the remaining cj4's in a new ci entry */
3452         nbl->sci[sci].cj4_ind_end = cj4_end;
3453
3454         /* Possibly balance out the last two ci's
3455          * by moving the last cj4 of the second last ci.
3456          */
3457         if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3458         {
3459             nbl->sci[sci-1].cj4_ind_end--;
3460             nbl->sci[sci].cj4_ind_start--;
3461         }
3462
3463         sci++;
3464         nbl->nsci++;
3465     }
3466 }
3467
3468 /* Clost this super/sub list i entry */
3469 static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
3470                                     int nsp_max_av,
3471                                     gmx_bool progBal,int nc_bal,
3472                                     int thread,int nthread)
3473 {
3474     int j4len,tlen;
3475     int nb,b;
3476
3477     /* All content of the new ci entry have already been filled correctly,
3478      * we only need to increase the count here (for non empty lists).
3479      */
3480     j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
3481     if (j4len > 0)
3482     {
3483         /* We can only have complete blocks of 4 j-entries in a list,
3484          * so round the count up before closing.
3485          */
3486         nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
3487         nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3488
3489         nbl->nsci++;
3490
3491         if (nsp_max_av > 0)
3492         {
3493             split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
3494         }
3495     }
3496 }
3497
3498 /* Syncs the working array before adding another grid pair to the list */
3499 static void sync_work(nbnxn_pairlist_t *nbl)
3500 {
3501     if (!nbl->bSimple)
3502     {
3503         nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
3504         nbl->work->cj4_init = nbl->ncj4;
3505     }
3506 }
3507
3508 /* Clears an nbnxn_pairlist_t data structure */
3509 static void clear_pairlist(nbnxn_pairlist_t *nbl)
3510 {
3511     nbl->nci           = 0;
3512     nbl->nsci          = 0;
3513     nbl->ncj           = 0;
3514     nbl->ncj4          = 0;
3515     nbl->nci_tot       = 0;
3516     nbl->nexcl         = 1;
3517
3518     nbl->work->ncj_noq = 0;
3519     nbl->work->ncj_hlj = 0;
3520 }
3521
3522 /* Sets a simple list i-cell bounding box, including PBC shift */
3523 static void set_icell_bb_simple(const float *bb,int ci,
3524                                 real shx,real shy,real shz,
3525                                 float *bb_ci)
3526 {
3527     int ia;
3528
3529     ia = ci*NNBSBB_B;
3530     bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
3531     bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
3532     bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
3533     bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
3534     bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
3535     bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
3536 }
3537
3538 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
3539 static void set_icell_bb_supersub(const float *bb,int ci,
3540                                   real shx,real shy,real shz,
3541                                   float *bb_ci)
3542 {
3543     int ia,m,i;
3544
3545 #ifdef NBNXN_BBXXXX
3546     ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
3547     for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
3548     {
3549         for(i=0; i<STRIDE_8BB; i++)
3550         {
3551             bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
3552             bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
3553             bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
3554             bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
3555             bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
3556             bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
3557         }
3558     }
3559 #else
3560     ia = ci*GPU_NSUBCELL*NNBSBB_B;
3561     for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
3562     {
3563         bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
3564         bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
3565         bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
3566         bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
3567         bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
3568         bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
3569     }
3570 #endif
3571 }
3572
3573 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
3574 static void icell_set_x_simple(int ci,
3575                                real shx,real shy,real shz,
3576                                int na_c,
3577                                int stride,const real *x,
3578                                nbnxn_list_work_t *work)
3579 {
3580     int  ia,i;
3581
3582     ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
3583
3584     for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
3585     {
3586         work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
3587         work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
3588         work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
3589     }
3590 }
3591
3592 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
3593 static void icell_set_x_supersub(int ci,
3594                                  real shx,real shy,real shz,
3595                                  int na_c,
3596                                  int stride,const real *x,
3597                                  nbnxn_list_work_t *work)
3598 {
3599     int  ia,i;
3600     real *x_ci;
3601
3602     x_ci = work->x_ci;
3603
3604     ia = ci*GPU_NSUBCELL*na_c;
3605     for(i=0; i<GPU_NSUBCELL*na_c; i++)
3606     {
3607         x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
3608         x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
3609         x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
3610     }
3611 }
3612
3613 #ifdef NBNXN_SEARCH_SSE
3614 /* Copies PBC shifted super-cell packed atom coordinates to working array */
3615 static void icell_set_x_supersub_sse8(int ci,
3616                                       real shx,real shy,real shz,
3617                                       int na_c,
3618                                       int stride,const real *x,
3619                                       nbnxn_list_work_t *work)
3620 {
3621     int  si,io,ia,i,j;
3622     real *x_ci;
3623
3624     x_ci = work->x_ci;
3625
3626     for(si=0; si<GPU_NSUBCELL; si++)
3627     {
3628         for(i=0; i<na_c; i+=STRIDE_8BB)
3629         {
3630             io = si*na_c + i;
3631             ia = ci*GPU_NSUBCELL*na_c + io;
3632             for(j=0; j<STRIDE_8BB; j++)
3633             {
3634                 x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
3635                 x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
3636                 x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
3637             }
3638         }
3639     }
3640 }
3641 #endif
3642
3643 static real nbnxn_rlist_inc_nonloc_fac = 0.6;
3644
3645 /* Due to the cluster size the effective pair-list is longer than
3646  * that of a simple atom pair-list. This function gives the extra distance.
3647  */
3648 real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
3649 {
3650     return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
3651 }
3652
3653 /* Estimates the interaction volume^2 for non-local interactions */
3654 static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
3655 {
3656     int  z,d;
3657     real cl,ca,za;
3658     real vold_est;
3659     real vol2_est_tot;
3660
3661     vol2_est_tot = 0;
3662
3663     /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
3664      * not home interaction volume^2. As these volumes are not additive,
3665      * this is an overestimate, but it would only be significant in the limit
3666      * of small cells, where we anyhow need to split the lists into
3667      * as small parts as possible.
3668      */
3669
3670     for(z=0; z<zones->n; z++)
3671     {
3672         if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
3673         {
3674             cl = 0;
3675             ca = 1;
3676             za = 1;
3677             for(d=0; d<DIM; d++)
3678             {
3679                 if (zones->shift[z][d] == 0)
3680                 {
3681                     cl += 0.5*ls[d];
3682                     ca *= ls[d];
3683                     za *= zones->size[z].x1[d] - zones->size[z].x0[d];
3684                 }
3685             }
3686
3687             /* 4 octants of a sphere */
3688             vold_est  = 0.25*M_PI*r*r*r*r;
3689             /* 4 quarter pie slices on the edges */
3690             vold_est += 4*cl*M_PI/6.0*r*r*r;
3691             /* One rectangular volume on a face */
3692             vold_est += ca*0.5*r*r;
3693
3694             vol2_est_tot += vold_est*za;
3695         }
3696     }
3697
3698     return vol2_est_tot;
3699 }
3700
3701 /* Estimates the average size of a full j-list for super/sub setup */
3702 static int get_nsubpair_max(const nbnxn_search_t nbs,
3703                             int iloc,
3704                             real rlist,
3705                             int min_ci_balanced)
3706 {
3707     const nbnxn_grid_t *grid;
3708     rvec ls;
3709     real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
3710     int  nsubpair_max;
3711
3712     grid = &nbs->grid[0];
3713
3714     ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
3715     ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
3716     ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
3717
3718     /* The average squared length of the diagonal of a sub cell */
3719     xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
3720
3721     /* The formulas below are a heuristic estimate of the average nsj per si*/
3722     r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
3723
3724     if (!nbs->DomDec || nbs->zones->n == 1)
3725     {
3726         nsp_est_nl = 0;
3727     }
3728     else
3729     {
3730         nsp_est_nl =
3731             sqr(grid->atom_density/grid->na_c)*
3732             nonlocal_vol2(nbs->zones,ls,r_eff_sup);
3733     }
3734
3735     if (LOCAL_I(iloc))
3736     {
3737         /* Sub-cell interacts with itself */
3738         vol_est  = ls[XX]*ls[YY]*ls[ZZ];
3739         /* 6/2 rectangular volume on the faces */
3740         vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
3741         /* 12/2 quarter pie slices on the edges */
3742         vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
3743         /* 4 octants of a sphere */
3744         vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
3745
3746         nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
3747
3748         /* Subtract the non-local pair count */
3749         nsp_est -= nsp_est_nl;
3750
3751         if (debug)
3752         {
3753             fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
3754                     nsp_est,nsp_est_nl);
3755         }
3756     }
3757     else
3758     {
3759         nsp_est = nsp_est_nl;
3760     }
3761
3762     if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
3763     {
3764         /* We don't need to worry */
3765         nsubpair_max = -1;
3766     }
3767     else
3768     {
3769         /* Thus the (average) maximum j-list size should be as follows */
3770         nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
3771
3772         /* Since the target value is a maximum (this avoid high outliers,
3773          * which lead to load imbalance), not average, we get more lists
3774          * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
3775          * But more importantly, the optimal GPU performance moves
3776          * to lower number of block for very small blocks.
3777          * To compensate we add the maximum pair count per cj4.
3778          */
3779         nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
3780     }
3781
3782     if (debug)
3783     {
3784         fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
3785                 nsp_est,nsubpair_max);
3786     }
3787
3788     return nsubpair_max;
3789 }
3790
3791 /* Debug list print function */
3792 static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3793 {
3794     int i,j;
3795
3796     for(i=0; i<nbl->nci; i++)
3797     {
3798         fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
3799                 nbl->ci[i].ci,nbl->ci[i].shift,
3800                 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
3801
3802         for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
3803         {
3804             fprintf(fp,"  cj %5d  imask %x\n",
3805                     nbl->cj[j].cj,
3806                     nbl->cj[j].excl);
3807         }
3808     }
3809 }
3810
3811 /* Debug list print function */
3812 static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
3813 {
3814     int i,j4,j;
3815
3816     for(i=0; i<nbl->nsci; i++)
3817     {
3818         fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
3819                 nbl->sci[i].sci,nbl->sci[i].shift,
3820                 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
3821
3822         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
3823         {
3824             for(j=0; j<4; j++)
3825             {
3826                 fprintf(fp,"  sj %5d  imask %x\n",
3827                         nbl->cj4[j4].cj[j],
3828                         nbl->cj4[j4].imei[0].imask);
3829             }
3830         }
3831     }
3832 }
3833
3834 /* Combine pair lists *nbl generated on multiple threads nblc */
3835 static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
3836                             nbnxn_pairlist_t *nblc)
3837 {
3838     int nsci,ncj4,nexcl;
3839     int n,i;
3840
3841     if (nblc->bSimple)
3842     {
3843         gmx_incons("combine_nblists does not support simple lists");
3844     }
3845
3846     nsci  = nblc->nsci;
3847     ncj4  = nblc->ncj4;
3848     nexcl = nblc->nexcl;
3849     for(i=0; i<nnbl; i++)
3850     {
3851         nsci  += nbl[i]->nsci;
3852         ncj4  += nbl[i]->ncj4;
3853         nexcl += nbl[i]->nexcl;
3854     }
3855
3856     if (nsci > nblc->sci_nalloc)
3857     {
3858         nb_realloc_sci(nblc,nsci);
3859     }
3860     if (ncj4 > nblc->cj4_nalloc)
3861     {
3862         nblc->cj4_nalloc = over_alloc_small(ncj4);
3863         nbnxn_realloc_void((void **)&nblc->cj4,
3864                            nblc->ncj4*sizeof(*nblc->cj4),
3865                            nblc->cj4_nalloc*sizeof(*nblc->cj4),
3866                            nblc->alloc,nblc->free);
3867     }
3868     if (nexcl > nblc->excl_nalloc)
3869     {
3870         nblc->excl_nalloc = over_alloc_small(nexcl);
3871         nbnxn_realloc_void((void **)&nblc->excl,
3872                            nblc->nexcl*sizeof(*nblc->excl),
3873                            nblc->excl_nalloc*sizeof(*nblc->excl),
3874                            nblc->alloc,nblc->free);
3875     }
3876
3877     /* Each thread should copy its own data to the combined arrays,
3878      * as otherwise data will go back and forth between different caches.
3879      */
3880 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
3881     for(n=0; n<nnbl; n++)
3882     {
3883         int sci_offset;
3884         int cj4_offset;
3885         int ci_offset;
3886         int excl_offset;
3887         int i,j4;
3888         const nbnxn_pairlist_t *nbli;
3889
3890         /* Determine the offset in the combined data for our thread */
3891         sci_offset  = nblc->nsci;
3892         cj4_offset  = nblc->ncj4;
3893         ci_offset   = nblc->nci_tot;
3894         excl_offset = nblc->nexcl;
3895
3896         for(i=0; i<n; i++)
3897         {
3898             sci_offset  += nbl[i]->nsci;
3899             cj4_offset  += nbl[i]->ncj4;
3900             ci_offset   += nbl[i]->nci_tot;
3901             excl_offset += nbl[i]->nexcl;
3902         }
3903
3904         nbli = nbl[n];
3905
3906         for(i=0; i<nbli->nsci; i++)
3907         {
3908             nblc->sci[sci_offset+i]                = nbli->sci[i];
3909             nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
3910             nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
3911         }
3912
3913         for(j4=0; j4<nbli->ncj4; j4++)
3914         {
3915             nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
3916             nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
3917             nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
3918         }
3919
3920         for(j4=0; j4<nbli->nexcl; j4++)
3921         {
3922             nblc->excl[excl_offset+j4] = nbli->excl[j4];
3923         }
3924     }
3925
3926     for(n=0; n<nnbl; n++)
3927     {
3928         nblc->nsci    += nbl[n]->nsci;
3929         nblc->ncj4    += nbl[n]->ncj4;
3930         nblc->nci_tot += nbl[n]->nci_tot;
3931         nblc->nexcl   += nbl[n]->nexcl;
3932     }
3933 }
3934
3935 /* Returns the next ci to be processes by our thread */
3936 static gmx_bool next_ci(const nbnxn_grid_t *grid,
3937                         int conv,
3938                         int nth,int ci_block,
3939                         int *ci_x,int *ci_y,
3940                         int *ci_b,int *ci)
3941 {
3942     (*ci_b)++;
3943     (*ci)++;
3944
3945     if (*ci_b == ci_block)
3946     {
3947         /* Jump to the next block assigned to this task */
3948         *ci   += (nth - 1)*ci_block;
3949         *ci_b  = 0;
3950     }
3951
3952     if (*ci >= grid->nc*conv)
3953     {
3954         return FALSE;
3955     }
3956
3957     while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
3958     {
3959         *ci_y += 1;
3960         if (*ci_y == grid->ncy)
3961         {
3962             *ci_x += 1;
3963             *ci_y  = 0;
3964         }
3965     }
3966
3967     return TRUE;
3968 }
3969
3970 /* Returns the distance^2 for which we put cell pairs in the list
3971  * without checking atom pair distances. This is usually < rlist^2.
3972  */
3973 static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
3974                                         const nbnxn_grid_t *gridj,
3975                                         real rlist,
3976                                         gmx_bool simple)
3977 {
3978     /* If the distance between two sub-cell bounding boxes is less
3979      * than this distance, do not check the distance between
3980      * all particle pairs in the sub-cell, since then it is likely
3981      * that the box pair has atom pairs within the cut-off.
3982      * We use the nblist cut-off minus 0.5 times the average x/y diagonal
3983      * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
3984      * Using more than 0.5 gains at most 0.5%.
3985      * If forces are calculated more than twice, the performance gain
3986      * in the force calculation outweighs the cost of checking.
3987      * Note that with subcell lists, the atom-pair distance check
3988      * is only performed when only 1 out of 8 sub-cells in within range,
3989      * this is because the GPU is much faster than the cpu.
3990      */
3991     real bbx,bby;
3992     real rbb2;
3993
3994     bbx = 0.5*(gridi->sx + gridj->sx);
3995     bby = 0.5*(gridi->sy + gridj->sy);
3996     if (!simple)
3997     {
3998         bbx /= GPU_NSUBCELL_X;
3999         bby /= GPU_NSUBCELL_Y;
4000     }
4001
4002     rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
4003
4004 #ifndef GMX_DOUBLE
4005     return rbb2;
4006 #else
4007     return (float)((1+GMX_FLOAT_EPS)*rbb2);
4008 #endif
4009 }
4010
4011 static int get_ci_block_size(const nbnxn_grid_t *gridi,
4012                              gmx_bool bDomDec, int nth)
4013 {
4014     const int ci_block_enum = 5;
4015     const int ci_block_denom = 11;
4016     const int ci_block_min_atoms = 16;
4017     int ci_block;
4018
4019     /* Here we decide how to distribute the blocks over the threads.
4020      * We use prime numbers to try to avoid that the grid size becomes
4021      * a multiple of the number of threads, which would lead to some
4022      * threads getting "inner" pairs and others getting boundary pairs,
4023      * which in turns will lead to load imbalance between threads.
4024      * Set the block size as 5/11/ntask times the average number of cells
4025      * in a y,z slab. This should ensure a quite uniform distribution
4026      * of the grid parts of the different thread along all three grid
4027      * zone boundaries with 3D domain decomposition. At the same time
4028      * the blocks will not become too small.
4029      */
4030     ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
4031
4032     /* Ensure the blocks are not too small: avoids cache invalidation */
4033     if (ci_block*gridi->na_sc < ci_block_min_atoms)
4034     {
4035         ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
4036     }
4037     
4038     /* Without domain decomposition
4039      * or with less than 3 blocks per task, divide in nth blocks.
4040      */
4041     if (!bDomDec || ci_block*3*nth > gridi->nc)
4042     {
4043         ci_block = (gridi->nc + nth - 1)/nth;
4044     }
4045
4046     return ci_block;
4047 }
4048
4049 /* Generates the part of pair-list nbl assigned to our thread */
4050 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
4051                                      const nbnxn_grid_t *gridi,
4052                                      const nbnxn_grid_t *gridj,
4053                                      nbnxn_search_work_t *work,
4054                                      const nbnxn_atomdata_t *nbat,
4055                                      const t_blocka *excl,
4056                                      real rlist,
4057                                      int nb_kernel_type,
4058                                      int ci_block,
4059                                      gmx_bool bFBufferFlag,
4060                                      int nsubpair_max,
4061                                      gmx_bool progBal,
4062                                      int min_ci_balanced,
4063                                      int th,int nth,
4064                                      nbnxn_pairlist_t *nbl)
4065 {
4066     int  na_cj_2log;
4067     matrix box;
4068     real rl2;
4069     float rbb2;
4070     int  d;
4071     int  ci_b,ci,ci_x,ci_y,ci_xy,cj;
4072     ivec shp;
4073     int  tx,ty,tz;
4074     int  shift;
4075     gmx_bool bMakeList;
4076     real shx,shy,shz;
4077     int  conv_i,cell0_i;
4078     const float *bb_i,*bbcz_i,*bbcz_j;
4079     const int *flags_i;
4080     real bx0,bx1,by0,by1,bz0,bz1;
4081     real bz1_frac;
4082     real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
4083     int  cxf,cxl,cyf,cyf_x,cyl;
4084     int  cx,cy;
4085     int  c0,c1,cs,cf,cl;
4086     int  ndistc;
4087     int  ncpcheck;
4088     int  gridi_flag_shift=0,gridj_flag_shift=0;
4089     unsigned *gridj_flag=NULL;
4090     int  ncj_old_i,ncj_old_j;
4091
4092     nbs_cycle_start(&work->cc[enbsCCsearch]);
4093
4094     if (gridj->bSimple != nbl->bSimple)
4095     {
4096         gmx_incons("Grid incompatible with pair-list");
4097     }
4098
4099     sync_work(nbl);
4100     nbl->na_sc = gridj->na_sc;
4101     nbl->na_ci = gridj->na_c;
4102     nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4103     na_cj_2log = get_2log(nbl->na_cj);
4104
4105     nbl->rlist  = rlist;
4106
4107     if (bFBufferFlag)
4108     {
4109         /* Determine conversion of clusters to flag blocks */
4110         gridi_flag_shift = 0;
4111         while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4112         {
4113             gridi_flag_shift++;
4114         }
4115         gridj_flag_shift = 0;
4116         while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4117         {
4118             gridj_flag_shift++;
4119         }
4120
4121         gridj_flag = work->buffer_flags.flag;
4122     }
4123
4124     copy_mat(nbs->box,box);
4125
4126     rl2 = nbl->rlist*nbl->rlist;
4127
4128     rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
4129
4130     if (debug)
4131     {
4132         fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
4133     }
4134
4135     /* Set the shift range */
4136     for(d=0; d<DIM; d++)
4137     {
4138         /* Check if we need periodicity shifts.
4139          * Without PBC or with domain decomposition we don't need them.
4140          */
4141         if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
4142         {
4143             shp[d] = 0;
4144         }
4145         else
4146         {
4147             if (d == XX &&
4148                 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4149             {
4150                 shp[d] = 2;
4151             }
4152             else
4153             {
4154                 shp[d] = 1;
4155             }
4156         }
4157     }
4158
4159     if (nbl->bSimple && !gridi->bSimple)
4160     {
4161         conv_i  = gridi->na_sc/gridj->na_sc;
4162         bb_i    = gridi->bb_simple;
4163         bbcz_i  = gridi->bbcz_simple;
4164         flags_i = gridi->flags_simple;
4165     }
4166     else
4167     {
4168         conv_i  = 1;
4169         bb_i    = gridi->bb;
4170         bbcz_i  = gridi->bbcz;
4171         flags_i = gridi->flags;
4172     }
4173     cell0_i = gridi->cell0*conv_i;
4174
4175     bbcz_j = gridj->bbcz;
4176
4177     if (conv_i != 1)
4178     {
4179         /* Blocks of the conversion factor - 1 give a large repeat count
4180          * combined with a small block size. This should result in good
4181          * load balancing for both small and large domains.
4182          */
4183         ci_block = conv_i - 1;
4184     }
4185     if (debug)
4186     {
4187         fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4188                 gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
4189     }
4190
4191     ndistc = 0;
4192     ncpcheck = 0;
4193
4194     /* Initially ci_b and ci to 1 before where we want them to start,
4195      * as they will both be incremented in next_ci.
4196      */
4197     ci_b = -1;
4198     ci   = th*ci_block - 1;
4199     ci_x = 0;
4200     ci_y = 0;
4201     while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
4202     {
4203         if (nbl->bSimple && flags_i[ci] == 0)
4204         {
4205             continue;
4206         }
4207
4208         ncj_old_i = nbl->ncj;
4209
4210         d2cx = 0;
4211         if (gridj != gridi && shp[XX] == 0)
4212         {
4213             if (nbl->bSimple)
4214             {
4215                 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
4216             }
4217             else
4218             {
4219                 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4220             }
4221             if (bx1 < gridj->c0[XX])
4222             {
4223                 d2cx = sqr(gridj->c0[XX] - bx1);
4224
4225                 if (d2cx >= rl2)
4226                 {
4227                     continue;
4228                 }
4229             }
4230         }
4231
4232         ci_xy = ci_x*gridi->ncy + ci_y;
4233
4234         /* Loop over shift vectors in three dimensions */
4235         for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
4236         {
4237             shz = tz*box[ZZ][ZZ];
4238
4239             bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
4240             bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4241
4242             if (tz == 0)
4243             {
4244                 d2z = 0;
4245             }
4246             else if (tz < 0)
4247             {
4248                 d2z = sqr(bz1);
4249             }
4250             else
4251             {
4252                 d2z = sqr(bz0 - box[ZZ][ZZ]);
4253             }
4254
4255             d2z_cx = d2z + d2cx;
4256
4257             if (d2z_cx >= rl2)
4258             {
4259                 continue;
4260             }
4261
4262             bz1_frac =
4263                 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
4264             if (bz1_frac < 0)
4265             {
4266                 bz1_frac = 0;
4267             }
4268             /* The check with bz1_frac close to or larger than 1 comes later */
4269
4270             for (ty=-shp[YY]; ty<=shp[YY]; ty++)
4271             {
4272                 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
4273
4274                 if (nbl->bSimple)
4275                 {
4276                     by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
4277                     by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
4278                 }
4279                 else
4280                 {
4281                     by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
4282                     by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
4283                 }
4284
4285                 get_cell_range(by0,by1,
4286                                gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
4287                                d2z_cx,rl2,
4288                                &cyf,&cyl);
4289
4290                 if (cyf > cyl)
4291                 {
4292                     continue;
4293                 }
4294
4295                 d2z_cy = d2z;
4296                 if (by1 < gridj->c0[YY])
4297                 {
4298                     d2z_cy += sqr(gridj->c0[YY] - by1);
4299                 }
4300                 else if (by0 > gridj->c1[YY])
4301                 {
4302                     d2z_cy += sqr(by0 - gridj->c1[YY]);
4303                 }
4304
4305                 for (tx=-shp[XX]; tx<=shp[XX]; tx++)
4306                 {
4307                     shift = XYZ2IS(tx,ty,tz);
4308
4309 #ifdef NBNXN_SHIFT_BACKWARD
4310                     if (gridi == gridj && shift > CENTRAL)
4311                     {
4312                         continue;
4313                     }
4314 #endif
4315
4316                     shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
4317
4318                     if (nbl->bSimple)
4319                     {
4320                         bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
4321                         bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
4322                     }
4323                     else
4324                     {
4325                         bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
4326                         bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
4327                     }
4328
4329                     get_cell_range(bx0,bx1,
4330                                    gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
4331                                    d2z_cy,rl2,
4332                                    &cxf,&cxl);
4333
4334                     if (cxf > cxl)
4335                     {
4336                         continue;
4337                     }
4338
4339                     if (nbl->bSimple)
4340                     {
4341                         new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4342                                      nbl->work);
4343                     }
4344                     else
4345                     {
4346                         new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
4347                                       nbl->work);
4348                     }
4349
4350 #ifndef NBNXN_SHIFT_BACKWARD
4351                     if (cxf < ci_x)
4352 #else
4353                     if (shift == CENTRAL && gridi == gridj &&
4354                         cxf < ci_x)
4355 #endif
4356                     {
4357                         /* Leave the pairs with i > j.
4358                          * x is the major index, so skip half of it.
4359                          */
4360                         cxf = ci_x;
4361                     }
4362
4363                     if (nbl->bSimple)
4364                     {
4365                         set_icell_bb_simple(bb_i,ci,shx,shy,shz,
4366                                             nbl->work->bb_ci);
4367                     }
4368                     else
4369                     {
4370                         set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
4371                                               nbl->work->bb_ci);
4372                     }
4373
4374                     nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
4375                                      gridi->na_c,nbat->xstride,nbat->x,
4376                                      nbl->work);
4377
4378                     for(cx=cxf; cx<=cxl; cx++)
4379                     {
4380                         d2zx = d2z;
4381                         if (gridj->c0[XX] + cx*gridj->sx > bx1)
4382                         {
4383                             d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
4384                         }
4385                         else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
4386                         {
4387                             d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
4388                         }
4389
4390 #ifndef NBNXN_SHIFT_BACKWARD
4391                         if (gridi == gridj &&
4392                             cx == 0 && cyf < ci_y)
4393 #else
4394                         if (gridi == gridj &&
4395                             cx == 0 && shift == CENTRAL && cyf < ci_y)
4396 #endif
4397                         {
4398                             /* Leave the pairs with i > j.
4399                              * Skip half of y when i and j have the same x.
4400                              */
4401                             cyf_x = ci_y;
4402                         }
4403                         else
4404                         {
4405                             cyf_x = cyf;
4406                         }
4407
4408                         for(cy=cyf_x; cy<=cyl; cy++)
4409                         {
4410                             c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
4411                             c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
4412 #ifdef NBNXN_SHIFT_BACKWARD
4413                             if (gridi == gridj &&
4414                                 shift == CENTRAL && c0 < ci)
4415                             {
4416                                 c0 = ci;
4417                             }
4418 #endif
4419
4420                             d2zxy = d2zx;
4421                             if (gridj->c0[YY] + cy*gridj->sy > by1)
4422                             {
4423                                 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
4424                             }
4425                             else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
4426                             {
4427                                 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
4428                             }
4429                             if (c1 > c0 && d2zxy < rl2)
4430                             {
4431                                 cs = c0 + (int)(bz1_frac*(c1 - c0));
4432                                 if (cs >= c1)
4433                                 {
4434                                     cs = c1 - 1;
4435                                 }
4436
4437                                 d2xy = d2zxy - d2z;
4438
4439                                 /* Find the lowest cell that can possibly
4440                                  * be within range.
4441                                  */
4442                                 cf = cs;
4443                                 while(cf > c0 &&
4444                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
4445                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
4446                                 {
4447                                     cf--;
4448                                 }
4449
4450                                 /* Find the highest cell that can possibly
4451                                  * be within range.
4452                                  */
4453                                 cl = cs;
4454                                 while(cl < c1-1 &&
4455                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
4456                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
4457                                 {
4458                                     cl++;
4459                                 }
4460
4461 #ifdef NBNXN_REFCODE
4462                                 {
4463                                     /* Simple reference code, for debugging,
4464                                      * overrides the more complex code above.
4465                                      */
4466                                     int k;
4467                                     cf = c1;
4468                                     cl = -1;
4469                                     for(k=c0; k<c1; k++)
4470                                     {
4471                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4472                                                       bb+k*NNBSBB_B) < rl2 &&
4473                                             k < cf)
4474                                         {
4475                                             cf = k;
4476                                         }
4477                                         if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
4478                                                       bb+k*NNBSBB_B) < rl2 &&
4479                                             k > cl)
4480                                         {
4481                                             cl = k;
4482                                         }
4483                                     }
4484                                 }
4485 #endif
4486
4487                                 if (gridi == gridj)
4488                                 {
4489                                     /* We want each atom/cell pair only once,
4490                                      * only use cj >= ci.
4491                                      */
4492 #ifndef NBNXN_SHIFT_BACKWARD
4493                                     cf = max(cf,ci);
4494 #else
4495                                     if (shift == CENTRAL)
4496                                     {
4497                                         cf = max(cf,ci);
4498                                     }
4499 #endif
4500                                 }
4501
4502                                 if (cf <= cl)
4503                                 {
4504                                     /* For f buffer flags with simple lists */
4505                                     ncj_old_j = nbl->ncj;
4506
4507                                     switch (nb_kernel_type)
4508                                     {
4509                                     case nbnxnk4x4_PlainC:
4510                                         check_subcell_list_space_simple(nbl,cl-cf+1);
4511
4512                                         make_cluster_list_simple(gridj,
4513                                                                  nbl,ci,cf,cl,
4514                                                                  (gridi == gridj && shift == CENTRAL),
4515                                                                  nbat->x,
4516                                                                  rl2,rbb2,
4517                                                                  &ndistc);
4518                                         break;
4519 #ifdef GMX_NBNXN_SIMD_4XN
4520                                     case nbnxnk4xN_SIMD_4xN:
4521                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4522                                         make_cluster_list_simd_4xn(gridj,
4523                                                                    nbl,ci,cf,cl,
4524                                                                    (gridi == gridj && shift == CENTRAL),
4525                                                                    nbat->x,
4526                                                                    rl2,rbb2,
4527                                                                    &ndistc);
4528                                         break;
4529 #endif
4530 #ifdef GMX_NBNXN_SIMD_2XNN
4531                                     case nbnxnk4xN_SIMD_2xNN:
4532                                         check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
4533                                         make_cluster_list_simd_2xnn(gridj,
4534                                                                    nbl,ci,cf,cl,
4535                                                                    (gridi == gridj && shift == CENTRAL),
4536                                                                    nbat->x,
4537                                                                    rl2,rbb2,
4538                                                                    &ndistc);
4539                                         break;
4540 #endif
4541                                     case nbnxnk8x8x8_PlainC:
4542                                     case nbnxnk8x8x8_CUDA:
4543                                         check_subcell_list_space_supersub(nbl,cl-cf+1);
4544                                         for(cj=cf; cj<=cl; cj++)
4545                                         {
4546                                             make_cluster_list_supersub(nbs,gridi,gridj,
4547                                                                        nbl,ci,cj,
4548                                                                        (gridi == gridj && shift == CENTRAL && ci == cj),
4549                                                                        nbat->xstride,nbat->x,
4550                                                                        rl2,rbb2,
4551                                                                        &ndistc);
4552                                         }
4553                                         break;
4554                                     }
4555                                     ncpcheck += cl - cf + 1;
4556
4557                                     if (bFBufferFlag && nbl->ncj > ncj_old_j)
4558                                     {
4559                                         int cbf,cbl,cb;
4560
4561                                         cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
4562                                         cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
4563                                         for(cb=cbf; cb<=cbl; cb++)
4564                                         {
4565                                             gridj_flag[cb] = 1U<<th;
4566                                         }
4567                                     }
4568                                 }
4569                             }
4570                         }
4571                     }
4572
4573                     /* Set the exclusions for this ci list */
4574                     if (nbl->bSimple)
4575                     {
4576                         set_ci_top_excls(nbs,
4577                                          nbl,
4578                                          shift == CENTRAL && gridi == gridj,
4579                                          gridj->na_c_2log,
4580                                          na_cj_2log,
4581                                          &(nbl->ci[nbl->nci]),
4582                                          excl);
4583                     }
4584                     else
4585                     {
4586                         set_sci_top_excls(nbs,
4587                                           nbl,
4588                                           shift == CENTRAL && gridi == gridj,
4589                                           gridj->na_c_2log,
4590                                           &(nbl->sci[nbl->nsci]),
4591                                           excl);
4592                     }
4593
4594                     /* Close this ci list */
4595                     if (nbl->bSimple)
4596                     {
4597                         close_ci_entry_simple(nbl);
4598                     }
4599                     else
4600                     {
4601                         close_ci_entry_supersub(nbl,
4602                                                 nsubpair_max,
4603                                                 progBal,min_ci_balanced,
4604                                                 th,nth);
4605                     }
4606                 }
4607             }
4608         }
4609
4610         if (bFBufferFlag && nbl->ncj > ncj_old_i)
4611         {
4612             work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
4613         }
4614     }
4615
4616     work->ndistc = ndistc;
4617
4618     nbs_cycle_stop(&work->cc[enbsCCsearch]);
4619
4620     if (debug)
4621     {
4622         fprintf(debug,"number of distance checks %d\n",ndistc);
4623         fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
4624                 ncpcheck);
4625
4626         if (nbl->bSimple)
4627         {
4628             print_nblist_statistics_simple(debug,nbl,nbs,rlist);
4629         }
4630         else
4631         {
4632             print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
4633         }
4634
4635     }
4636 }
4637
4638 static void reduce_buffer_flags(const nbnxn_search_t nbs,
4639                                 int nsrc,
4640                                 const nbnxn_buffer_flags_t *dest)
4641 {
4642     int s,b;
4643     const unsigned *flag;
4644
4645     for(s=0; s<nsrc; s++)
4646     {
4647         flag = nbs->work[s].buffer_flags.flag;
4648
4649         for(b=0; b<dest->nflag; b++)
4650         {
4651             dest->flag[b] |= flag[b];
4652         }
4653     }
4654 }
4655
4656 static void print_reduction_cost(const nbnxn_buffer_flags_t *flags,int nout)
4657 {
4658     int nelem,nkeep,ncopy,nred,b,c,out;
4659
4660     nelem = 0;
4661     nkeep = 0;
4662     ncopy = 0;
4663     nred  = 0;
4664     for(b=0; b<flags->nflag; b++)
4665     {
4666         if (flags->flag[b] == 1)
4667         {
4668             /* Only flag 0 is set, no copy of reduction required */
4669             nelem++;
4670             nkeep++;
4671         }
4672         else if (flags->flag[b] > 0)
4673         {
4674             c = 0;
4675             for(out=0; out<nout; out++)
4676             {
4677                 if (flags->flag[b] & (1U<<out))
4678                 {
4679                     c++;
4680                 }
4681             }
4682             nelem += c;
4683             if (c == 1)
4684             {
4685                 ncopy++;
4686             }
4687             else
4688             {
4689                 nred += c;
4690             }
4691         }
4692     }
4693
4694     fprintf(debug,"nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
4695             flags->nflag,nout,
4696             nelem/(double)(flags->nflag),
4697             nkeep/(double)(flags->nflag),
4698             ncopy/(double)(flags->nflag),
4699             nred/(double)(flags->nflag));
4700 }
4701
4702 /* Make a local or non-local pair-list, depending on iloc */
4703 void nbnxn_make_pairlist(const nbnxn_search_t nbs,
4704                          nbnxn_atomdata_t *nbat,
4705                          const t_blocka *excl,
4706                          real rlist,
4707                          int min_ci_balanced,
4708                          nbnxn_pairlist_set_t *nbl_list,
4709                          int iloc,
4710                          int nb_kernel_type,
4711                          t_nrnb *nrnb)
4712 {
4713     nbnxn_grid_t *gridi,*gridj;
4714     int nzi,zi,zj0,zj1,zj;
4715     int nsubpair_max;
4716     int th;
4717     int nnbl;
4718     nbnxn_pairlist_t **nbl;
4719     int ci_block;
4720     gmx_bool CombineNBLists;
4721     int np_tot,np_noq,np_hlj,nap;
4722
4723     nnbl            = nbl_list->nnbl;
4724     nbl             = nbl_list->nbl;
4725     CombineNBLists  = nbl_list->bCombined;
4726
4727     if (debug)
4728     {
4729         fprintf(debug,"ns making %d nblists\n", nnbl);
4730     }
4731
4732     nbat->bUseBufferFlags = (nbat->nout > 1);
4733     if (nbat->bUseBufferFlags && LOCAL_I(iloc))
4734     {
4735         init_buffer_flags(&nbat->buffer_flags,nbat->natoms);
4736     }
4737
4738     if (nbl_list->bSimple)
4739     {
4740         switch (nb_kernel_type)
4741         {
4742 #ifdef GMX_NBNXN_SIMD_4XN
4743         case nbnxnk4xN_SIMD_4xN:
4744             nbs->icell_set_x = icell_set_x_simd_4xn;
4745             break;
4746 #endif
4747 #ifdef GMX_NBNXN_SIMD_2XNN
4748         case nbnxnk4xN_SIMD_2xNN:
4749             nbs->icell_set_x = icell_set_x_simd_2xnn;
4750             break;
4751 #endif
4752         default:
4753             nbs->icell_set_x = icell_set_x_simple;
4754             break;
4755         }
4756     }
4757     else
4758     {
4759 #ifdef NBNXN_SEARCH_SSE
4760         nbs->icell_set_x = icell_set_x_supersub_sse8;
4761 #else
4762         nbs->icell_set_x = icell_set_x_supersub;
4763 #endif
4764     }
4765
4766     if (LOCAL_I(iloc))
4767     {
4768         /* Only zone (grid) 0 vs 0 */
4769         nzi = 1;
4770         zj0 = 0;
4771         zj1 = 1;
4772     }
4773     else
4774     {
4775         nzi = nbs->zones->nizone;
4776     }
4777
4778     if (!nbl_list->bSimple && min_ci_balanced > 0)
4779     {
4780         nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
4781     }
4782     else
4783     {
4784         nsubpair_max = 0;
4785     }
4786
4787     /* Clear all pair-lists */
4788     for(th=0; th<nnbl; th++)
4789     {
4790         clear_pairlist(nbl[th]);
4791     }
4792
4793     for(zi=0; zi<nzi; zi++)
4794     {
4795         gridi = &nbs->grid[zi];
4796
4797         if (NONLOCAL_I(iloc))
4798         {
4799             zj0 = nbs->zones->izone[zi].j0;
4800             zj1 = nbs->zones->izone[zi].j1;
4801             if (zi == 0)
4802             {
4803                 zj0++;
4804             }
4805         }
4806         for(zj=zj0; zj<zj1; zj++)
4807         {
4808             gridj = &nbs->grid[zj];
4809
4810             if (debug)
4811             {
4812                 fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
4813             }
4814
4815             nbs_cycle_start(&nbs->cc[enbsCCsearch]);
4816
4817             if (nbl[0]->bSimple && !gridi->bSimple)
4818             {
4819                 /* Hybrid list, determine blocking later */
4820                 ci_block = 0;
4821             }
4822             else
4823             {
4824                 ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl);
4825             }
4826
4827 #pragma omp parallel for num_threads(nnbl) schedule(static)
4828             for(th=0; th<nnbl; th++)
4829             {
4830                 if (nbat->bUseBufferFlags && zi == 0 && zj == 0)
4831                 {
4832                     init_buffer_flags(&nbs->work[th].buffer_flags,nbat->natoms);
4833                 }
4834
4835                 if (CombineNBLists && th > 0)
4836                 {
4837                     clear_pairlist(nbl[th]);
4838                 }
4839
4840                 /* Divide the i super cell equally over the nblists */
4841                 nbnxn_make_pairlist_part(nbs,gridi,gridj,
4842                                          &nbs->work[th],nbat,excl,
4843                                          rlist,
4844                                          nb_kernel_type,
4845                                          ci_block,
4846                                          nbat->bUseBufferFlags,
4847                                          nsubpair_max,
4848                                          (LOCAL_I(iloc) || nbs->zones->n <= 2),
4849                                          min_ci_balanced,
4850                                          th,nnbl,
4851                                          nbl[th]);
4852             }
4853             nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
4854
4855             np_tot = 0;
4856             np_noq = 0;
4857             np_hlj = 0;
4858             for(th=0; th<nnbl; th++)
4859             {
4860                 inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
4861
4862                 if (nbl_list->bSimple)
4863                 {
4864                     np_tot += nbl[th]->ncj;
4865                     np_noq += nbl[th]->work->ncj_noq;
4866                     np_hlj += nbl[th]->work->ncj_hlj;
4867                 }
4868                 else
4869                 {
4870                     /* This count ignores potential subsequent pair pruning */
4871                     np_tot += nbl[th]->nci_tot;
4872                 }
4873             }
4874             nap = nbl[0]->na_ci*nbl[0]->na_cj;
4875             nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
4876             nbl_list->natpair_lj  = np_noq*nap;
4877             nbl_list->natpair_q   = np_hlj*nap/2;
4878
4879             if (CombineNBLists && nnbl > 1)
4880             {
4881                 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
4882
4883                 combine_nblists(nnbl-1,nbl+1,nbl[0]);
4884
4885                 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
4886             }
4887         }
4888     }
4889
4890     if (nbat->bUseBufferFlags)
4891     {
4892         reduce_buffer_flags(nbs,nnbl,&nbat->buffer_flags);
4893     }
4894
4895     /*
4896     print_supersub_nsp("nsubpair",nbl[0],iloc);
4897     */
4898
4899     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4900     if (LOCAL_I(iloc))
4901     {
4902         nbs->search_count++;
4903     }
4904     if (nbs->print_cycles &&
4905         (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
4906         nbs->search_count % 100 == 0)
4907     {
4908         nbs_cycle_print(stderr,nbs);
4909     }
4910
4911     if (debug && (CombineNBLists && nnbl > 1))
4912     {
4913         if (nbl[0]->bSimple)
4914         {
4915             print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
4916         }
4917         else
4918         {
4919             print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
4920         }
4921     }
4922
4923     if (debug)
4924     {
4925         if (gmx_debug_at)
4926         {
4927             if (nbl[0]->bSimple)
4928             {
4929                 print_nblist_ci_cj(debug,nbl[0]);
4930             }
4931             else
4932             {
4933                 print_nblist_sci_cj(debug,nbl[0]);
4934             }
4935         }
4936
4937         if (nbat->bUseBufferFlags)
4938         {
4939             print_reduction_cost(&nbat->buffer_flags,nnbl);
4940         }
4941     }
4942 }