src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdlib.h>
  40 #include <assert.h>
  41
  42 #if defined(_MSVC)
  43 #include <limits>
  44 #endif
  45
  46 #include <cuda.h>
  47
  48 #include "types/simple.h"
  49 #include "types/nbnxn_pairlist.h"
  50 #include "types/nb_verlet.h"
  51 #include "types/ishift.h"
  52 #include "types/force_flags.h"
  53 #include "../nbnxn_consts.h"
  54
  55 #ifdef TMPI_ATOMICS
  56 #include "thread_mpi/atomic.h"
  57 #endif
  58
  59 #include "nbnxn_cuda_types.h"
  60 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  61 #include "nbnxn_cuda.h"
  62 #include "nbnxn_cuda_data_mgmt.h"
  63
  64
  65 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  66 texture<float, 1, cudaReadModeElementType> tex_nbfp;
  67
  68 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  69 texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
  70
  71 /* Convenience defines */
  72 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  73 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  74
  75 /***** The kernels come here *****/
  76 #include "nbnxn_cuda_kernel_utils.cuh"
  77
  78 /* Generate all combinations of kernels through multiple inclusion:
  79    F, F + E, F + prune, F + E + prune. */
  80 /** Force only **/
  81 #include "nbnxn_cuda_kernels.cuh"
  82 /** Force & energy **/
  83 #define CALC_ENERGIES
  84 #include "nbnxn_cuda_kernels.cuh"
  85 #undef CALC_ENERGIES
  86
  87 /*** Pair-list pruning kernels ***/
  88 /** Force only **/
  89 #define PRUNE_NBL
  90 #include "nbnxn_cuda_kernels.cuh"
  91 /** Force & energy **/
  92 #define CALC_ENERGIES
  93 #include "nbnxn_cuda_kernels.cuh"
  94 #undef CALC_ENERGIES
  95 #undef PRUNE_NBL
  96
  97 /*! Nonbonded kernel function pointer type */
  98 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
  99                                      const cu_nbparam_t,
 100                                      const cu_plist_t,
 101                                      bool);
 102
 103 /*********************************/
 104
 105 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 106 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 107 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 108 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 109
 110
 111 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 112  * and corresponds to having the exponent set to the maximum (127 -- single
 113  * precision) and the mantissa to 0.
 114  */
 115 static unsigned int poll_wait_pattern = (0x7FU << 23);
 116
 117 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 118 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 119 {
 120     int max_grid_x_size;
 121
 122     assert(dinfo);
 123
 124     max_grid_x_size = dinfo->prop.maxGridSize[0];
 125
 126     /* do we exceed the grid x dimension limit? */
 127     if (nwork_units > max_grid_x_size)
 128     {
 129         gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
 130                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 131                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 132     }
 133
 134     return nwork_units;
 135 }
 136
 137
 138 /* Constant arrays listing all kernel function pointers and enabling selection
 139    of a kernel in an elegant manner. */
 140
 141 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 142 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 143
 144 /* Default kernels */
 145 static const nbnxn_cu_kfunc_ptr_t
 146 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 147 {
 148     { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
 149       { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
 150     { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
 151       { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
 152     { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
 153       { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
 154     { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
 155       { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
 156 };
 157
 158 /* Legacy kernels */
 159 static const nbnxn_cu_kfunc_ptr_t
 160 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 161 {
 162     { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
 163       { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
 164     { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
 165       { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
 166     { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
 167       { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
 168     { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
 169       { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
 170 };
 171
 172 /*! Return a pointer to the kernel version to be executed at the current step. */
 173 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 174                                                        bool bDoEne, bool bDoPrune)
 175 {
 176     assert(kver < eNbnxnCuKNR);
 177     assert(eeltype < eelCuNR);
 178
 179     if (NBNXN_KVER_LEGACY(kver))
 180     {
 181         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 182     }
 183     else
 184     {
 185         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 186     }
 187 }
 188
 189 /*! Calculates the amount of shared memory required for kernel version in use. */
 190 static inline int calc_shmem_required(int kver)
 191 {
 192     int shmem;
 193
 194     /* size of shmem (force-buffers/xq/atom type preloading) */
 195     if (NBNXN_KVER_LEGACY(kver))
 196     {
 197         /* i-atom x+q in shared memory */
 198         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 199         /* force reduction buffers in shared memory */
 200         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 201     }
 202     else
 203     {
 204         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 205         /* i-atom x+q in shared memory */
 206         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 207         /* cj in shared memory, for both warps separately */
 208         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 209 #ifdef IATYPE_SHMEM
 210         /* i-atom types in shared memory */
 211         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 212 #endif
 213 #if __CUDA_ARCH__ < 300
 214         /* force reduction buffers in shared memory */
 215         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 216 #endif
 217     }
 218
 219     return shmem;
 220 }
 221
 222 /*! As we execute nonbonded workload in separate streams, before launching
 223    the kernel we need to make sure that he following operations have completed:
 224    - atomdata allocation and related H2D transfers (every nstlist step);
 225    - pair list H2D transfer (every nstlist step);
 226    - shift vector H2D transfer (every nstlist step);
 227    - force (+shift force and energy) output clearing (every step).
 228
 229    These operations are issued in the local stream at the beginning of the step
 230    and therefore always complete before the local kernel launch. The non-local
 231    kernel is launched after the local on the same device/context, so this is
 232    inherently scheduled after the operations in the local stream (including the
 233    above "misc_ops").
 234    However, for the sake of having a future-proof implementation, we use the
 235    misc_ops_done event to record the point in time when the above  operations
 236    are finished and synchronize with this event in the non-local stream.
 237 */
 238 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 239                               const nbnxn_atomdata_t *nbatom,
 240                               int flags,
 241                               int iloc)
 242 {
 243     cudaError_t stat;
 244     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 245     /* CUDA kernel launch-related stuff */
 246     int  shmem, nblock;
 247     dim3 dim_block, dim_grid;
 248     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 249
 250     cu_atomdata_t   *adat   = cu_nb->atdat;
 251     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 252     cu_plist_t      *plist  = cu_nb->plist[iloc];
 253     cu_timers_t     *t      = cu_nb->timers;
 254     cudaStream_t    stream  = cu_nb->stream[iloc];
 255
 256     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 257     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 258     bool bDoTime     = cu_nb->bDoTime;
 259
 260     /* turn energy calculation always on/off (for debugging/testing only) */
 261     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 262
 263     /* don't launch the kernel if there is no work to do */
 264     if (plist->nsci == 0)
 265     {
 266         return;
 267     }
 268
 269     /* calculate the atom data index range based on locality */
 270     if (LOCAL_I(iloc))
 271     {
 272         adat_begin  = 0;
 273         adat_len    = adat->natoms_local;
 274     }
 275     else
 276     {
 277         adat_begin  = adat->natoms_local;
 278         adat_len    = adat->natoms - adat->natoms_local;
 279     }
 280
 281     /* When we get here all misc operations issues in the local stream are done,
 282        so we record that in the local stream and wait for it in the nonlocal one. */
 283     if (cu_nb->bUseTwoStreams)
 284     {
 285         if (iloc == eintLocal)
 286         {
 287             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 288             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 289         }
 290         else
 291         {
 292             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 293             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 294         }
 295     }
 296
 297     /* beginning of timed HtoD section */
 298     if (bDoTime)
 299     {
 300         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 301         CU_RET_ERR(stat, "cudaEventRecord failed");
 302     }
 303
 304     /* HtoD x, q */
 305     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 306                       adat_len * sizeof(*adat->xq), stream);
 307
 308     if (bDoTime)
 309     {
 310         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 311         CU_RET_ERR(stat, "cudaEventRecord failed");
 312     }
 313
 314     /* beginning of timed nonbonded calculation section */
 315     if (bDoTime)
 316     {
 317         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 318         CU_RET_ERR(stat, "cudaEventRecord failed");
 319     }
 320
 321     /* get the pointer to the kernel flavor we need to use */
 322     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 323                                     plist->bDoPrune || always_prune);
 324
 325     /* kernel launch config */
 326     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 327     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 328     dim_grid  = dim3(nblock, 1, 1);
 329     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 330
 331     if (debug)
 332     {
 333         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 334                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 335                 dim_block.x, dim_block.y, dim_block.z,
 336                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 337                 NCL_PER_SUPERCL, plist->na_c);
 338     }
 339
 340     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 341     CU_LAUNCH_ERR("k_calc_nb");
 342
 343     if (bDoTime)
 344     {
 345         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 346         CU_RET_ERR(stat, "cudaEventRecord failed");
 347     }
 348 }
 349
 350 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 351                                const nbnxn_atomdata_t *nbatom,
 352                                int flags,
 353                                int aloc)
 354 {
 355     cudaError_t stat;
 356     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 357     int iloc = -1;
 358
 359     /* determine interaction locality from atom locality */
 360     if (LOCAL_A(aloc))
 361     {
 362         iloc = eintLocal;
 363     }
 364     else if (NONLOCAL_A(aloc))
 365     {
 366         iloc = eintNonlocal;
 367     }
 368     else
 369     {
 370         char stmp[STRLEN];
 371         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 372                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 373         gmx_incons(stmp);
 374     }
 375
 376     cu_atomdata_t   *adat   = cu_nb->atdat;
 377     cu_timers_t     *t      = cu_nb->timers;
 378     bool            bDoTime = cu_nb->bDoTime;
 379     cudaStream_t    stream  = cu_nb->stream[iloc];
 380
 381     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 382     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 383
 384     /* don't launch copy-back if there was no work to do */
 385     if (cu_nb->plist[iloc]->nsci == 0)
 386     {
 387         return;
 388     }
 389
 390     /* calculate the atom data index range based on locality */
 391     if (LOCAL_A(aloc))
 392     {
 393         adat_begin  = 0;
 394         adat_len    = adat->natoms_local;
 395         adat_end    = cu_nb->atdat->natoms_local;
 396     }
 397     else
 398     {
 399         adat_begin  = adat->natoms_local;
 400         adat_len    = adat->natoms - adat->natoms_local;
 401         adat_end    = cu_nb->atdat->natoms;
 402     }
 403
 404     /* beginning of timed D2H section */
 405     if (bDoTime)
 406     {
 407         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 408         CU_RET_ERR(stat, "cudaEventRecord failed");
 409     }
 410
 411     if (!cu_nb->bUseStreamSync)
 412     {
 413         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 414            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 415         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 416         {
 417 #ifdef NAN
 418             nbatom->out[0].f[i] = NAN;
 419 #else
 420 #  ifdef _MSVC
 421             if (numeric_limits<float>::has_quiet_NaN)
 422             {
 423                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 424             }
 425             else
 426 #  endif
 427             {
 428                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 429             }
 430 #endif
 431         }
 432
 433         /* Set the last four bytes of the force array to a bit pattern
 434            which can't be the result of the force calculation:
 435            max exponent (127) and zero mantissa. */
 436         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 437     }
 438
 439     /* With DD the local D2H transfer can only start after the non-local
 440        has been launched. */
 441     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 442     {
 443         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 444         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 445     }
 446
 447     /* DtoH f */
 448     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 449                       (adat_len)*sizeof(*adat->f), stream);
 450
 451     /* After the non-local D2H is launched the nonlocal_done event can be
 452        recorded which signals that the local D2H can proceed. This event is not
 453        placed after the non-local kernel because we first need the non-local
 454        data back first. */
 455     if (iloc == eintNonlocal)
 456     {
 457         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 458         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 459     }
 460
 461     /* only transfer energies in the local stream */
 462     if (LOCAL_I(iloc))
 463     {
 464         /* DtoH fshift */
 465         if (bCalcFshift)
 466         {
 467             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 468                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 469         }
 470
 471         /* DtoH energies */
 472         if (bCalcEner)
 473         {
 474             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 475                               sizeof(*cu_nb->nbst.e_lj), stream);
 476             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 477                               sizeof(*cu_nb->nbst.e_el), stream);
 478         }
 479     }
 480
 481     if (bDoTime)
 482     {
 483         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 484         CU_RET_ERR(stat, "cudaEventRecord failed");
 485     }
 486 }
 487
 488 /* Atomic compare-exchange operation on unsigned values. It is used in
 489  * polling wait for the GPU.
 490  */
 491 static inline bool atomic_cas(volatile unsigned int *ptr,
 492                               unsigned int oldval,
 493                               unsigned int newval)
 494 {
 495     assert(ptr);
 496
 497 #ifdef TMPI_ATOMICS
 498     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 499 #else
 500     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 501     return true;
 502 #endif
 503 }
 504
 505 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 506                          const nbnxn_atomdata_t *nbatom,
 507                          int flags, int aloc,
 508                          real *e_lj, real *e_el, rvec *fshift)
 509 {
 510     /* NOTE:  only implemented for single-precision at this time */
 511     cudaError_t stat;
 512     int i, adat_end, iloc = -1;
 513     volatile unsigned int *poll_word;
 514
 515     /* determine interaction locality from atom locality */
 516     if (LOCAL_A(aloc))
 517     {
 518         iloc = eintLocal;
 519     }
 520     else if (NONLOCAL_A(aloc))
 521     {
 522         iloc = eintNonlocal;
 523     }
 524     else
 525     {
 526         char stmp[STRLEN];
 527         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 528                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 529         gmx_incons(stmp);
 530     }
 531
 532     cu_plist_t      *plist   = cu_nb->plist[iloc];
 533     cu_timers_t     *timers  = cu_nb->timers;
 534     wallclock_gpu_t *timings = cu_nb->timings;
 535     nb_staging      nbst     = cu_nb->nbst;
 536
 537     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 538     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 539
 540     /* turn energy calculation always on/off (for debugging/testing only) */
 541     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 542
 543     /* don't launch wait/update timers & counters if there was no work to do
 544
 545        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 546        counters could end up being inconsistent due to not being incremented
 547        on some of the nodes! */
 548     if (cu_nb->plist[iloc]->nsci == 0)
 549     {
 550         return;
 551     }
 552
 553     /* calculate the atom data index range based on locality */
 554     if (LOCAL_A(aloc))
 555     {
 556         adat_end = cu_nb->atdat->natoms_local;
 557     }
 558     else
 559     {
 560         adat_end = cu_nb->atdat->natoms;
 561     }
 562
 563     if (cu_nb->bUseStreamSync)
 564     {
 565         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 566         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 567     }
 568     else
 569     {
 570         /* Busy-wait until we get the signal pattern set in last byte
 571          * of the l/nl float vector. This pattern corresponds to a floating
 572          * point number which can't be the result of the force calculation
 573          * (maximum, 127 exponent and 0 mantissa).
 574          * The polling uses atomic compare-exchange.
 575          */
 576         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 577         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 578     }
 579
 580     /* timing data accumulation */
 581     if (cu_nb->bDoTime)
 582     {
 583         /* only increase counter once (at local F wait) */
 584         if (LOCAL_I(iloc))
 585         {
 586             timings->nb_c++;
 587             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 588         }
 589
 590         /* kernel timings */
 591         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 592             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 593
 594         /* X/q H2D and F D2H timings */
 595         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 596                                                  timers->stop_nb_h2d[iloc]);
 597         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 598                                                  timers->stop_nb_d2h[iloc]);
 599
 600         /* only count atdat and pair-list H2D at pair-search step */
 601         if (plist->bDoPrune)
 602         {
 603             /* atdat transfer timing (add only once, at local F wait) */
 604             if (LOCAL_A(aloc))
 605             {
 606                 timings->pl_h2d_c++;
 607                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 608                                                          timers->stop_atdat);
 609             }
 610
 611             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 612                                                      timers->stop_pl_h2d[iloc]);
 613         }
 614     }
 615
 616     /* add up energies and shift forces (only once at local F wait) */
 617     if (LOCAL_I(iloc))
 618     {
 619         if (bCalcEner)
 620         {
 621             *e_lj += *nbst.e_lj;
 622             *e_el += *nbst.e_el;
 623         }
 624
 625         if (bCalcFshift)
 626         {
 627             for (i = 0; i < SHIFTS; i++)
 628             {
 629                 fshift[i][0] += nbst.fshift[i].x;
 630                 fshift[i][1] += nbst.fshift[i].y;
 631                 fshift[i][2] += nbst.fshift[i].z;
 632             }
 633         }
 634     }
 635
 636     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 637     plist->bDoPrune = false;
 638 }
 639
 640 /*! Return the reference to the nbfp texture. */
 641 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 642 {
 643     return tex_nbfp;
 644 }
 645
 646 /*! Return the reference to the coulomb_tab. */
 647 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 648 {
 649     return tex_coulomb_tab;
 650 }
 651
 652 /*! Set up the cache configuration for the non-bonded kernels,
 653  */
 654 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 655 {
 656     cudaError_t stat;
 657
 658     for (int i = 0; i < eelCuNR; i++)
 659         for (int j = 0; j < nEnergyKernelTypes; j++)
 660             for (int k = 0; k < nPruneKernelTypes; k++)
 661             {
 662                 /* Legacy kernel 16/48 kB Shared/L1 */
 663                 stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 664                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 665
 666                 if (devinfo->prop.major >= 3)
 667                 {
 668                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 669                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 670                 }
 671                 else
 672                 {
 673                     /* On Fermi prefer L1 gives 2% higher performance */
 674                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 675                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 676                 }
 677                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 678             }
 679 }