src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdlib.h>
  40 #include <assert.h>
  41
  42 #if defined(_MSVC)
  43 #include <limits>
  44 #endif
  45
  46 #include <cuda.h>
  47
  48 #include "types/simple.h"
  49 #include "types/nbnxn_pairlist.h"
  50 #include "types/nb_verlet.h"
  51 #include "types/ishift.h"
  52 #include "types/force_flags.h"
  53 #include "../nbnxn_consts.h"
  54
  55 #ifdef TMPI_ATOMICS
  56 #include "thread_mpi/atomic.h"
  57 #endif
  58
  59 #include "nbnxn_cuda_types.h"
  60 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  61 #include "nbnxn_cuda.h"
  62 #include "nbnxn_cuda_data_mgmt.h"
  63
  64 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  65 #define USE_TEXOBJ
  66 #endif
  67
  68 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  69 texture<float, 1, cudaReadModeElementType> nbfp_texref;
  70
  71 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  72 texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  73
  74 /* Convenience defines */
  75 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  76 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  77
  78 /***** The kernels come here *****/
  79 #include "nbnxn_cuda_kernel_utils.cuh"
  80
  81 /* Top-level kernel generation: will generate through multiple inclusion the
  82  * following flavors for all kernels:
  83  * - force-only output;
  84  * - force and energy output;
  85  * - force-only with pair list pruning;
  86  * - force and energy output with pair list pruning.
  87  */
  88 /** Force only **/
  89 #include "nbnxn_cuda_kernels.cuh"
  90 /** Force & energy **/
  91 #define CALC_ENERGIES
  92 #include "nbnxn_cuda_kernels.cuh"
  93 #undef CALC_ENERGIES
  94
  95 /*** Pair-list pruning kernels ***/
  96 /** Force only **/
  97 #define PRUNE_NBL
  98 #include "nbnxn_cuda_kernels.cuh"
  99 /** Force & energy **/
 100 #define CALC_ENERGIES
 101 #include "nbnxn_cuda_kernels.cuh"
 102 #undef CALC_ENERGIES
 103 #undef PRUNE_NBL
 104
 105 /*! Nonbonded kernel function pointer type */
 106 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 107                                      const cu_nbparam_t,
 108                                      const cu_plist_t,
 109                                      bool);
 110
 111 /*********************************/
 112
 113 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 114 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 115 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 116 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 117
 118
 119 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 120  * and corresponds to having the exponent set to the maximum (127 -- single
 121  * precision) and the mantissa to 0.
 122  */
 123 static unsigned int poll_wait_pattern = (0x7FU << 23);
 124
 125 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 126 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 127 {
 128     int max_grid_x_size;
 129
 130     assert(dinfo);
 131
 132     max_grid_x_size = dinfo->prop.maxGridSize[0];
 133
 134     /* do we exceed the grid x dimension limit? */
 135     if (nwork_units > max_grid_x_size)
 136     {
 137         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 138                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 139                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 140     }
 141
 142     return nwork_units;
 143 }
 144
 145
 146 /* Constant arrays listing all kernel function pointers and enabling selection
 147    of a kernel in an elegant manner. */
 148
 149 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 150 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 151
 152 /*! Pointers to the default kernels organized in a 3 dim array by:
 153  *  electrostatics type, energy calculation on/off, and pruning on/off.
 154  *
 155  *  Note that the order of electrostatics (1st dimension) has to match the
 156  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 157  */
 158 static const nbnxn_cu_kfunc_ptr_t
 159 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 160 {
 161     { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
 162       { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
 163     { { k_nbnxn_rf,                         k_nbnxn_rf_prune },
 164       { k_nbnxn_rf_ener,                    k_nbnxn_rf_ener_prune } },
 165     { { k_nbnxn_ewald_tab,                  k_nbnxn_ewald_tab_prune },
 166       { k_nbnxn_ewald_tab_ener,             k_nbnxn_ewald_tab_ener_prune } },
 167     { { k_nbnxn_ewald_tab_twin,             k_nbnxn_ewald_tab_twin_prune },
 168       { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_twin_ener_prune } },
 169     { { k_nbnxn_ewald,                      k_nbnxn_ewald_prune },
 170       { k_nbnxn_ewald_ener,                 k_nbnxn_ewald_ener_prune } },
 171     { { k_nbnxn_ewald_twin,                 k_nbnxn_ewald_twin_prune },
 172       { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
 173 };
 174
 175 /*! Pointers to the legacy kernels organized in a 3 dim array by:
 176  *  electrostatics type, energy calculation on/off, and pruning on/off.
 177  *
 178  *  Note that the order of electrostatics (1st dimension) has to match the
 179  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 180  */
 181 static const nbnxn_cu_kfunc_ptr_t
 182 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 183 {
 184     { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
 185       { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
 186     { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
 187       { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
 188     { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
 189       { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
 190     { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
 191       { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
 192 };
 193
 194 /*! Return a pointer to the kernel version to be executed at the current step. */
 195 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 196                                                        bool bDoEne, bool bDoPrune)
 197 {
 198     assert(kver < eNbnxnCuKNR);
 199     assert(eeltype < eelCuNR);
 200
 201     if (NBNXN_KVER_LEGACY(kver))
 202     {
 203         /* no analytical Ewald with legacy kernels */
 204         assert(eeltype <= eelCuEWALD_TAB_TWIN);
 205
 206         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 207     }
 208     else
 209     {
 210         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 211     }
 212 }
 213
 214 /*! Calculates the amount of shared memory required for kernel version in use. */
 215 static inline int calc_shmem_required(int kver)
 216 {
 217     int shmem;
 218
 219     /* size of shmem (force-buffers/xq/atom type preloading) */
 220     if (NBNXN_KVER_LEGACY(kver))
 221     {
 222         /* i-atom x+q in shared memory */
 223         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 224         /* force reduction buffers in shared memory */
 225         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 226     }
 227     else
 228     {
 229         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 230         /* i-atom x+q in shared memory */
 231         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 232         /* cj in shared memory, for both warps separately */
 233         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 234 #ifdef IATYPE_SHMEM
 235         /* i-atom types in shared memory */
 236         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 237 #endif
 238 #if __CUDA_ARCH__ < 300
 239         /* force reduction buffers in shared memory */
 240         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 241 #endif
 242     }
 243
 244     return shmem;
 245 }
 246
 247 /*! As we execute nonbonded workload in separate streams, before launching
 248    the kernel we need to make sure that he following operations have completed:
 249    - atomdata allocation and related H2D transfers (every nstlist step);
 250    - pair list H2D transfer (every nstlist step);
 251    - shift vector H2D transfer (every nstlist step);
 252    - force (+shift force and energy) output clearing (every step).
 253
 254    These operations are issued in the local stream at the beginning of the step
 255    and therefore always complete before the local kernel launch. The non-local
 256    kernel is launched after the local on the same device/context, so this is
 257    inherently scheduled after the operations in the local stream (including the
 258    above "misc_ops").
 259    However, for the sake of having a future-proof implementation, we use the
 260    misc_ops_done event to record the point in time when the above  operations
 261    are finished and synchronize with this event in the non-local stream.
 262 */
 263 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 264                               const nbnxn_atomdata_t *nbatom,
 265                               int flags,
 266                               int iloc)
 267 {
 268     cudaError_t stat;
 269     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 270     /* CUDA kernel launch-related stuff */
 271     int  shmem, nblock;
 272     dim3 dim_block, dim_grid;
 273     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 274
 275     cu_atomdata_t   *adat   = cu_nb->atdat;
 276     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 277     cu_plist_t      *plist  = cu_nb->plist[iloc];
 278     cu_timers_t     *t      = cu_nb->timers;
 279     cudaStream_t    stream  = cu_nb->stream[iloc];
 280
 281     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 282     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 283     bool bDoTime     = cu_nb->bDoTime;
 284
 285     /* turn energy calculation always on/off (for debugging/testing only) */
 286     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 287
 288     /* don't launch the kernel if there is no work to do */
 289     if (plist->nsci == 0)
 290     {
 291         return;
 292     }
 293
 294     /* calculate the atom data index range based on locality */
 295     if (LOCAL_I(iloc))
 296     {
 297         adat_begin  = 0;
 298         adat_len    = adat->natoms_local;
 299     }
 300     else
 301     {
 302         adat_begin  = adat->natoms_local;
 303         adat_len    = adat->natoms - adat->natoms_local;
 304     }
 305
 306     /* When we get here all misc operations issues in the local stream are done,
 307        so we record that in the local stream and wait for it in the nonlocal one. */
 308     if (cu_nb->bUseTwoStreams)
 309     {
 310         if (iloc == eintLocal)
 311         {
 312             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 313             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 314         }
 315         else
 316         {
 317             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 318             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 319         }
 320     }
 321
 322     /* beginning of timed HtoD section */
 323     if (bDoTime)
 324     {
 325         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 326         CU_RET_ERR(stat, "cudaEventRecord failed");
 327     }
 328
 329     /* HtoD x, q */
 330     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 331                       adat_len * sizeof(*adat->xq), stream);
 332
 333     if (bDoTime)
 334     {
 335         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 336         CU_RET_ERR(stat, "cudaEventRecord failed");
 337     }
 338
 339     /* beginning of timed nonbonded calculation section */
 340     if (bDoTime)
 341     {
 342         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 343         CU_RET_ERR(stat, "cudaEventRecord failed");
 344     }
 345
 346     /* get the pointer to the kernel flavor we need to use */
 347     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 348                                     plist->bDoPrune || always_prune);
 349
 350     /* kernel launch config */
 351     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 352     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 353     dim_grid  = dim3(nblock, 1, 1);
 354     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 355
 356     if (debug)
 357     {
 358         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 359                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 360                 dim_block.x, dim_block.y, dim_block.z,
 361                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 362                 NCL_PER_SUPERCL, plist->na_c);
 363     }
 364
 365     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 366     CU_LAUNCH_ERR("k_calc_nb");
 367
 368     if (bDoTime)
 369     {
 370         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 371         CU_RET_ERR(stat, "cudaEventRecord failed");
 372     }
 373 }
 374
 375 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 376                                const nbnxn_atomdata_t *nbatom,
 377                                int flags,
 378                                int aloc)
 379 {
 380     cudaError_t stat;
 381     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 382     int iloc = -1;
 383
 384     /* determine interaction locality from atom locality */
 385     if (LOCAL_A(aloc))
 386     {
 387         iloc = eintLocal;
 388     }
 389     else if (NONLOCAL_A(aloc))
 390     {
 391         iloc = eintNonlocal;
 392     }
 393     else
 394     {
 395         char stmp[STRLEN];
 396         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 397                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 398         gmx_incons(stmp);
 399     }
 400
 401     cu_atomdata_t   *adat   = cu_nb->atdat;
 402     cu_timers_t     *t      = cu_nb->timers;
 403     bool            bDoTime = cu_nb->bDoTime;
 404     cudaStream_t    stream  = cu_nb->stream[iloc];
 405
 406     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 407     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 408
 409     /* don't launch copy-back if there was no work to do */
 410     if (cu_nb->plist[iloc]->nsci == 0)
 411     {
 412         return;
 413     }
 414
 415     /* calculate the atom data index range based on locality */
 416     if (LOCAL_A(aloc))
 417     {
 418         adat_begin  = 0;
 419         adat_len    = adat->natoms_local;
 420         adat_end    = cu_nb->atdat->natoms_local;
 421     }
 422     else
 423     {
 424         adat_begin  = adat->natoms_local;
 425         adat_len    = adat->natoms - adat->natoms_local;
 426         adat_end    = cu_nb->atdat->natoms;
 427     }
 428
 429     /* beginning of timed D2H section */
 430     if (bDoTime)
 431     {
 432         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 433         CU_RET_ERR(stat, "cudaEventRecord failed");
 434     }
 435
 436     if (!cu_nb->bUseStreamSync)
 437     {
 438         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 439            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 440         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 441         {
 442 #ifdef NAN
 443             nbatom->out[0].f[i] = NAN;
 444 #else
 445 #  ifdef _MSVC
 446             if (numeric_limits<float>::has_quiet_NaN)
 447             {
 448                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 449             }
 450             else
 451 #  endif
 452             {
 453                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 454             }
 455 #endif
 456         }
 457
 458         /* Set the last four bytes of the force array to a bit pattern
 459            which can't be the result of the force calculation:
 460            max exponent (127) and zero mantissa. */
 461         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 462     }
 463
 464     /* With DD the local D2H transfer can only start after the non-local
 465        has been launched. */
 466     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 467     {
 468         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 469         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 470     }
 471
 472     /* DtoH f */
 473     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 474                       (adat_len)*sizeof(*adat->f), stream);
 475
 476     /* After the non-local D2H is launched the nonlocal_done event can be
 477        recorded which signals that the local D2H can proceed. This event is not
 478        placed after the non-local kernel because we first need the non-local
 479        data back first. */
 480     if (iloc == eintNonlocal)
 481     {
 482         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 483         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 484     }
 485
 486     /* only transfer energies in the local stream */
 487     if (LOCAL_I(iloc))
 488     {
 489         /* DtoH fshift */
 490         if (bCalcFshift)
 491         {
 492             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 493                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 494         }
 495
 496         /* DtoH energies */
 497         if (bCalcEner)
 498         {
 499             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 500                               sizeof(*cu_nb->nbst.e_lj), stream);
 501             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 502                               sizeof(*cu_nb->nbst.e_el), stream);
 503         }
 504     }
 505
 506     if (bDoTime)
 507     {
 508         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 509         CU_RET_ERR(stat, "cudaEventRecord failed");
 510     }
 511 }
 512
 513 /* Atomic compare-exchange operation on unsigned values. It is used in
 514  * polling wait for the GPU.
 515  */
 516 static inline bool atomic_cas(volatile unsigned int *ptr,
 517                               unsigned int oldval,
 518                               unsigned int newval)
 519 {
 520     assert(ptr);
 521
 522 #ifdef TMPI_ATOMICS
 523     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 524 #else
 525     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 526     return true;
 527 #endif
 528 }
 529
 530 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 531                          const nbnxn_atomdata_t *nbatom,
 532                          int flags, int aloc,
 533                          real *e_lj, real *e_el, rvec *fshift)
 534 {
 535     /* NOTE:  only implemented for single-precision at this time */
 536     cudaError_t stat;
 537     int i, adat_end, iloc = -1;
 538     volatile unsigned int *poll_word;
 539
 540     /* determine interaction locality from atom locality */
 541     if (LOCAL_A(aloc))
 542     {
 543         iloc = eintLocal;
 544     }
 545     else if (NONLOCAL_A(aloc))
 546     {
 547         iloc = eintNonlocal;
 548     }
 549     else
 550     {
 551         char stmp[STRLEN];
 552         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 553                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 554         gmx_incons(stmp);
 555     }
 556
 557     cu_plist_t      *plist   = cu_nb->plist[iloc];
 558     cu_timers_t     *timers  = cu_nb->timers;
 559     wallclock_gpu_t *timings = cu_nb->timings;
 560     nb_staging      nbst     = cu_nb->nbst;
 561
 562     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 563     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 564
 565     /* turn energy calculation always on/off (for debugging/testing only) */
 566     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 567
 568     /* don't launch wait/update timers & counters if there was no work to do
 569
 570        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 571        counters could end up being inconsistent due to not being incremented
 572        on some of the nodes! */
 573     if (cu_nb->plist[iloc]->nsci == 0)
 574     {
 575         return;
 576     }
 577
 578     /* calculate the atom data index range based on locality */
 579     if (LOCAL_A(aloc))
 580     {
 581         adat_end = cu_nb->atdat->natoms_local;
 582     }
 583     else
 584     {
 585         adat_end = cu_nb->atdat->natoms;
 586     }
 587
 588     if (cu_nb->bUseStreamSync)
 589     {
 590         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 591         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 592     }
 593     else
 594     {
 595         /* Busy-wait until we get the signal pattern set in last byte
 596          * of the l/nl float vector. This pattern corresponds to a floating
 597          * point number which can't be the result of the force calculation
 598          * (maximum, 127 exponent and 0 mantissa).
 599          * The polling uses atomic compare-exchange.
 600          */
 601         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 602         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 603     }
 604
 605     /* timing data accumulation */
 606     if (cu_nb->bDoTime)
 607     {
 608         /* only increase counter once (at local F wait) */
 609         if (LOCAL_I(iloc))
 610         {
 611             timings->nb_c++;
 612             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 613         }
 614
 615         /* kernel timings */
 616         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 617             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 618
 619         /* X/q H2D and F D2H timings */
 620         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 621                                                  timers->stop_nb_h2d[iloc]);
 622         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 623                                                  timers->stop_nb_d2h[iloc]);
 624
 625         /* only count atdat and pair-list H2D at pair-search step */
 626         if (plist->bDoPrune)
 627         {
 628             /* atdat transfer timing (add only once, at local F wait) */
 629             if (LOCAL_A(aloc))
 630             {
 631                 timings->pl_h2d_c++;
 632                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 633                                                          timers->stop_atdat);
 634             }
 635
 636             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 637                                                      timers->stop_pl_h2d[iloc]);
 638         }
 639     }
 640
 641     /* add up energies and shift forces (only once at local F wait) */
 642     if (LOCAL_I(iloc))
 643     {
 644         if (bCalcEner)
 645         {
 646             *e_lj += *nbst.e_lj;
 647             *e_el += *nbst.e_el;
 648         }
 649
 650         if (bCalcFshift)
 651         {
 652             for (i = 0; i < SHIFTS; i++)
 653             {
 654                 fshift[i][0] += nbst.fshift[i].x;
 655                 fshift[i][1] += nbst.fshift[i].y;
 656                 fshift[i][2] += nbst.fshift[i].z;
 657             }
 658         }
 659     }
 660
 661     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 662     plist->bDoPrune = false;
 663 }
 664
 665 /*! Return the reference to the nbfp texture. */
 666 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 667 {
 668     return nbfp_texref;
 669 }
 670
 671 /*! Return the reference to the coulomb_tab. */
 672 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 673 {
 674     return coulomb_tab_texref;
 675 }
 676
 677 /*! Set up the cache configuration for the non-bonded kernels,
 678  */
 679 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 680 {
 681     cudaError_t stat;
 682
 683     for (int i = 0; i < eelCuNR; i++)
 684     {
 685         for (int j = 0; j < nEnergyKernelTypes; j++)
 686         {
 687             for (int k = 0; k < nPruneKernelTypes; k++)
 688             {
 689                 /* Legacy kernel 16/48 kB Shared/L1
 690                  * No analytical Ewald!
 691                  */
 692                 if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
 693                 {
 694                     stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 695                     CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 696                 }
 697
 698                 if (devinfo->prop.major >= 3)
 699                 {
 700                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 701                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 702                 }
 703                 else
 704                 {
 705                     /* On Fermi prefer L1 gives 2% higher performance */
 706                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 707                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 708                 }
 709                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 710             }
 711         }
 712     }
 713 }