src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxm_gpu_data_mgmt.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  40  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  41  *  \author Szilárd Páll <pall.szilard@gmail.com>
  42  */
  43 #include "gmxpre.h"
  44
  45 #include <assert.h>
  46 #include <stdarg.h>
  47 #include <stdio.h>
  48 #include <stdlib.h>
  49 #include <string.h>
  50
  51 #include <cmath>
  52
  53 #include "gromacs/gpu_utils/gpu_utils.h"
  54 #include "gromacs/gpu_utils/oclutils.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/math/vectypes.h"
  57 #include "gromacs/mdlib/force_flags.h"
  58 #include "gromacs/mdtypes/interaction_const.h"
  59 #include "gromacs/mdtypes/md_enums.h"
  60 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  61 #include "gromacs/nbnxm/gpu_jit_support.h"
  62 #include "gromacs/nbnxm/nbnxm.h"
  63 #include "gromacs/nbnxm/nbnxm_gpu.h"
  64 #include "gromacs/pbcutil/ishift.h"
  65 #include "gromacs/timing/gpu_timing.h"
  66 #include "gromacs/utility/cstringutil.h"
  67 #include "gromacs/utility/fatalerror.h"
  68 #include "gromacs/utility/gmxassert.h"
  69 #include "gromacs/utility/real.h"
  70 #include "gromacs/utility/smalloc.h"
  71
  72 #include "nbnxm_ocl_internal.h"
  73 #include "nbnxm_ocl_types.h"
  74
  75 /*! \brief This parameter should be determined heuristically from the
  76  * kernel execution times
  77  *
  78  * This value is best for small systems on a single AMD Radeon R9 290X
  79  * (and about 5% faster than 40, which is the default for CUDA
  80  * devices). Larger simulation systems were quite insensitive to the
  81  * value of this parameter.
  82  */
  83 static unsigned int gpu_min_ci_balanced_factor = 50;
  84
  85
  86 /*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
  87  *
  88  * Full doc in nbnxn_ocl_internal.h */
  89 bool useLjCombRule(int vdwType)
  90 {
  91     return (vdwType == evdwOclCUTCOMBGEOM ||
  92             vdwType == evdwOclCUTCOMBLB);
  93 }
  94
  95 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
  96  * and the table GPU array.
  97  *
  98  * If called with an already allocated table, it just re-uploads the
  99  * table.
 100  */
 101 static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
 102                                            cl_nbparam_t                    *nbp,
 103                                            const gmx_device_runtime_data_t *runData)
 104 {
 105     cl_mem       coul_tab;
 106
 107     cl_int       cl_error;
 108
 109     if (nbp->coulomb_tab_climg2d != nullptr)
 110     {
 111         freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
 112     }
 113
 114     /* Switched from using textures to using buffers */
 115     // TODO: decide which alternative is most efficient - textures or buffers.
 116     /*
 117        cl_image_format array_format;
 118
 119        array_format.image_channel_data_type = CL_FLOAT;
 120        array_format.image_channel_order     = CL_R;
 121
 122        coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 123        &array_format, tabsize, 1, 0, ftmp, &cl_error);
 124      */
 125
 126     coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 127                               ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
 128     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 129                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 130
 131     nbp->coulomb_tab_climg2d  = coul_tab;
 132     nbp->coulomb_tab_scale    = ic->tabq_scale;
 133 }
 134
 135
 136 /*! \brief Initializes the atomdata structure first time, it only gets filled at
 137     pair-search.
 138  */
 139 static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
 140 {
 141     cl_int cl_error;
 142
 143     ad->ntypes  = ntypes;
 144
 145     /* An element of the shift_vec device buffer has the same size as one element
 146        of the host side shift_vec buffer. */
 147     ad->shift_vec_elem_size = sizeof(*nbnxn_atomdata_t::shift_vec.data());
 148
 149     ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 150                                    SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error);
 151     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 152                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 153     ad->bShiftVecUploaded = CL_FALSE;
 154
 155     /* An element of the fshift device buffer has the same size as one element
 156        of the host side fshift buffer. */
 157     ad->fshift_elem_size = sizeof(*cl_nb_staging_t::fshift);
 158
 159     ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 160                                 SHIFTS * ad->fshift_elem_size, nullptr, &cl_error);
 161     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 162                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 163
 164     ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 165                               sizeof(float), nullptr, &cl_error);
 166     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 167                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 168
 169     ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 170                               sizeof(float), nullptr, &cl_error);
 171     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 172                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 173
 174     /* initialize to nullptr pointers to data that is not allocated here and will
 175        need reallocation in nbnxn_gpu_init_atomdata */
 176     ad->xq = nullptr;
 177     ad->f  = nullptr;
 178
 179     /* size -1 indicates that the respective array hasn't been initialized yet */
 180     ad->natoms = -1;
 181     ad->nalloc = -1;
 182 }
 183
 184 /*! \brief Copies all parameters related to the cut-off from ic to nbp
 185  */
 186 static void set_cutoff_parameters(cl_nbparam_t              *nbp,
 187                                   const interaction_const_t *ic,
 188                                   const NbnxnListParameters *listParams)
 189 {
 190     nbp->ewald_beta        = ic->ewaldcoeff_q;
 191     nbp->sh_ewald          = ic->sh_ewald;
 192     nbp->epsfac            = ic->epsfac;
 193     nbp->two_k_rf          = 2.0 * ic->k_rf;
 194     nbp->c_rf              = ic->c_rf;
 195     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 196     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 197     nbp->rlistOuter_sq     = listParams->rlistOuter * listParams->rlistOuter;
 198     nbp->rlistInner_sq     = listParams->rlistInner * listParams->rlistInner;
 199     nbp->useDynamicPruning = listParams->useDynamicPruning;
 200
 201     nbp->sh_lj_ewald       = ic->sh_lj_ewald;
 202     nbp->ewaldcoeff_lj     = ic->ewaldcoeff_lj;
 203
 204     nbp->rvdw_switch       = ic->rvdw_switch;
 205     nbp->dispersion_shift  = ic->dispersion_shift;
 206     nbp->repulsion_shift   = ic->repulsion_shift;
 207     nbp->vdw_switch        = ic->vdw_switch;
 208 }
 209
 210 /*! \brief Returns the kinds of electrostatics and Vdw OpenCL
 211  *  kernels that will be used.
 212  *
 213  * Respectively, these values are from enum eelOcl and enum
 214  * evdwOcl. */
 215 static void
 216 map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
 217                                             int                        combRule,
 218                                             int                       *gpu_eeltype,
 219                                             int                       *gpu_vdwtype)
 220 {
 221     if (ic->vdwtype == evdwCUT)
 222     {
 223         switch (ic->vdw_modifier)
 224         {
 225             case eintmodNONE:
 226             case eintmodPOTSHIFT:
 227                 switch (combRule)
 228                 {
 229                     case ljcrNONE:
 230                         *gpu_vdwtype = evdwOclCUT;
 231                         break;
 232                     case ljcrGEOM:
 233                         *gpu_vdwtype = evdwOclCUTCOMBGEOM;
 234                         break;
 235                     case ljcrLB:
 236                         *gpu_vdwtype = evdwOclCUTCOMBLB;
 237                         break;
 238                     default:
 239                         gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
 240                 }
 241                 break;
 242             case eintmodFORCESWITCH:
 243                 *gpu_vdwtype = evdwOclFSWITCH;
 244                 break;
 245             case eintmodPOTSWITCH:
 246                 *gpu_vdwtype = evdwOclPSWITCH;
 247                 break;
 248             default:
 249                 gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
 250         }
 251     }
 252     else if (ic->vdwtype == evdwPME)
 253     {
 254         if (ic->ljpme_comb_rule == ljcrGEOM)
 255         {
 256             *gpu_vdwtype = evdwOclEWALDGEOM;
 257         }
 258         else
 259         {
 260             *gpu_vdwtype = evdwOclEWALDLB;
 261         }
 262     }
 263     else
 264     {
 265         gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
 266     }
 267
 268     if (ic->eeltype == eelCUT)
 269     {
 270         *gpu_eeltype = eelOclCUT;
 271     }
 272     else if (EEL_RF(ic->eeltype))
 273     {
 274         *gpu_eeltype = eelOclRF;
 275     }
 276     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 277     {
 278         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 279         *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
 280     }
 281     else
 282     {
 283         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 284         gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
 285     }
 286 }
 287
 288 /*! \brief Initializes the nonbonded parameter data structure.
 289  */
 290 static void init_nbparam(cl_nbparam_t                    *nbp,
 291                          const interaction_const_t       *ic,
 292                          const NbnxnListParameters       *listParams,
 293                          const nbnxn_atomdata_t::Params  &nbatParams,
 294                          const gmx_device_runtime_data_t *runData)
 295 {
 296     cl_int cl_error;
 297
 298     set_cutoff_parameters(nbp, ic, listParams);
 299
 300     map_interaction_types_to_gpu_kernel_flavors(ic,
 301                                                 nbatParams.comb_rule,
 302                                                 &(nbp->eeltype),
 303                                                 &(nbp->vdwtype));
 304
 305     if (ic->vdwtype == evdwPME)
 306     {
 307         if (ic->ljpme_comb_rule == ljcrGEOM)
 308         {
 309             GMX_ASSERT(nbatParams.comb_rule == ljcrGEOM, "Combination rule mismatch!");
 310         }
 311         else
 312         {
 313             GMX_ASSERT(nbatParams.comb_rule == ljcrLB, "Combination rule mismatch!");
 314         }
 315     }
 316     /* generate table for PME */
 317     nbp->coulomb_tab_climg2d = nullptr;
 318     if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
 319     {
 320         init_ewald_coulomb_force_table(ic, nbp, runData);
 321     }
 322     else
 323     // TODO: improvement needed.
 324     // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
 325     // don't accept nullptr values for image2D parameters.
 326     {
 327         /* Switched from using textures to using buffers */
 328         // TODO: decide which alternative is most efficient - textures or buffers.
 329         /*
 330            cl_image_format array_format;
 331
 332            array_format.image_channel_data_type = CL_FLOAT;
 333            array_format.image_channel_order     = CL_R;
 334
 335            nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 336             &array_format, 1, 1, 0, nullptr, &cl_error);
 337          */
 338
 339         nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
 340         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 341                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 342     }
 343
 344     const int nnbfp      = 2*nbatParams.numTypes*nbatParams.numTypes;
 345     const int nnbfp_comb = 2*nbatParams.numTypes;
 346
 347     {
 348         /* Switched from using textures to using buffers */
 349         // TODO: decide which alternative is most efficient - textures or buffers.
 350         /*
 351            cl_image_format array_format;
 352
 353            array_format.image_channel_data_type = CL_FLOAT;
 354            array_format.image_channel_order     = CL_R;
 355
 356            nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 357             &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
 358          */
 359
 360         nbp->nbfp_climg2d =
 361             clCreateBuffer(runData->context,
 362                            CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 363                            nnbfp*sizeof(cl_float),
 364                            const_cast<float *>(nbatParams.nbfp.data()),
 365                            &cl_error);
 366         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 367                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 368
 369         if (ic->vdwtype == evdwPME)
 370         {
 371             /* Switched from using textures to using buffers */
 372             // TODO: decide which alternative is most efficient - textures or buffers.
 373             /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 374                 &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
 375             nbp->nbfp_comb_climg2d =
 376                 clCreateBuffer(runData->context,
 377                                CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 378                                nnbfp_comb*sizeof(cl_float),
 379                                const_cast<float *>(nbatParams.nbfp_comb.data()),
 380                                &cl_error);
 381             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 382                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 383         }
 384         else
 385         {
 386             // TODO: improvement needed.
 387             // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
 388             // don't accept nullptr values for image2D parameters.
 389             /* Switched from using textures to using buffers */
 390             // TODO: decide which alternative is most efficient - textures or buffers.
 391             /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 392                 &array_format, 1, 1, 0, nullptr, &cl_error);*/
 393             nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
 394             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 395                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 396         }
 397     }
 398 }
 399
 400 //! This function is documented in the header file
 401 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 402                                         const interaction_const_t   *ic,
 403                                         const NbnxnListParameters   *listParams)
 404 {
 405     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 406     {
 407         return;
 408     }
 409     gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
 410     cl_nbparam_t       *nbp = nb->nbparam;
 411
 412     set_cutoff_parameters(nbp, ic, listParams);
 413
 414     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
 415
 416     init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata);
 417 }
 418
 419 /*! \brief Initializes the pair list data structure.
 420  */
 421 static void init_plist(cl_plist_t *pl)
 422 {
 423     /* initialize to nullptr pointers to data that is not allocated here and will
 424        need reallocation in nbnxn_gpu_init_pairlist */
 425     pl->sci     = nullptr;
 426     pl->cj4     = nullptr;
 427     pl->imask   = nullptr;
 428     pl->excl    = nullptr;
 429
 430     /* size -1 indicates that the respective array hasn't been initialized yet */
 431     pl->na_c           = -1;
 432     pl->nsci           = -1;
 433     pl->sci_nalloc     = -1;
 434     pl->ncj4           = -1;
 435     pl->cj4_nalloc     = -1;
 436     pl->nimask         = -1;
 437     pl->imask_nalloc   = -1;
 438     pl->nexcl          = -1;
 439     pl->excl_nalloc    = -1;
 440     pl->haveFreshList  = false;
 441 }
 442
 443 /*! \brief Initializes the timer data structure.
 444  */
 445 static void init_timers(cl_timers_t *t,
 446                         bool         bUseTwoStreams)
 447 {
 448     for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 449     {
 450         t->didPairlistH2D[i]  = false;
 451         t->didPrune[i]        = false;
 452         t->didRollingPrune[i] = false;
 453     }
 454 }
 455
 456 /*! \brief Initializes the timings data structure.
 457  */
 458 static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
 459 {
 460     int i, j;
 461
 462     t->nb_h2d_t = 0.0;
 463     t->nb_d2h_t = 0.0;
 464     t->nb_c     = 0;
 465     t->pl_h2d_t = 0.0;
 466     t->pl_h2d_c = 0;
 467     for (i = 0; i < 2; i++)
 468     {
 469         for (j = 0; j < 2; j++)
 470         {
 471             t->ktime[i][j].t = 0.0;
 472             t->ktime[i][j].c = 0;
 473         }
 474     }
 475
 476     t->pruneTime.c        = 0;
 477     t->pruneTime.t        = 0.0;
 478     t->dynamicPruneTime.c = 0;
 479     t->dynamicPruneTime.t = 0.0;
 480 }
 481
 482 /*! \brief Creates context for OpenCL GPU given by \p mygpu
 483  *
 484  * A fatal error results if creation fails.
 485  *
 486  * \param[inout] runtimeData runtime data including program and context
 487  * \param[in]    devInfo     device info struct
 488  * \param[in]    rank        MPI rank (for error reporting)
 489  */
 490 static void
 491 nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
 492                          const gmx_device_info_t   *devInfo,
 493                          int                        rank)
 494 {
 495     cl_context_properties     context_properties[3];
 496     cl_platform_id            platform_id;
 497     cl_device_id              device_id;
 498     cl_context                context;
 499     cl_int                    cl_error;
 500
 501     assert(runtimeData != nullptr);
 502     assert(devInfo != nullptr);
 503
 504     platform_id      = devInfo->ocl_gpu_id.ocl_platform_id;
 505     device_id        = devInfo->ocl_gpu_id.ocl_device_id;
 506
 507     context_properties[0] = CL_CONTEXT_PLATFORM;
 508     context_properties[1] = reinterpret_cast<cl_context_properties>(platform_id);
 509     context_properties[2] = 0; /* Terminates the list of properties */
 510
 511     context = clCreateContext(context_properties, 1, &device_id, nullptr, nullptr, &cl_error);
 512     if (CL_SUCCESS != cl_error)
 513     {
 514         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
 515                   rank,
 516                   devInfo->device_name,
 517                   cl_error, ocl_get_error_string(cl_error).c_str());
 518     }
 519
 520     runtimeData->context = context;
 521 }
 522
 523 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 524 static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
 525                                          const char      *kernel_name)
 526 {
 527     cl_kernel kernel;
 528     cl_int    cl_error;
 529
 530     kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
 531     if (CL_SUCCESS != cl_error)
 532     {
 533         gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
 534                   kernel_name,
 535                   nb->dev_info->device_name,
 536                   cl_error);
 537     }
 538
 539     return kernel;
 540 }
 541
 542 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
 543  */
 544 static void
 545 nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
 546 {
 547
 548     cl_int               cl_error;
 549     cl_atomdata_t *      adat     = nb->atdat;
 550     cl_command_queue     ls       = nb->stream[eintLocal];
 551
 552     size_t               local_work_size[3]   = {1, 1, 1};
 553     size_t               global_work_size[3]  = {1, 1, 1};
 554
 555     cl_int               shifts   = SHIFTS*3;
 556
 557     cl_int               arg_no;
 558
 559     cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
 560
 561     local_work_size[0]   = 64;
 562     // Round the total number of threads up from the array size
 563     global_work_size[0]  = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
 564
 565     arg_no    = 0;
 566     cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
 567     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 568     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
 569     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
 570     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 571
 572     cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr);
 573     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 574 }
 575
 576 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 577 static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
 578 {
 579     /* Init to 0 main kernel arrays */
 580     /* They will be later on initialized in select_nbnxn_kernel */
 581     // TODO: consider always creating all variants of the kernels here so that there is no
 582     // need for late call to clCreateKernel -- if that gives any advantage?
 583     memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
 584     memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
 585     memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
 586     memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
 587
 588     /* Init pruning kernels
 589      *
 590      * TODO: we could avoid creating kernels if dynamic pruning is turned off,
 591      * but ATM that depends on force flags not passed into the initialization.
 592      */
 593     nb->kernel_pruneonly[epruneFirst]   = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl");
 594     nb->kernel_pruneonly[epruneRolling] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
 595
 596     /* Init auxiliary kernels */
 597     nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
 598 }
 599
 600 /*! \brief Initializes simulation constant data.
 601  *
 602  *  Initializes members of the atomdata and nbparam structs and
 603  *  clears e/fshift output buffers.
 604  */
 605 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 606                                  const interaction_const_t      *ic,
 607                                  const NbnxnListParameters      *listParams,
 608                                  const nbnxn_atomdata_t::Params &nbatParams)
 609 {
 610     init_atomdata_first(nb->atdat, nbatParams.numTypes, nb->dev_rundata);
 611     init_nbparam(nb->nbparam, ic, listParams, nbatParams, nb->dev_rundata);
 612 }
 613
 614
 615 //! This function is documented in the header file
 616 void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
 617                     const gmx_device_info_t   *deviceInfo,
 618                     const interaction_const_t *ic,
 619                     const NbnxnListParameters *listParams,
 620                     const nbnxn_atomdata_t    *nbat,
 621                     int                        rank,
 622                     gmx_bool                   bLocalAndNonlocal)
 623 {
 624     gmx_nbnxn_ocl_t            *nb;
 625     cl_int                      cl_error;
 626     cl_command_queue_properties queue_properties;
 627
 628     assert(ic);
 629
 630     if (p_nb == nullptr)
 631     {
 632         return;
 633     }
 634
 635     snew(nb, 1);
 636     snew(nb->atdat, 1);
 637     snew(nb->nbparam, 1);
 638     snew(nb->plist[eintLocal], 1);
 639     if (bLocalAndNonlocal)
 640     {
 641         snew(nb->plist[eintNonlocal], 1);
 642     }
 643
 644     nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
 645
 646     nb->timers = new cl_timers_t();
 647     snew(nb->timings, 1);
 648
 649     /* set device info, just point it to the right GPU among the detected ones */
 650     nb->dev_info = deviceInfo;
 651     snew(nb->dev_rundata, 1);
 652
 653     /* init nbst */
 654     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
 655     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
 656     pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
 657
 658     init_plist(nb->plist[eintLocal]);
 659
 660     /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
 661     nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 662
 663     /* Create queues only after bDoTime has been initialized */
 664     if (nb->bDoTime)
 665     {
 666         queue_properties = CL_QUEUE_PROFILING_ENABLE;
 667     }
 668     else
 669     {
 670         queue_properties = 0;
 671     }
 672
 673     nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
 674
 675     /* local/non-local GPU streams */
 676     nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 677     if (CL_SUCCESS != cl_error)
 678     {
 679         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 680                   rank,
 681                   nb->dev_info->device_name,
 682                   cl_error);
 683     }
 684
 685     if (nb->bUseTwoStreams)
 686     {
 687         init_plist(nb->plist[eintNonlocal]);
 688
 689         nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 690         if (CL_SUCCESS != cl_error)
 691         {
 692             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 693                       rank,
 694                       nb->dev_info->device_name,
 695                       cl_error);
 696         }
 697     }
 698
 699     if (nb->bDoTime)
 700     {
 701         init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE);
 702         init_timings(nb->timings);
 703     }
 704
 705     nbnxn_ocl_init_const(nb, ic, listParams, nbat->params());
 706
 707     /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
 708      * TODO: decide about NVIDIA
 709      */
 710     nb->bPrefetchLjParam =
 711         (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr) &&
 712         ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (nb->dev_info->vendor_e == OCL_VENDOR_INTEL)
 713          || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
 714
 715     /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
 716      * but sadly this is not supported in OpenCL (yet?). Consider adding it if
 717      * it becomes supported.
 718      */
 719     nbnxn_gpu_compile_kernels(nb);
 720     nbnxn_gpu_init_kernels(nb);
 721
 722     /* clear energy and shift force outputs */
 723     nbnxn_ocl_clear_e_fshift(nb);
 724
 725     *p_nb = nb;
 726
 727     if (debug)
 728     {
 729         fprintf(debug, "Initialized OpenCL data structures.\n");
 730     }
 731 }
 732
 733 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
 734  */
 735 static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
 736 {
 737     if (natoms_clear == 0)
 738     {
 739         return;
 740     }
 741
 742     cl_int gmx_used_in_debug cl_error;
 743
 744     cl_atomdata_t           *atomData = nb->atdat;
 745     cl_command_queue         ls       = nb->stream[eintLocal];
 746     cl_float                 value    = 0.0f;
 747
 748     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
 749                                    0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr);
 750     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueFillBuffer failed: " +
 751                                                 ocl_get_error_string(cl_error)).c_str());
 752 }
 753
 754 //! This function is documented in the header file
 755 void
 756 nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
 757                         int                flags)
 758 {
 759     nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
 760     /* clear shift force array and energies if the outputs were
 761        used in the current step */
 762     if (flags & GMX_FORCE_VIRIAL)
 763     {
 764         nbnxn_ocl_clear_e_fshift(nb);
 765     }
 766
 767     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
 768     cl_int gmx_unused cl_error;
 769     cl_error = clFlush(nb->stream[eintLocal]);
 770     assert(CL_SUCCESS == cl_error);
 771 }
 772
 773 //! This function is documented in the header file
 774 void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
 775                              const NbnxnPairlistGpu *h_plist,
 776                              int                     iloc)
 777 {
 778     char             sbuf[STRLEN];
 779     // Timing accumulation should happen only if there was work to do
 780     // because getLastRangeTime() gets skipped with empty lists later
 781     // which leads to the counter not being reset.
 782     bool             bDoTime    = ((nb->bDoTime == CL_TRUE) && !h_plist->sci.empty());
 783     cl_command_queue stream     = nb->stream[iloc];
 784     cl_plist_t      *d_plist    = nb->plist[iloc];
 785
 786     if (d_plist->na_c < 0)
 787     {
 788         d_plist->na_c = h_plist->na_ci;
 789     }
 790     else
 791     {
 792         if (d_plist->na_c != h_plist->na_ci)
 793         {
 794             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 795                     d_plist->na_c, h_plist->na_ci);
 796             gmx_incons(sbuf);
 797         }
 798     }
 799
 800     if (bDoTime)
 801     {
 802         nb->timers->pl_h2d[iloc].openTimingRegion(stream);
 803         nb->timers->didPairlistH2D[iloc] = true;
 804     }
 805
 806     // TODO most of this function is same in CUDA and OpenCL, move into the header
 807     Context context = nb->dev_rundata->context;
 808
 809     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(),
 810                            &d_plist->nsci, &d_plist->sci_nalloc, context);
 811     copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
 812                        stream, GpuApiCallBehavior::Async,
 813                        bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 814
 815     reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
 816                            &d_plist->ncj4, &d_plist->cj4_nalloc, context);
 817     copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
 818                        stream, GpuApiCallBehavior::Async,
 819                        bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 820
 821     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
 822                            &d_plist->nimask, &d_plist->imask_nalloc, context);
 823
 824     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(),
 825                            &d_plist->nexcl, &d_plist->excl_nalloc, context);
 826     copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
 827                        stream, GpuApiCallBehavior::Async,
 828                        bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 829
 830     if (bDoTime)
 831     {
 832         nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
 833     }
 834
 835     /* need to prune the pair list during the next step */
 836     d_plist->haveFreshList = true;
 837 }
 838
 839 //! This function is documented in the header file
 840 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
 841                                const nbnxn_atomdata_t *nbatom)
 842 {
 843     cl_atomdata_t   *adat  = nb->atdat;
 844     cl_command_queue ls    = nb->stream[eintLocal];
 845
 846     /* only if we have a dynamic box */
 847     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 848     {
 849         ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0,
 850                            SHIFTS * adat->shift_vec_elem_size, ls, nullptr);
 851         adat->bShiftVecUploaded = CL_TRUE;
 852     }
 853 }
 854
 855 //! This function is documented in the header file
 856 void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
 857                              const nbnxn_atomdata_t        *nbat)
 858 {
 859     cl_int           cl_error;
 860     int              nalloc, natoms;
 861     bool             realloced;
 862     bool             bDoTime = nb->bDoTime == CL_TRUE;
 863     cl_timers_t     *timers  = nb->timers;
 864     cl_atomdata_t   *d_atdat = nb->atdat;
 865     cl_command_queue ls      = nb->stream[eintLocal];
 866
 867     natoms    = nbat->numAtoms();
 868     realloced = false;
 869
 870     if (bDoTime)
 871     {
 872         /* time async copy */
 873         timers->atdat.openTimingRegion(ls);
 874     }
 875
 876     /* need to reallocate if we have to copy more atoms than the amount of space
 877        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 878     if (natoms > d_atdat->nalloc)
 879     {
 880         nalloc = over_alloc_small(natoms);
 881
 882         /* free up first if the arrays have already been initialized */
 883         if (d_atdat->nalloc != -1)
 884         {
 885             freeDeviceBuffer(&d_atdat->f);
 886             freeDeviceBuffer(&d_atdat->xq);
 887             freeDeviceBuffer(&d_atdat->lj_comb);
 888             freeDeviceBuffer(&d_atdat->atom_types);
 889         }
 890
 891         d_atdat->f_elem_size = sizeof(rvec);
 892
 893         d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 894                                     nalloc * d_atdat->f_elem_size, nullptr, &cl_error);
 895         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 896                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 897
 898         d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 899                                      nalloc * sizeof(cl_float4), nullptr, &cl_error);
 900         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 901                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 902
 903         if (useLjCombRule(nb->nbparam->vdwtype))
 904         {
 905             d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 906                                               nalloc * sizeof(cl_float2), nullptr, &cl_error);
 907             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 908                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 909         }
 910         else
 911         {
 912             d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 913                                                  nalloc * sizeof(int), nullptr, &cl_error);
 914             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 915                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 916         }
 917
 918         d_atdat->nalloc = nalloc;
 919         realloced       = true;
 920     }
 921
 922     d_atdat->natoms       = natoms;
 923     d_atdat->natoms_local = nbat->natoms_local;
 924
 925     /* need to clear GPU f output if realloc happened */
 926     if (realloced)
 927     {
 928         nbnxn_ocl_clear_f(nb, nalloc);
 929     }
 930
 931     if (useLjCombRule(nb->nbparam->vdwtype))
 932     {
 933         ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0,
 934                            natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 935     }
 936     else
 937     {
 938         ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0,
 939                            natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 940
 941     }
 942
 943     if (bDoTime)
 944     {
 945         timers->atdat.closeTimingRegion(ls);
 946     }
 947
 948     /* kick off the tasks enqueued above to ensure concurrency with the search */
 949     cl_error = clFlush(ls);
 950     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 951                        ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 952 }
 953
 954 /*! \brief Releases an OpenCL kernel pointer */
 955 static void free_kernel(cl_kernel *kernel_ptr)
 956 {
 957     cl_int gmx_unused cl_error;
 958
 959     assert(nullptr != kernel_ptr);
 960
 961     if (*kernel_ptr)
 962     {
 963         cl_error = clReleaseKernel(*kernel_ptr);
 964         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseKernel failed: " + ocl_get_error_string(cl_error)).c_str());
 965
 966         *kernel_ptr = nullptr;
 967     }
 968 }
 969
 970 /*! \brief Releases a list of OpenCL kernel pointers */
 971 static void free_kernels(cl_kernel *kernels, int count)
 972 {
 973     int i;
 974
 975     for (i = 0; i < count; i++)
 976     {
 977         free_kernel(kernels + i);
 978     }
 979 }
 980
 981 /*! \brief Free the OpenCL runtime data (context and program).
 982  *
 983  *  The function releases the OpenCL context and program assuciated with the
 984  *  device that the calling PP rank is running on.
 985  *
 986  *  \param runData [in]  porinter to the structure with runtime data.
 987  */
 988 static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
 989 {
 990     if (runData == nullptr)
 991     {
 992         return;
 993     }
 994
 995     cl_int gmx_unused cl_error;
 996
 997     if (runData->context)
 998     {
 999         cl_error         = clReleaseContext(runData->context);
1000         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str());
1001         runData->context = nullptr;
1002     }
1003
1004     if (runData->program)
1005     {
1006         cl_error         = clReleaseProgram(runData->program);
1007         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str());
1008         runData->program = nullptr;
1009     }
1010
1011 }
1012
1013 //! This function is documented in the header file
1014 void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
1015 {
1016     if (nb == nullptr)
1017     {
1018         return;
1019     }
1020
1021     /* Free kernels */
1022     int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
1023     free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);
1024
1025     kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
1026     free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);
1027
1028     kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
1029     free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);
1030
1031     kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
1032     free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
1033
1034     free_kernel(&(nb->kernel_zero_e_fshift));
1035
1036     /* Free atdat */
1037     freeDeviceBuffer(&(nb->atdat->xq));
1038     freeDeviceBuffer(&(nb->atdat->f));
1039     freeDeviceBuffer(&(nb->atdat->e_lj));
1040     freeDeviceBuffer(&(nb->atdat->e_el));
1041     freeDeviceBuffer(&(nb->atdat->fshift));
1042     freeDeviceBuffer(&(nb->atdat->lj_comb));
1043     freeDeviceBuffer(&(nb->atdat->atom_types));
1044     freeDeviceBuffer(&(nb->atdat->shift_vec));
1045     sfree(nb->atdat);
1046
1047     /* Free nbparam */
1048     freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
1049     freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
1050     freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
1051     sfree(nb->nbparam);
1052
1053     /* Free plist */
1054     auto *plist = nb->plist[eintLocal];
1055     freeDeviceBuffer(&plist->sci);
1056     freeDeviceBuffer(&plist->cj4);
1057     freeDeviceBuffer(&plist->imask);
1058     freeDeviceBuffer(&plist->excl);
1059     sfree(plist);
1060     if (nb->bUseTwoStreams)
1061     {
1062         auto *plist_nl = nb->plist[eintNonlocal];
1063         freeDeviceBuffer(&plist_nl->sci);
1064         freeDeviceBuffer(&plist_nl->cj4);
1065         freeDeviceBuffer(&plist_nl->imask);
1066         freeDeviceBuffer(&plist_nl->excl);
1067         sfree(plist_nl);
1068     }
1069
1070     /* Free nbst */
1071     pfree(nb->nbst.e_lj);
1072     nb->nbst.e_lj = nullptr;
1073
1074     pfree(nb->nbst.e_el);
1075     nb->nbst.e_el = nullptr;
1076
1077     pfree(nb->nbst.fshift);
1078     nb->nbst.fshift = nullptr;
1079
1080     /* Free command queues */
1081     clReleaseCommandQueue(nb->stream[eintLocal]);
1082     nb->stream[eintLocal] = nullptr;
1083     if (nb->bUseTwoStreams)
1084     {
1085         clReleaseCommandQueue(nb->stream[eintNonlocal]);
1086         nb->stream[eintNonlocal] = nullptr;
1087     }
1088     /* Free other events */
1089     if (nb->nonlocal_done)
1090     {
1091         clReleaseEvent(nb->nonlocal_done);
1092         nb->nonlocal_done = nullptr;
1093     }
1094     if (nb->misc_ops_and_local_H2D_done)
1095     {
1096         clReleaseEvent(nb->misc_ops_and_local_H2D_done);
1097         nb->misc_ops_and_local_H2D_done = nullptr;
1098     }
1099
1100     free_gpu_device_runtime_data(nb->dev_rundata);
1101     sfree(nb->dev_rundata);
1102
1103     /* Free timers and timings */
1104     delete nb->timers;
1105     sfree(nb->timings);
1106     sfree(nb);
1107
1108     if (debug)
1109     {
1110         fprintf(debug, "Cleaned up OpenCL data structures.\n");
1111     }
1112 }
1113
1114 //! This function is documented in the header file
1115 gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
1116 {
1117     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
1118 }
1119
1120 //! This function is documented in the header file
1121 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1122 {
1123     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1124     {
1125         init_timings(nbv->gpu_nbv->timings);
1126     }
1127 }
1128
1129 //! This function is documented in the header file
1130 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
1131 {
1132     return nb != nullptr ?
1133            gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
1134 }
1135
1136 //! This function is documented in the header file
1137 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
1138 {
1139     return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
1140             (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
1141 }