src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu_data_mgmt.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  40  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  41  *  \author Szilárd Páll <pall.szilard@gmail.com>
  42  */
  43 #include "gmxpre.h"
  44
  45 #include <assert.h>
  46 #include <stdarg.h>
  47 #include <stdio.h>
  48 #include <stdlib.h>
  49 #include <string.h>
  50
  51 #include <cmath>
  52
  53 #include "gromacs/gpu_utils/gpu_utils.h"
  54 #include "gromacs/gpu_utils/oclutils.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/math/vectypes.h"
  57 #include "gromacs/mdlib/force_flags.h"
  58 #include "gromacs/mdlib/nb_verlet.h"
  59 #include "gromacs/mdlib/nbnxn_consts.h"
  60 #include "gromacs/mdlib/nbnxn_gpu.h"
  61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  62 #include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
  63 #include "gromacs/mdtypes/interaction_const.h"
  64 #include "gromacs/mdtypes/md_enums.h"
  65 #include "gromacs/pbcutil/ishift.h"
  66 #include "gromacs/timing/gpu_timing.h"
  67 #include "gromacs/utility/cstringutil.h"
  68 #include "gromacs/utility/fatalerror.h"
  69 #include "gromacs/utility/gmxassert.h"
  70 #include "gromacs/utility/real.h"
  71 #include "gromacs/utility/smalloc.h"
  72
  73 #include "nbnxn_ocl_internal.h"
  74 #include "nbnxn_ocl_types.h"
  75
  76 /*! \brief This parameter should be determined heuristically from the
  77  * kernel execution times
  78  *
  79  * This value is best for small systems on a single AMD Radeon R9 290X
  80  * (and about 5% faster than 40, which is the default for CUDA
  81  * devices). Larger simulation systems were quite insensitive to the
  82  * value of this parameter.
  83  */
  84 static unsigned int gpu_min_ci_balanced_factor = 50;
  85
  86
  87 /*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
  88  *
  89  * Full doc in nbnxn_ocl_internal.h */
  90 bool useLjCombRule(int vdwType)
  91 {
  92     return (vdwType == evdwOclCUTCOMBGEOM ||
  93             vdwType == evdwOclCUTCOMBLB);
  94 }
  95
  96 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
  97  * and the table GPU array.
  98  *
  99  * If called with an already allocated table, it just re-uploads the
 100  * table.
 101  */
 102 static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
 103                                            cl_nbparam_t                    *nbp,
 104                                            const gmx_device_runtime_data_t *runData)
 105 {
 106     cl_mem       coul_tab;
 107
 108     cl_int       cl_error;
 109
 110     if (nbp->coulomb_tab_climg2d != nullptr)
 111     {
 112         freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
 113     }
 114
 115     /* Switched from using textures to using buffers */
 116     // TODO: decide which alternative is most efficient - textures or buffers.
 117     /*
 118        cl_image_format array_format;
 119
 120        array_format.image_channel_data_type = CL_FLOAT;
 121        array_format.image_channel_order     = CL_R;
 122
 123        coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 124        &array_format, tabsize, 1, 0, ftmp, &cl_error);
 125      */
 126
 127     coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 128                               ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
 129     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 130                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 131
 132     nbp->coulomb_tab_climg2d  = coul_tab;
 133     nbp->coulomb_tab_scale    = ic->tabq_scale;
 134 }
 135
 136
 137 /*! \brief Initializes the atomdata structure first time, it only gets filled at
 138     pair-search.
 139  */
 140 static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
 141 {
 142     cl_int cl_error;
 143
 144     ad->ntypes  = ntypes;
 145
 146     /* An element of the shift_vec device buffer has the same size as one element
 147        of the host side shift_vec buffer. */
 148     ad->shift_vec_elem_size = sizeof(*nbnxn_atomdata_t::shift_vec.data());
 149
 150     ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 151                                    SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error);
 152     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 153                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 154     ad->bShiftVecUploaded = CL_FALSE;
 155
 156     /* An element of the fshift device buffer has the same size as one element
 157        of the host side fshift buffer. */
 158     ad->fshift_elem_size = sizeof(*cl_nb_staging_t::fshift);
 159
 160     ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 161                                 SHIFTS * ad->fshift_elem_size, nullptr, &cl_error);
 162     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 163                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 164
 165     ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 166                               sizeof(float), nullptr, &cl_error);
 167     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 168                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 169
 170     ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 171                               sizeof(float), nullptr, &cl_error);
 172     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 173                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 174
 175     /* initialize to nullptr pointers to data that is not allocated here and will
 176        need reallocation in nbnxn_gpu_init_atomdata */
 177     ad->xq = nullptr;
 178     ad->f  = nullptr;
 179
 180     /* size -1 indicates that the respective array hasn't been initialized yet */
 181     ad->natoms = -1;
 182     ad->nalloc = -1;
 183 }
 184
 185 /*! \brief Copies all parameters related to the cut-off from ic to nbp
 186  */
 187 static void set_cutoff_parameters(cl_nbparam_t              *nbp,
 188                                   const interaction_const_t *ic,
 189                                   const NbnxnListParameters *listParams)
 190 {
 191     nbp->ewald_beta        = ic->ewaldcoeff_q;
 192     nbp->sh_ewald          = ic->sh_ewald;
 193     nbp->epsfac            = ic->epsfac;
 194     nbp->two_k_rf          = 2.0 * ic->k_rf;
 195     nbp->c_rf              = ic->c_rf;
 196     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 197     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 198     nbp->rlistOuter_sq     = listParams->rlistOuter * listParams->rlistOuter;
 199     nbp->rlistInner_sq     = listParams->rlistInner * listParams->rlistInner;
 200     nbp->useDynamicPruning = listParams->useDynamicPruning;
 201
 202     nbp->sh_lj_ewald       = ic->sh_lj_ewald;
 203     nbp->ewaldcoeff_lj     = ic->ewaldcoeff_lj;
 204
 205     nbp->rvdw_switch       = ic->rvdw_switch;
 206     nbp->dispersion_shift  = ic->dispersion_shift;
 207     nbp->repulsion_shift   = ic->repulsion_shift;
 208     nbp->vdw_switch        = ic->vdw_switch;
 209 }
 210
 211 /*! \brief Returns the kinds of electrostatics and Vdw OpenCL
 212  *  kernels that will be used.
 213  *
 214  * Respectively, these values are from enum eelOcl and enum
 215  * evdwOcl. */
 216 static void
 217 map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
 218                                             int                        combRule,
 219                                             int                       *gpu_eeltype,
 220                                             int                       *gpu_vdwtype)
 221 {
 222     if (ic->vdwtype == evdwCUT)
 223     {
 224         switch (ic->vdw_modifier)
 225         {
 226             case eintmodNONE:
 227             case eintmodPOTSHIFT:
 228                 switch (combRule)
 229                 {
 230                     case ljcrNONE:
 231                         *gpu_vdwtype = evdwOclCUT;
 232                         break;
 233                     case ljcrGEOM:
 234                         *gpu_vdwtype = evdwOclCUTCOMBGEOM;
 235                         break;
 236                     case ljcrLB:
 237                         *gpu_vdwtype = evdwOclCUTCOMBLB;
 238                         break;
 239                     default:
 240                         gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
 241                 }
 242                 break;
 243             case eintmodFORCESWITCH:
 244                 *gpu_vdwtype = evdwOclFSWITCH;
 245                 break;
 246             case eintmodPOTSWITCH:
 247                 *gpu_vdwtype = evdwOclPSWITCH;
 248                 break;
 249             default:
 250                 gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
 251         }
 252     }
 253     else if (ic->vdwtype == evdwPME)
 254     {
 255         if (ic->ljpme_comb_rule == ljcrGEOM)
 256         {
 257             *gpu_vdwtype = evdwOclEWALDGEOM;
 258         }
 259         else
 260         {
 261             *gpu_vdwtype = evdwOclEWALDLB;
 262         }
 263     }
 264     else
 265     {
 266         gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
 267     }
 268
 269     if (ic->eeltype == eelCUT)
 270     {
 271         *gpu_eeltype = eelOclCUT;
 272     }
 273     else if (EEL_RF(ic->eeltype))
 274     {
 275         *gpu_eeltype = eelOclRF;
 276     }
 277     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 278     {
 279         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 280         *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
 281     }
 282     else
 283     {
 284         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 285         gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
 286     }
 287 }
 288
 289 /*! \brief Initializes the nonbonded parameter data structure.
 290  */
 291 static void init_nbparam(cl_nbparam_t                    *nbp,
 292                          const interaction_const_t       *ic,
 293                          const NbnxnListParameters       *listParams,
 294                          const nbnxn_atomdata_t::Params  &nbatParams,
 295                          const gmx_device_runtime_data_t *runData)
 296 {
 297     cl_int cl_error;
 298
 299     set_cutoff_parameters(nbp, ic, listParams);
 300
 301     map_interaction_types_to_gpu_kernel_flavors(ic,
 302                                                 nbatParams.comb_rule,
 303                                                 &(nbp->eeltype),
 304                                                 &(nbp->vdwtype));
 305
 306     if (ic->vdwtype == evdwPME)
 307     {
 308         if (ic->ljpme_comb_rule == ljcrGEOM)
 309         {
 310             GMX_ASSERT(nbatParams.comb_rule == ljcrGEOM, "Combination rule mismatch!");
 311         }
 312         else
 313         {
 314             GMX_ASSERT(nbatParams.comb_rule == ljcrLB, "Combination rule mismatch!");
 315         }
 316     }
 317     /* generate table for PME */
 318     nbp->coulomb_tab_climg2d = nullptr;
 319     if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
 320     {
 321         init_ewald_coulomb_force_table(ic, nbp, runData);
 322     }
 323     else
 324     // TODO: improvement needed.
 325     // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
 326     // don't accept nullptr values for image2D parameters.
 327     {
 328         /* Switched from using textures to using buffers */
 329         // TODO: decide which alternative is most efficient - textures or buffers.
 330         /*
 331            cl_image_format array_format;
 332
 333            array_format.image_channel_data_type = CL_FLOAT;
 334            array_format.image_channel_order     = CL_R;
 335
 336            nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 337             &array_format, 1, 1, 0, nullptr, &cl_error);
 338          */
 339
 340         nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
 341         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 342                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 343     }
 344
 345     const int nnbfp      = 2*nbatParams.numTypes*nbatParams.numTypes;
 346     const int nnbfp_comb = 2*nbatParams.numTypes;
 347
 348     {
 349         /* Switched from using textures to using buffers */
 350         // TODO: decide which alternative is most efficient - textures or buffers.
 351         /*
 352            cl_image_format array_format;
 353
 354            array_format.image_channel_data_type = CL_FLOAT;
 355            array_format.image_channel_order     = CL_R;
 356
 357            nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 358             &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
 359          */
 360
 361         nbp->nbfp_climg2d =
 362             clCreateBuffer(runData->context,
 363                            CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 364                            nnbfp*sizeof(cl_float),
 365                            const_cast<float *>(nbatParams.nbfp.data()),
 366                            &cl_error);
 367         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 368                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 369
 370         if (ic->vdwtype == evdwPME)
 371         {
 372             /* Switched from using textures to using buffers */
 373             // TODO: decide which alternative is most efficient - textures or buffers.
 374             /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 375                 &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
 376             nbp->nbfp_comb_climg2d =
 377                 clCreateBuffer(runData->context,
 378                                CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 379                                nnbfp_comb*sizeof(cl_float),
 380                                const_cast<float *>(nbatParams.nbfp_comb.data()),
 381                                &cl_error);
 382             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 383                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 384         }
 385         else
 386         {
 387             // TODO: improvement needed.
 388             // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
 389             // don't accept nullptr values for image2D parameters.
 390             /* Switched from using textures to using buffers */
 391             // TODO: decide which alternative is most efficient - textures or buffers.
 392             /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 393                 &array_format, 1, 1, 0, nullptr, &cl_error);*/
 394             nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
 395             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 396                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 397         }
 398     }
 399 }
 400
 401 //! This function is documented in the header file
 402 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 403                                         const interaction_const_t   *ic,
 404                                         const NbnxnListParameters   *listParams)
 405 {
 406     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 407     {
 408         return;
 409     }
 410     gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
 411     cl_nbparam_t       *nbp = nb->nbparam;
 412
 413     set_cutoff_parameters(nbp, ic, listParams);
 414
 415     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
 416
 417     init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata);
 418 }
 419
 420 /*! \brief Initializes the pair list data structure.
 421  */
 422 static void init_plist(cl_plist_t *pl)
 423 {
 424     /* initialize to nullptr pointers to data that is not allocated here and will
 425        need reallocation in nbnxn_gpu_init_pairlist */
 426     pl->sci     = nullptr;
 427     pl->cj4     = nullptr;
 428     pl->imask   = nullptr;
 429     pl->excl    = nullptr;
 430
 431     /* size -1 indicates that the respective array hasn't been initialized yet */
 432     pl->na_c           = -1;
 433     pl->nsci           = -1;
 434     pl->sci_nalloc     = -1;
 435     pl->ncj4           = -1;
 436     pl->cj4_nalloc     = -1;
 437     pl->nimask         = -1;
 438     pl->imask_nalloc   = -1;
 439     pl->nexcl          = -1;
 440     pl->excl_nalloc    = -1;
 441     pl->haveFreshList  = false;
 442 }
 443
 444 /*! \brief Initializes the timer data structure.
 445  */
 446 static void init_timers(cl_timers_t *t,
 447                         bool         bUseTwoStreams)
 448 {
 449     for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 450     {
 451         t->didPairlistH2D[i]  = false;
 452         t->didPrune[i]        = false;
 453         t->didRollingPrune[i] = false;
 454     }
 455 }
 456
 457 /*! \brief Initializes the timings data structure.
 458  */
 459 static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
 460 {
 461     int i, j;
 462
 463     t->nb_h2d_t = 0.0;
 464     t->nb_d2h_t = 0.0;
 465     t->nb_c     = 0;
 466     t->pl_h2d_t = 0.0;
 467     t->pl_h2d_c = 0;
 468     for (i = 0; i < 2; i++)
 469     {
 470         for (j = 0; j < 2; j++)
 471         {
 472             t->ktime[i][j].t = 0.0;
 473             t->ktime[i][j].c = 0;
 474         }
 475     }
 476
 477     t->pruneTime.c        = 0;
 478     t->pruneTime.t        = 0.0;
 479     t->dynamicPruneTime.c = 0;
 480     t->dynamicPruneTime.t = 0.0;
 481 }
 482
 483 /*! \brief Creates context for OpenCL GPU given by \p mygpu
 484  *
 485  * A fatal error results if creation fails.
 486  *
 487  * \param[inout] runtimeData runtime data including program and context
 488  * \param[in]    devInfo     device info struct
 489  * \param[in]    rank        MPI rank (for error reporting)
 490  */
 491 static void
 492 nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
 493                          const gmx_device_info_t   *devInfo,
 494                          int                        rank)
 495 {
 496     cl_context_properties     context_properties[3];
 497     cl_platform_id            platform_id;
 498     cl_device_id              device_id;
 499     cl_context                context;
 500     cl_int                    cl_error;
 501
 502     assert(runtimeData != nullptr);
 503     assert(devInfo != nullptr);
 504
 505     platform_id      = devInfo->ocl_gpu_id.ocl_platform_id;
 506     device_id        = devInfo->ocl_gpu_id.ocl_device_id;
 507
 508     context_properties[0] = CL_CONTEXT_PLATFORM;
 509     context_properties[1] = reinterpret_cast<cl_context_properties>(platform_id);
 510     context_properties[2] = 0; /* Terminates the list of properties */
 511
 512     context = clCreateContext(context_properties, 1, &device_id, nullptr, nullptr, &cl_error);
 513     if (CL_SUCCESS != cl_error)
 514     {
 515         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
 516                   rank,
 517                   devInfo->device_name,
 518                   cl_error, ocl_get_error_string(cl_error).c_str());
 519     }
 520
 521     runtimeData->context = context;
 522 }
 523
 524 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 525 static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
 526                                          const char      *kernel_name)
 527 {
 528     cl_kernel kernel;
 529     cl_int    cl_error;
 530
 531     kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
 532     if (CL_SUCCESS != cl_error)
 533     {
 534         gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
 535                   kernel_name,
 536                   nb->dev_info->device_name,
 537                   cl_error);
 538     }
 539
 540     return kernel;
 541 }
 542
 543 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
 544  */
 545 static void
 546 nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
 547 {
 548
 549     cl_int               cl_error;
 550     cl_atomdata_t *      adat     = nb->atdat;
 551     cl_command_queue     ls       = nb->stream[eintLocal];
 552
 553     size_t               local_work_size[3]   = {1, 1, 1};
 554     size_t               global_work_size[3]  = {1, 1, 1};
 555
 556     cl_int               shifts   = SHIFTS*3;
 557
 558     cl_int               arg_no;
 559
 560     cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
 561
 562     local_work_size[0]   = 64;
 563     // Round the total number of threads up from the array size
 564     global_work_size[0]  = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
 565
 566     arg_no    = 0;
 567     cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
 568     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 569     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
 570     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
 571     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 572
 573     cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr);
 574     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 575 }
 576
 577 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 578 static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
 579 {
 580     /* Init to 0 main kernel arrays */
 581     /* They will be later on initialized in select_nbnxn_kernel */
 582     // TODO: consider always creating all variants of the kernels here so that there is no
 583     // need for late call to clCreateKernel -- if that gives any advantage?
 584     memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
 585     memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
 586     memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
 587     memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
 588
 589     /* Init pruning kernels
 590      *
 591      * TODO: we could avoid creating kernels if dynamic pruning is turned off,
 592      * but ATM that depends on force flags not passed into the initialization.
 593      */
 594     nb->kernel_pruneonly[epruneFirst]   = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl");
 595     nb->kernel_pruneonly[epruneRolling] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
 596
 597     /* Init auxiliary kernels */
 598     nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
 599 }
 600
 601 /*! \brief Initializes simulation constant data.
 602  *
 603  *  Initializes members of the atomdata and nbparam structs and
 604  *  clears e/fshift output buffers.
 605  */
 606 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 607                                  const interaction_const_t      *ic,
 608                                  const NbnxnListParameters      *listParams,
 609                                  const nbnxn_atomdata_t::Params &nbatParams)
 610 {
 611     init_atomdata_first(nb->atdat, nbatParams.numTypes, nb->dev_rundata);
 612     init_nbparam(nb->nbparam, ic, listParams, nbatParams, nb->dev_rundata);
 613 }
 614
 615
 616 //! This function is documented in the header file
 617 void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
 618                     const gmx_device_info_t   *deviceInfo,
 619                     const interaction_const_t *ic,
 620                     const NbnxnListParameters *listParams,
 621                     const nbnxn_atomdata_t    *nbat,
 622                     int                        rank,
 623                     gmx_bool                   bLocalAndNonlocal)
 624 {
 625     gmx_nbnxn_ocl_t            *nb;
 626     cl_int                      cl_error;
 627     cl_command_queue_properties queue_properties;
 628
 629     assert(ic);
 630
 631     if (p_nb == nullptr)
 632     {
 633         return;
 634     }
 635
 636     snew(nb, 1);
 637     snew(nb->atdat, 1);
 638     snew(nb->nbparam, 1);
 639     snew(nb->plist[eintLocal], 1);
 640     if (bLocalAndNonlocal)
 641     {
 642         snew(nb->plist[eintNonlocal], 1);
 643     }
 644
 645     nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
 646
 647     nb->timers = new cl_timers_t();
 648     snew(nb->timings, 1);
 649
 650     /* set device info, just point it to the right GPU among the detected ones */
 651     nb->dev_info = deviceInfo;
 652     snew(nb->dev_rundata, 1);
 653
 654     /* init nbst */
 655     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
 656     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
 657     pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
 658
 659     init_plist(nb->plist[eintLocal]);
 660
 661     /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
 662     nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 663
 664     /* Create queues only after bDoTime has been initialized */
 665     if (nb->bDoTime)
 666     {
 667         queue_properties = CL_QUEUE_PROFILING_ENABLE;
 668     }
 669     else
 670     {
 671         queue_properties = 0;
 672     }
 673
 674     nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
 675
 676     /* local/non-local GPU streams */
 677     nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 678     if (CL_SUCCESS != cl_error)
 679     {
 680         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 681                   rank,
 682                   nb->dev_info->device_name,
 683                   cl_error);
 684     }
 685
 686     if (nb->bUseTwoStreams)
 687     {
 688         init_plist(nb->plist[eintNonlocal]);
 689
 690         nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 691         if (CL_SUCCESS != cl_error)
 692         {
 693             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 694                       rank,
 695                       nb->dev_info->device_name,
 696                       cl_error);
 697         }
 698     }
 699
 700     if (nb->bDoTime)
 701     {
 702         init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE);
 703         init_timings(nb->timings);
 704     }
 705
 706     nbnxn_ocl_init_const(nb, ic, listParams, nbat->params());
 707
 708     /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
 709      * TODO: decide about NVIDIA
 710      */
 711     nb->bPrefetchLjParam =
 712         (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr) &&
 713         ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (nb->dev_info->vendor_e == OCL_VENDOR_INTEL)
 714          || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
 715
 716     /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
 717      * but sadly this is not supported in OpenCL (yet?). Consider adding it if
 718      * it becomes supported.
 719      */
 720     nbnxn_gpu_compile_kernels(nb);
 721     nbnxn_gpu_init_kernels(nb);
 722
 723     /* clear energy and shift force outputs */
 724     nbnxn_ocl_clear_e_fshift(nb);
 725
 726     *p_nb = nb;
 727
 728     if (debug)
 729     {
 730         fprintf(debug, "Initialized OpenCL data structures.\n");
 731     }
 732 }
 733
 734 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
 735  */
 736 static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
 737 {
 738     if (natoms_clear == 0)
 739     {
 740         return;
 741     }
 742
 743     cl_int gmx_used_in_debug cl_error;
 744
 745     cl_atomdata_t           *atomData = nb->atdat;
 746     cl_command_queue         ls       = nb->stream[eintLocal];
 747     cl_float                 value    = 0.0f;
 748
 749     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
 750                                    0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr);
 751     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueFillBuffer failed: " +
 752                                                 ocl_get_error_string(cl_error)).c_str());
 753 }
 754
 755 //! This function is documented in the header file
 756 void
 757 nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
 758                         int                flags)
 759 {
 760     nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
 761     /* clear shift force array and energies if the outputs were
 762        used in the current step */
 763     if (flags & GMX_FORCE_VIRIAL)
 764     {
 765         nbnxn_ocl_clear_e_fshift(nb);
 766     }
 767
 768     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
 769     cl_int gmx_unused cl_error;
 770     cl_error = clFlush(nb->stream[eintLocal]);
 771     assert(CL_SUCCESS == cl_error);
 772 }
 773
 774 //! This function is documented in the header file
 775 void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
 776                              const NbnxnPairlistGpu *h_plist,
 777                              int                     iloc)
 778 {
 779     char             sbuf[STRLEN];
 780     // Timing accumulation should happen only if there was work to do
 781     // because getLastRangeTime() gets skipped with empty lists later
 782     // which leads to the counter not being reset.
 783     bool             bDoTime    = ((nb->bDoTime == CL_TRUE) && h_plist->nsci > 0);
 784     cl_command_queue stream     = nb->stream[iloc];
 785     cl_plist_t      *d_plist    = nb->plist[iloc];
 786
 787     if (d_plist->na_c < 0)
 788     {
 789         d_plist->na_c = h_plist->na_ci;
 790     }
 791     else
 792     {
 793         if (d_plist->na_c != h_plist->na_ci)
 794         {
 795             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 796                     d_plist->na_c, h_plist->na_ci);
 797             gmx_incons(sbuf);
 798         }
 799     }
 800
 801     if (bDoTime)
 802     {
 803         nb->timers->pl_h2d[iloc].openTimingRegion(stream);
 804         nb->timers->didPairlistH2D[iloc] = true;
 805     }
 806
 807     // TODO most of this function is same in CUDA and OpenCL, move into the header
 808     Context context = nb->dev_rundata->context;
 809
 810     reallocateDeviceBuffer(&d_plist->sci, h_plist->nsci,
 811                            &d_plist->nsci, &d_plist->sci_nalloc, context);
 812     copyToDeviceBuffer(&d_plist->sci, h_plist->sci, 0, h_plist->nsci,
 813                        stream, GpuApiCallBehavior::Async,
 814                        bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 815
 816     reallocateDeviceBuffer(&d_plist->cj4, h_plist->ncj4,
 817                            &d_plist->ncj4, &d_plist->cj4_nalloc, context);
 818     copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4, 0, h_plist->ncj4,
 819                        stream, GpuApiCallBehavior::Async,
 820                        bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 821
 822     reallocateDeviceBuffer(&d_plist->imask, h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
 823                            &d_plist->nimask, &d_plist->imask_nalloc, context);
 824
 825     reallocateDeviceBuffer(&d_plist->excl, h_plist->nexcl,
 826                            &d_plist->nexcl, &d_plist->excl_nalloc, context);
 827     copyToDeviceBuffer(&d_plist->excl, h_plist->excl, 0, h_plist->nexcl,
 828                        stream, GpuApiCallBehavior::Async,
 829                        bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 830
 831     if (bDoTime)
 832     {
 833         nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
 834     }
 835
 836     /* need to prune the pair list during the next step */
 837     d_plist->haveFreshList = true;
 838 }
 839
 840 //! This function is documented in the header file
 841 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
 842                                const nbnxn_atomdata_t *nbatom)
 843 {
 844     cl_atomdata_t   *adat  = nb->atdat;
 845     cl_command_queue ls    = nb->stream[eintLocal];
 846
 847     /* only if we have a dynamic box */
 848     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 849     {
 850         ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0,
 851                            SHIFTS * adat->shift_vec_elem_size, ls, nullptr);
 852         adat->bShiftVecUploaded = CL_TRUE;
 853     }
 854 }
 855
 856 //! This function is documented in the header file
 857 void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
 858                              const nbnxn_atomdata_t        *nbat)
 859 {
 860     cl_int           cl_error;
 861     int              nalloc, natoms;
 862     bool             realloced;
 863     bool             bDoTime = nb->bDoTime == CL_TRUE;
 864     cl_timers_t     *timers  = nb->timers;
 865     cl_atomdata_t   *d_atdat = nb->atdat;
 866     cl_command_queue ls      = nb->stream[eintLocal];
 867
 868     natoms    = nbat->numAtoms();
 869     realloced = false;
 870
 871     if (bDoTime)
 872     {
 873         /* time async copy */
 874         timers->atdat.openTimingRegion(ls);
 875     }
 876
 877     /* need to reallocate if we have to copy more atoms than the amount of space
 878        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 879     if (natoms > d_atdat->nalloc)
 880     {
 881         nalloc = over_alloc_small(natoms);
 882
 883         /* free up first if the arrays have already been initialized */
 884         if (d_atdat->nalloc != -1)
 885         {
 886             freeDeviceBuffer(&d_atdat->f);
 887             freeDeviceBuffer(&d_atdat->xq);
 888             freeDeviceBuffer(&d_atdat->lj_comb);
 889             freeDeviceBuffer(&d_atdat->atom_types);
 890         }
 891
 892         d_atdat->f_elem_size = sizeof(rvec);
 893
 894         d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 895                                     nalloc * d_atdat->f_elem_size, nullptr, &cl_error);
 896         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 897                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 898
 899         d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 900                                      nalloc * sizeof(cl_float4), nullptr, &cl_error);
 901         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 902                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 903
 904         if (useLjCombRule(nb->nbparam->vdwtype))
 905         {
 906             d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 907                                               nalloc * sizeof(cl_float2), nullptr, &cl_error);
 908             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 909                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 910         }
 911         else
 912         {
 913             d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 914                                                  nalloc * sizeof(int), nullptr, &cl_error);
 915             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 916                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 917         }
 918
 919         d_atdat->nalloc = nalloc;
 920         realloced       = true;
 921     }
 922
 923     d_atdat->natoms       = natoms;
 924     d_atdat->natoms_local = nbat->natoms_local;
 925
 926     /* need to clear GPU f output if realloc happened */
 927     if (realloced)
 928     {
 929         nbnxn_ocl_clear_f(nb, nalloc);
 930     }
 931
 932     if (useLjCombRule(nb->nbparam->vdwtype))
 933     {
 934         ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0,
 935                            natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 936     }
 937     else
 938     {
 939         ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0,
 940                            natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 941
 942     }
 943
 944     if (bDoTime)
 945     {
 946         timers->atdat.closeTimingRegion(ls);
 947     }
 948
 949     /* kick off the tasks enqueued above to ensure concurrency with the search */
 950     cl_error = clFlush(ls);
 951     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 952                        ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 953 }
 954
 955 /*! \brief Releases an OpenCL kernel pointer */
 956 static void free_kernel(cl_kernel *kernel_ptr)
 957 {
 958     cl_int gmx_unused cl_error;
 959
 960     assert(nullptr != kernel_ptr);
 961
 962     if (*kernel_ptr)
 963     {
 964         cl_error = clReleaseKernel(*kernel_ptr);
 965         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseKernel failed: " + ocl_get_error_string(cl_error)).c_str());
 966
 967         *kernel_ptr = nullptr;
 968     }
 969 }
 970
 971 /*! \brief Releases a list of OpenCL kernel pointers */
 972 static void free_kernels(cl_kernel *kernels, int count)
 973 {
 974     int i;
 975
 976     for (i = 0; i < count; i++)
 977     {
 978         free_kernel(kernels + i);
 979     }
 980 }
 981
 982 /*! \brief Free the OpenCL runtime data (context and program).
 983  *
 984  *  The function releases the OpenCL context and program assuciated with the
 985  *  device that the calling PP rank is running on.
 986  *
 987  *  \param runData [in]  porinter to the structure with runtime data.
 988  */
 989 static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
 990 {
 991     if (runData == nullptr)
 992     {
 993         return;
 994     }
 995
 996     cl_int gmx_unused cl_error;
 997
 998     if (runData->context)
 999     {
1000         cl_error         = clReleaseContext(runData->context);
1001         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str());
1002         runData->context = nullptr;
1003     }
1004
1005     if (runData->program)
1006     {
1007         cl_error         = clReleaseProgram(runData->program);
1008         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str());
1009         runData->program = nullptr;
1010     }
1011
1012 }
1013
1014 //! This function is documented in the header file
1015 void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
1016 {
1017     if (nb == nullptr)
1018     {
1019         return;
1020     }
1021
1022     /* Free kernels */
1023     int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
1024     free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);
1025
1026     kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
1027     free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);
1028
1029     kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
1030     free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);
1031
1032     kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
1033     free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
1034
1035     free_kernel(&(nb->kernel_zero_e_fshift));
1036
1037     /* Free atdat */
1038     freeDeviceBuffer(&(nb->atdat->xq));
1039     freeDeviceBuffer(&(nb->atdat->f));
1040     freeDeviceBuffer(&(nb->atdat->e_lj));
1041     freeDeviceBuffer(&(nb->atdat->e_el));
1042     freeDeviceBuffer(&(nb->atdat->fshift));
1043     freeDeviceBuffer(&(nb->atdat->lj_comb));
1044     freeDeviceBuffer(&(nb->atdat->atom_types));
1045     freeDeviceBuffer(&(nb->atdat->shift_vec));
1046     sfree(nb->atdat);
1047
1048     /* Free nbparam */
1049     freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
1050     freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
1051     freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
1052     sfree(nb->nbparam);
1053
1054     /* Free plist */
1055     auto *plist = nb->plist[eintLocal];
1056     freeDeviceBuffer(&plist->sci);
1057     freeDeviceBuffer(&plist->cj4);
1058     freeDeviceBuffer(&plist->imask);
1059     freeDeviceBuffer(&plist->excl);
1060     sfree(plist);
1061     if (nb->bUseTwoStreams)
1062     {
1063         auto *plist_nl = nb->plist[eintNonlocal];
1064         freeDeviceBuffer(&plist_nl->sci);
1065         freeDeviceBuffer(&plist_nl->cj4);
1066         freeDeviceBuffer(&plist_nl->imask);
1067         freeDeviceBuffer(&plist_nl->excl);
1068         sfree(plist_nl);
1069     }
1070
1071     /* Free nbst */
1072     pfree(nb->nbst.e_lj);
1073     nb->nbst.e_lj = nullptr;
1074
1075     pfree(nb->nbst.e_el);
1076     nb->nbst.e_el = nullptr;
1077
1078     pfree(nb->nbst.fshift);
1079     nb->nbst.fshift = nullptr;
1080
1081     /* Free command queues */
1082     clReleaseCommandQueue(nb->stream[eintLocal]);
1083     nb->stream[eintLocal] = nullptr;
1084     if (nb->bUseTwoStreams)
1085     {
1086         clReleaseCommandQueue(nb->stream[eintNonlocal]);
1087         nb->stream[eintNonlocal] = nullptr;
1088     }
1089     /* Free other events */
1090     if (nb->nonlocal_done)
1091     {
1092         clReleaseEvent(nb->nonlocal_done);
1093         nb->nonlocal_done = nullptr;
1094     }
1095     if (nb->misc_ops_and_local_H2D_done)
1096     {
1097         clReleaseEvent(nb->misc_ops_and_local_H2D_done);
1098         nb->misc_ops_and_local_H2D_done = nullptr;
1099     }
1100
1101     free_gpu_device_runtime_data(nb->dev_rundata);
1102     sfree(nb->dev_rundata);
1103
1104     /* Free timers and timings */
1105     delete nb->timers;
1106     sfree(nb->timings);
1107     sfree(nb);
1108
1109     if (debug)
1110     {
1111         fprintf(debug, "Cleaned up OpenCL data structures.\n");
1112     }
1113 }
1114
1115 //! This function is documented in the header file
1116 gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
1117 {
1118     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
1119 }
1120
1121 //! This function is documented in the header file
1122 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1123 {
1124     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1125     {
1126         init_timings(nbv->gpu_nbv->timings);
1127     }
1128 }
1129
1130 //! This function is documented in the header file
1131 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
1132 {
1133     return nb != nullptr ?
1134            gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
1135 }
1136
1137 //! This function is documented in the header file
1138 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
1139 {
1140     return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
1141             (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
1142 }