src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxm_gpu_data_mgmt.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  40  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  41  *  \author Szilárd Páll <pall.szilard@gmail.com>
  42  *  \ingroup module_nbnxm
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <assert.h>
  47 #include <stdarg.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50 #include <string.h>
  51
  52 #include <cmath>
  53
  54 // TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
  55 //      is currently declared means this has to be before gpu_types.h
  56 #include "nbnxm_ocl_types.h"
  57
  58 // TODO Remove this comment when the above order issue is resolved
  59 #include "gromacs/gpu_utils/gpu_utils.h"
  60 #include "gromacs/gpu_utils/oclutils.h"
  61 #include "gromacs/hardware/gpu_hw_info.h"
  62 #include "gromacs/math/vectypes.h"
  63 #include "gromacs/mdlib/force_flags.h"
  64 #include "gromacs/mdtypes/interaction_const.h"
  65 #include "gromacs/mdtypes/md_enums.h"
  66 #include "gromacs/nbnxm/atomdata.h"
  67 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  68 #include "gromacs/nbnxm/gpu_jit_support.h"
  69 #include "gromacs/nbnxm/nbnxm.h"
  70 #include "gromacs/nbnxm/nbnxm_gpu.h"
  71 #include "gromacs/nbnxm/pairlistsets.h"
  72 #include "gromacs/pbcutil/ishift.h"
  73 #include "gromacs/timing/gpu_timing.h"
  74 #include "gromacs/utility/cstringutil.h"
  75 #include "gromacs/utility/fatalerror.h"
  76 #include "gromacs/utility/gmxassert.h"
  77 #include "gromacs/utility/real.h"
  78 #include "gromacs/utility/smalloc.h"
  79
  80 #include "nbnxm_ocl_internal.h"
  81
  82 namespace Nbnxm
  83 {
  84
  85 /*! \brief Copies of values from cl_driver_diagnostics_intel.h,
  86  * which isn't guaranteed to be available. */
  87 /**@{*/
  88 #define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL           0x4106
  89 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL     0x1
  90 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL      0x2
  91 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL  0x4
  92 /**@}*/
  93
  94 /*! \brief This parameter should be determined heuristically from the
  95  * kernel execution times
  96  *
  97  * This value is best for small systems on a single AMD Radeon R9 290X
  98  * (and about 5% faster than 40, which is the default for CUDA
  99  * devices). Larger simulation systems were quite insensitive to the
 100  * value of this parameter.
 101  */
 102 static unsigned int gpu_min_ci_balanced_factor = 50;
 103
 104
 105 /*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
 106  *
 107  * Full doc in nbnxn_ocl_internal.h */
 108 bool useLjCombRule(int vdwType)
 109 {
 110     return (vdwType == evdwOclCUTCOMBGEOM ||
 111             vdwType == evdwOclCUTCOMBLB);
 112 }
 113
 114 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
 115  * and the table GPU array.
 116  *
 117  * If called with an already allocated table, it just re-uploads the
 118  * table.
 119  */
 120 static void init_ewald_coulomb_force_table(const EwaldCorrectionTables     &tables,
 121                                            cl_nbparam_t                    *nbp,
 122                                            const gmx_device_runtime_data_t *runData)
 123 {
 124     cl_mem       coul_tab;
 125
 126     cl_int       cl_error;
 127
 128     if (nbp->coulomb_tab_climg2d != nullptr)
 129     {
 130         freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
 131     }
 132
 133     /* Switched from using textures to using buffers */
 134     // TODO: decide which alternative is most efficient - textures or buffers.
 135     /*
 136        cl_image_format array_format;
 137
 138        array_format.image_channel_data_type = CL_FLOAT;
 139        array_format.image_channel_order     = CL_R;
 140
 141        coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 142        &array_format, tabsize, 1, 0, ftmp, &cl_error);
 143      */
 144
 145     coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 146                               tables.tableF.size()*sizeof(cl_float), const_cast<real *>(tables.tableF.data()), &cl_error);
 147     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 148                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 149
 150     nbp->coulomb_tab_climg2d  = coul_tab;
 151     nbp->coulomb_tab_scale    = tables.scale;
 152 }
 153
 154
 155 /*! \brief Initializes the atomdata structure first time, it only gets filled at
 156     pair-search.
 157  */
 158 static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
 159 {
 160     cl_int cl_error;
 161
 162     ad->ntypes  = ntypes;
 163
 164     /* An element of the shift_vec device buffer has the same size as one element
 165        of the host side shift_vec buffer. */
 166     ad->shift_vec_elem_size = sizeof(*nbnxn_atomdata_t::shift_vec.data());
 167
 168     ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 169                                    SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error);
 170     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 171                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 172     ad->bShiftVecUploaded = CL_FALSE;
 173
 174     /* An element of the fshift device buffer has the same size as one element
 175        of the host side fshift buffer. */
 176     ad->fshift_elem_size = sizeof(*cl_nb_staging_t::fshift);
 177
 178     ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 179                                 SHIFTS * ad->fshift_elem_size, nullptr, &cl_error);
 180     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 181                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 182
 183     ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 184                               sizeof(float), nullptr, &cl_error);
 185     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 186                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 187
 188     ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 189                               sizeof(float), nullptr, &cl_error);
 190     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 191                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 192
 193     /* initialize to nullptr pointers to data that is not allocated here and will
 194        need reallocation in nbnxn_gpu_init_atomdata */
 195     ad->xq = nullptr;
 196     ad->f  = nullptr;
 197
 198     /* size -1 indicates that the respective array hasn't been initialized yet */
 199     ad->natoms = -1;
 200     ad->nalloc = -1;
 201 }
 202
 203 /*! \brief Copies all parameters related to the cut-off from ic to nbp
 204  */
 205 static void set_cutoff_parameters(cl_nbparam_t              *nbp,
 206                                   const interaction_const_t *ic,
 207                                   const PairlistParams      &listParams)
 208 {
 209     nbp->ewald_beta        = ic->ewaldcoeff_q;
 210     nbp->sh_ewald          = ic->sh_ewald;
 211     nbp->epsfac            = ic->epsfac;
 212     nbp->two_k_rf          = 2.0 * ic->k_rf;
 213     nbp->c_rf              = ic->c_rf;
 214     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 215     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 216     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
 217     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
 218     nbp->useDynamicPruning = listParams.useDynamicPruning;
 219
 220     nbp->sh_lj_ewald       = ic->sh_lj_ewald;
 221     nbp->ewaldcoeff_lj     = ic->ewaldcoeff_lj;
 222
 223     nbp->rvdw_switch       = ic->rvdw_switch;
 224     nbp->dispersion_shift  = ic->dispersion_shift;
 225     nbp->repulsion_shift   = ic->repulsion_shift;
 226     nbp->vdw_switch        = ic->vdw_switch;
 227 }
 228
 229 /*! \brief Returns the kinds of electrostatics and Vdw OpenCL
 230  *  kernels that will be used.
 231  *
 232  * Respectively, these values are from enum eelOcl and enum
 233  * evdwOcl. */
 234 static void
 235 map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
 236                                             int                        combRule,
 237                                             int                       *gpu_eeltype,
 238                                             int                       *gpu_vdwtype)
 239 {
 240     if (ic->vdwtype == evdwCUT)
 241     {
 242         switch (ic->vdw_modifier)
 243         {
 244             case eintmodNONE:
 245             case eintmodPOTSHIFT:
 246                 switch (combRule)
 247                 {
 248                     case ljcrNONE:
 249                         *gpu_vdwtype = evdwOclCUT;
 250                         break;
 251                     case ljcrGEOM:
 252                         *gpu_vdwtype = evdwOclCUTCOMBGEOM;
 253                         break;
 254                     case ljcrLB:
 255                         *gpu_vdwtype = evdwOclCUTCOMBLB;
 256                         break;
 257                     default:
 258                         gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
 259                 }
 260                 break;
 261             case eintmodFORCESWITCH:
 262                 *gpu_vdwtype = evdwOclFSWITCH;
 263                 break;
 264             case eintmodPOTSWITCH:
 265                 *gpu_vdwtype = evdwOclPSWITCH;
 266                 break;
 267             default:
 268                 gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
 269         }
 270     }
 271     else if (ic->vdwtype == evdwPME)
 272     {
 273         if (ic->ljpme_comb_rule == ljcrGEOM)
 274         {
 275             *gpu_vdwtype = evdwOclEWALDGEOM;
 276         }
 277         else
 278         {
 279             *gpu_vdwtype = evdwOclEWALDLB;
 280         }
 281     }
 282     else
 283     {
 284         gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
 285     }
 286
 287     if (ic->eeltype == eelCUT)
 288     {
 289         *gpu_eeltype = eelOclCUT;
 290     }
 291     else if (EEL_RF(ic->eeltype))
 292     {
 293         *gpu_eeltype = eelOclRF;
 294     }
 295     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 296     {
 297         *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
 298     }
 299     else
 300     {
 301         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 302         gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
 303     }
 304 }
 305
 306 /*! \brief Initializes the nonbonded parameter data structure.
 307  */
 308 static void init_nbparam(cl_nbparam_t                    *nbp,
 309                          const interaction_const_t       *ic,
 310                          const PairlistParams            &listParams,
 311                          const nbnxn_atomdata_t::Params  &nbatParams,
 312                          const gmx_device_runtime_data_t *runData)
 313 {
 314     cl_int cl_error;
 315
 316     set_cutoff_parameters(nbp, ic, listParams);
 317
 318     map_interaction_types_to_gpu_kernel_flavors(ic,
 319                                                 nbatParams.comb_rule,
 320                                                 &(nbp->eeltype),
 321                                                 &(nbp->vdwtype));
 322
 323     if (ic->vdwtype == evdwPME)
 324     {
 325         if (ic->ljpme_comb_rule == ljcrGEOM)
 326         {
 327             GMX_ASSERT(nbatParams.comb_rule == ljcrGEOM, "Combination rule mismatch!");
 328         }
 329         else
 330         {
 331             GMX_ASSERT(nbatParams.comb_rule == ljcrLB, "Combination rule mismatch!");
 332         }
 333     }
 334     /* generate table for PME */
 335     nbp->coulomb_tab_climg2d = nullptr;
 336     if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
 337     {
 338         GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
 339         init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, runData);
 340     }
 341     else
 342     // TODO: improvement needed.
 343     // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
 344     // don't accept nullptr values for image2D parameters.
 345     {
 346         /* Switched from using textures to using buffers */
 347         // TODO: decide which alternative is most efficient - textures or buffers.
 348         /*
 349            cl_image_format array_format;
 350
 351            array_format.image_channel_data_type = CL_FLOAT;
 352            array_format.image_channel_order     = CL_R;
 353
 354            nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 355             &array_format, 1, 1, 0, nullptr, &cl_error);
 356          */
 357
 358         nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
 359         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 360                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 361     }
 362
 363     const int nnbfp      = 2*nbatParams.numTypes*nbatParams.numTypes;
 364     const int nnbfp_comb = 2*nbatParams.numTypes;
 365
 366     {
 367         /* Switched from using textures to using buffers */
 368         // TODO: decide which alternative is most efficient - textures or buffers.
 369         /*
 370            cl_image_format array_format;
 371
 372            array_format.image_channel_data_type = CL_FLOAT;
 373            array_format.image_channel_order     = CL_R;
 374
 375            nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 376             &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
 377          */
 378
 379         nbp->nbfp_climg2d =
 380             clCreateBuffer(runData->context,
 381                            CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 382                            nnbfp*sizeof(cl_float),
 383                            const_cast<float *>(nbatParams.nbfp.data()),
 384                            &cl_error);
 385         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 386                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 387
 388         if (ic->vdwtype == evdwPME)
 389         {
 390             /* Switched from using textures to using buffers */
 391             // TODO: decide which alternative is most efficient - textures or buffers.
 392             /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 393                 &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
 394             nbp->nbfp_comb_climg2d =
 395                 clCreateBuffer(runData->context,
 396                                CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
 397                                nnbfp_comb*sizeof(cl_float),
 398                                const_cast<float *>(nbatParams.nbfp_comb.data()),
 399                                &cl_error);
 400             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 401                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 402         }
 403         else
 404         {
 405             // TODO: improvement needed.
 406             // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
 407             // don't accept nullptr values for image2D parameters.
 408             /* Switched from using textures to using buffers */
 409             // TODO: decide which alternative is most efficient - textures or buffers.
 410             /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 411                 &array_format, 1, 1, 0, nullptr, &cl_error);*/
 412             nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
 413             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 414                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 415         }
 416     }
 417 }
 418
 419 //! This function is documented in the header file
 420 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 421                                   const interaction_const_t   *ic)
 422 {
 423     if (!nbv || !nbv->useGpu())
 424     {
 425         return;
 426     }
 427     gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
 428     cl_nbparam_t       *nbp = nb->nbparam;
 429
 430     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
 431
 432     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
 433
 434     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
 435     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, nb->dev_rundata);
 436 }
 437
 438 /*! \brief Initializes the pair list data structure.
 439  */
 440 static void init_plist(cl_plist_t *pl)
 441 {
 442     /* initialize to nullptr pointers to data that is not allocated here and will
 443        need reallocation in nbnxn_gpu_init_pairlist */
 444     pl->sci     = nullptr;
 445     pl->cj4     = nullptr;
 446     pl->imask   = nullptr;
 447     pl->excl    = nullptr;
 448
 449     /* size -1 indicates that the respective array hasn't been initialized yet */
 450     pl->na_c           = -1;
 451     pl->nsci           = -1;
 452     pl->sci_nalloc     = -1;
 453     pl->ncj4           = -1;
 454     pl->cj4_nalloc     = -1;
 455     pl->nimask         = -1;
 456     pl->imask_nalloc   = -1;
 457     pl->nexcl          = -1;
 458     pl->excl_nalloc    = -1;
 459     pl->haveFreshList  = false;
 460 }
 461
 462 /*! \brief Initializes the timings data structure.
 463  */
 464 static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
 465 {
 466     int i, j;
 467
 468     t->nb_h2d_t = 0.0;
 469     t->nb_d2h_t = 0.0;
 470     t->nb_c     = 0;
 471     t->pl_h2d_t = 0.0;
 472     t->pl_h2d_c = 0;
 473     for (i = 0; i < 2; i++)
 474     {
 475         for (j = 0; j < 2; j++)
 476         {
 477             t->ktime[i][j].t = 0.0;
 478             t->ktime[i][j].c = 0;
 479         }
 480     }
 481
 482     t->pruneTime.c        = 0;
 483     t->pruneTime.t        = 0.0;
 484     t->dynamicPruneTime.c = 0;
 485     t->dynamicPruneTime.t = 0.0;
 486 }
 487
 488
 489 //! OpenCL notification callback function
 490 static void CL_CALLBACK
 491 ocl_notify_fn(const char *pErrInfo, const void gmx_unused *private_info, size_t gmx_unused cb, void gmx_unused *user_data)
 492 {
 493     if (pErrInfo != nullptr)
 494     {
 495         printf("%s\n", pErrInfo ); // Print error/hint
 496     }
 497 }
 498
 499 /*! \brief Creates context for OpenCL GPU given by \p mygpu
 500  *
 501  * A fatal error results if creation fails.
 502  *
 503  * \param[inout] runtimeData runtime data including program and context
 504  * \param[in]    devInfo     device info struct
 505  * \param[in]    rank        MPI rank (for error reporting)
 506  */
 507 static void
 508 nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
 509                          const gmx_device_info_t   *devInfo,
 510                          int                        rank)
 511 {
 512     cl_context_properties     context_properties[5];
 513     cl_platform_id            platform_id;
 514     cl_device_id              device_id;
 515     cl_context                context;
 516     cl_int                    cl_error;
 517
 518     assert(runtimeData != nullptr);
 519     assert(devInfo != nullptr);
 520
 521     platform_id      = devInfo->ocl_gpu_id.ocl_platform_id;
 522     device_id        = devInfo->ocl_gpu_id.ocl_device_id;
 523
 524     int i = 0;
 525     context_properties[i++] = CL_CONTEXT_PLATFORM;
 526     context_properties[i++] = reinterpret_cast<cl_context_properties>(platform_id);
 527     if (getenv("GMX_OCL_SHOW_DIAGNOSTICS"))
 528     {
 529         context_properties[i++] = CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL;
 530         context_properties[i++] = CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL | CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL;
 531     }
 532     context_properties[i++] = 0; /* Terminates the list of properties */
 533
 534     context = clCreateContext(context_properties, 1, &device_id, ocl_notify_fn, nullptr, &cl_error);
 535     if (CL_SUCCESS != cl_error)
 536     {
 537         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
 538                   rank,
 539                   devInfo->device_name,
 540                   cl_error, ocl_get_error_string(cl_error).c_str());
 541     }
 542
 543     runtimeData->context = context;
 544 }
 545
 546 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 547 static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
 548                                          const char      *kernel_name)
 549 {
 550     cl_kernel kernel;
 551     cl_int    cl_error;
 552
 553     kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
 554     if (CL_SUCCESS != cl_error)
 555     {
 556         gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
 557                   kernel_name,
 558                   nb->dev_info->device_name,
 559                   cl_error);
 560     }
 561
 562     return kernel;
 563 }
 564
 565 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
 566  */
 567 static void
 568 nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
 569 {
 570
 571     cl_int               cl_error;
 572     cl_atomdata_t *      adat     = nb->atdat;
 573     cl_command_queue     ls       = nb->stream[InteractionLocality::Local];
 574
 575     size_t               local_work_size[3]   = {1, 1, 1};
 576     size_t               global_work_size[3]  = {1, 1, 1};
 577
 578     cl_int               shifts   = SHIFTS*3;
 579
 580     cl_int               arg_no;
 581
 582     cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
 583
 584     local_work_size[0]   = 64;
 585     // Round the total number of threads up from the array size
 586     global_work_size[0]  = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
 587
 588     arg_no    = 0;
 589     cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
 590     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 591     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
 592     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
 593     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 594
 595     cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr);
 596     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 597 }
 598
 599 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 600 static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
 601 {
 602     /* Init to 0 main kernel arrays */
 603     /* They will be later on initialized in select_nbnxn_kernel */
 604     // TODO: consider always creating all variants of the kernels here so that there is no
 605     // need for late call to clCreateKernel -- if that gives any advantage?
 606     memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
 607     memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
 608     memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
 609     memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
 610
 611     /* Init pruning kernels
 612      *
 613      * TODO: we could avoid creating kernels if dynamic pruning is turned off,
 614      * but ATM that depends on force flags not passed into the initialization.
 615      */
 616     nb->kernel_pruneonly[epruneFirst]   = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl");
 617     nb->kernel_pruneonly[epruneRolling] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
 618
 619     /* Init auxiliary kernels */
 620     nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
 621 }
 622
 623 /*! \brief Initializes simulation constant data.
 624  *
 625  *  Initializes members of the atomdata and nbparam structs and
 626  *  clears e/fshift output buffers.
 627  */
 628 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 629                                  const interaction_const_t      *ic,
 630                                  const PairlistParams           &listParams,
 631                                  const nbnxn_atomdata_t::Params &nbatParams)
 632 {
 633     init_atomdata_first(nb->atdat, nbatParams.numTypes, nb->dev_rundata);
 634     init_nbparam(nb->nbparam, ic, listParams, nbatParams, nb->dev_rundata);
 635 }
 636
 637
 638 //! This function is documented in the header file
 639 gmx_nbnxn_ocl_t *
 640 gpu_init(const gmx_device_info_t   *deviceInfo,
 641          const interaction_const_t *ic,
 642          const PairlistParams      &listParams,
 643          const nbnxn_atomdata_t    *nbat,
 644          const int                  rank,
 645          const gmx_bool             bLocalAndNonlocal)
 646 {
 647     gmx_nbnxn_ocl_t            *nb;
 648     cl_int                      cl_error;
 649     cl_command_queue_properties queue_properties;
 650
 651     assert(ic);
 652
 653     snew(nb, 1);
 654     snew(nb->atdat, 1);
 655     snew(nb->nbparam, 1);
 656     snew(nb->plist[InteractionLocality::Local], 1);
 657     if (bLocalAndNonlocal)
 658     {
 659         snew(nb->plist[InteractionLocality::NonLocal], 1);
 660     }
 661
 662     nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
 663
 664     nb->timers = new cl_timers_t();
 665     snew(nb->timings, 1);
 666
 667     /* set device info, just point it to the right GPU among the detected ones */
 668     nb->dev_info = deviceInfo;
 669     snew(nb->dev_rundata, 1);
 670
 671     /* init nbst */
 672     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
 673     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
 674     pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
 675
 676     init_plist(nb->plist[InteractionLocality::Local]);
 677
 678     /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
 679     nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 680
 681     /* Create queues only after bDoTime has been initialized */
 682     if (nb->bDoTime)
 683     {
 684         queue_properties = CL_QUEUE_PROFILING_ENABLE;
 685     }
 686     else
 687     {
 688         queue_properties = 0;
 689     }
 690
 691     nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
 692
 693     /* local/non-local GPU streams */
 694     nb->stream[InteractionLocality::Local] =
 695         clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 696     if (CL_SUCCESS != cl_error)
 697     {
 698         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 699                   rank,
 700                   nb->dev_info->device_name,
 701                   cl_error);
 702     }
 703
 704     if (nb->bUseTwoStreams)
 705     {
 706         init_plist(nb->plist[InteractionLocality::NonLocal]);
 707
 708         nb->stream[InteractionLocality::NonLocal] =
 709             clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 710         if (CL_SUCCESS != cl_error)
 711         {
 712             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 713                       rank,
 714                       nb->dev_info->device_name,
 715                       cl_error);
 716         }
 717     }
 718
 719     if (nb->bDoTime)
 720     {
 721         init_timings(nb->timings);
 722     }
 723
 724     nbnxn_ocl_init_const(nb, ic, listParams, nbat->params());
 725
 726     /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
 727      * TODO: decide about NVIDIA
 728      */
 729     nb->bPrefetchLjParam =
 730         (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr) &&
 731         ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (nb->dev_info->vendor_e == OCL_VENDOR_INTEL)
 732          || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
 733
 734     /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
 735      * but sadly this is not supported in OpenCL (yet?). Consider adding it if
 736      * it becomes supported.
 737      */
 738     nbnxn_gpu_compile_kernels(nb);
 739     nbnxn_gpu_init_kernels(nb);
 740
 741     /* clear energy and shift force outputs */
 742     nbnxn_ocl_clear_e_fshift(nb);
 743
 744     if (debug)
 745     {
 746         fprintf(debug, "Initialized OpenCL data structures.\n");
 747     }
 748
 749     return nb;
 750 }
 751
 752 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
 753  */
 754 static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
 755 {
 756     if (natoms_clear == 0)
 757     {
 758         return;
 759     }
 760
 761     cl_int gmx_used_in_debug cl_error;
 762
 763     cl_atomdata_t           *atomData = nb->atdat;
 764     cl_command_queue         ls       = nb->stream[InteractionLocality::Local];
 765     cl_float                 value    = 0.0F;
 766
 767     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
 768                                    0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr);
 769     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueFillBuffer failed: " +
 770                                                 ocl_get_error_string(cl_error)).c_str());
 771 }
 772
 773 //! This function is documented in the header file
 774 void gpu_clear_outputs(gmx_nbnxn_ocl_t *nb,
 775                        bool             computeVirial)
 776 {
 777     nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
 778     /* clear shift force array and energies if the outputs were
 779        used in the current step */
 780     if (computeVirial)
 781     {
 782         nbnxn_ocl_clear_e_fshift(nb);
 783     }
 784
 785     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
 786     cl_int gmx_unused cl_error;
 787     cl_error = clFlush(nb->stream[InteractionLocality::Local]);
 788     assert(CL_SUCCESS == cl_error);
 789 }
 790
 791 //! This function is documented in the header file
 792 void gpu_init_pairlist(gmx_nbnxn_ocl_t           *nb,
 793                        const NbnxnPairlistGpu    *h_plist,
 794                        const InteractionLocality  iloc)
 795 {
 796     char             sbuf[STRLEN];
 797     // Timing accumulation should happen only if there was work to do
 798     // because getLastRangeTime() gets skipped with empty lists later
 799     // which leads to the counter not being reset.
 800     bool             bDoTime    = ((nb->bDoTime == CL_TRUE) && !h_plist->sci.empty());
 801     cl_command_queue stream     = nb->stream[iloc];
 802     cl_plist_t      *d_plist    = nb->plist[iloc];
 803
 804     if (d_plist->na_c < 0)
 805     {
 806         d_plist->na_c = h_plist->na_ci;
 807     }
 808     else
 809     {
 810         if (d_plist->na_c != h_plist->na_ci)
 811         {
 812             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 813                     d_plist->na_c, h_plist->na_ci);
 814             gmx_incons(sbuf);
 815         }
 816     }
 817
 818     gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
 819
 820     if (bDoTime)
 821     {
 822         iTimers.pl_h2d.openTimingRegion(stream);
 823         iTimers.didPairlistH2D = true;
 824     }
 825
 826     // TODO most of this function is same in CUDA and OpenCL, move into the header
 827     DeviceContext context = nb->dev_rundata->context;
 828
 829     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(),
 830                            &d_plist->nsci, &d_plist->sci_nalloc, context);
 831     copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
 832                        stream, GpuApiCallBehavior::Async,
 833                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 834
 835     reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
 836                            &d_plist->ncj4, &d_plist->cj4_nalloc, context);
 837     copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
 838                        stream, GpuApiCallBehavior::Async,
 839                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 840
 841     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
 842                            &d_plist->nimask, &d_plist->imask_nalloc, context);
 843
 844     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(),
 845                            &d_plist->nexcl, &d_plist->excl_nalloc, context);
 846     copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
 847                        stream, GpuApiCallBehavior::Async,
 848                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 849
 850     if (bDoTime)
 851     {
 852         iTimers.pl_h2d.closeTimingRegion(stream);
 853     }
 854
 855     /* need to prune the pair list during the next step */
 856     d_plist->haveFreshList = true;
 857 }
 858
 859 //! This function is documented in the header file
 860 void gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
 861                          const nbnxn_atomdata_t *nbatom)
 862 {
 863     cl_atomdata_t   *adat  = nb->atdat;
 864     cl_command_queue ls    = nb->stream[InteractionLocality::Local];
 865
 866     /* only if we have a dynamic box */
 867     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 868     {
 869         ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0,
 870                            SHIFTS * adat->shift_vec_elem_size, ls, nullptr);
 871         adat->bShiftVecUploaded = CL_TRUE;
 872     }
 873 }
 874
 875 //! This function is documented in the header file
 876 void gpu_init_atomdata(gmx_nbnxn_ocl_t        *nb,
 877                        const nbnxn_atomdata_t *nbat)
 878 {
 879     cl_int           cl_error;
 880     int              nalloc, natoms;
 881     bool             realloced;
 882     bool             bDoTime = nb->bDoTime == CL_TRUE;
 883     cl_timers_t     *timers  = nb->timers;
 884     cl_atomdata_t   *d_atdat = nb->atdat;
 885     cl_command_queue ls      = nb->stream[InteractionLocality::Local];
 886
 887     natoms    = nbat->numAtoms();
 888     realloced = false;
 889
 890     if (bDoTime)
 891     {
 892         /* time async copy */
 893         timers->atdat.openTimingRegion(ls);
 894     }
 895
 896     /* need to reallocate if we have to copy more atoms than the amount of space
 897        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 898     if (natoms > d_atdat->nalloc)
 899     {
 900         nalloc = over_alloc_small(natoms);
 901
 902         /* free up first if the arrays have already been initialized */
 903         if (d_atdat->nalloc != -1)
 904         {
 905             freeDeviceBuffer(&d_atdat->f);
 906             freeDeviceBuffer(&d_atdat->xq);
 907             freeDeviceBuffer(&d_atdat->lj_comb);
 908             freeDeviceBuffer(&d_atdat->atom_types);
 909         }
 910
 911         d_atdat->f_elem_size = sizeof(rvec);
 912
 913         d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
 914                                     nalloc * d_atdat->f_elem_size, nullptr, &cl_error);
 915         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 916                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 917
 918         d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 919                                      nalloc * sizeof(cl_float4), nullptr, &cl_error);
 920         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 921                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 922
 923         if (useLjCombRule(nb->nbparam->vdwtype))
 924         {
 925             d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 926                                               nalloc * sizeof(cl_float2), nullptr, &cl_error);
 927             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 928                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 929         }
 930         else
 931         {
 932             d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
 933                                                  nalloc * sizeof(int), nullptr, &cl_error);
 934             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 935                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 936         }
 937
 938         d_atdat->nalloc = nalloc;
 939         realloced       = true;
 940     }
 941
 942     d_atdat->natoms       = natoms;
 943     d_atdat->natoms_local = nbat->natoms_local;
 944
 945     /* need to clear GPU f output if realloc happened */
 946     if (realloced)
 947     {
 948         nbnxn_ocl_clear_f(nb, nalloc);
 949     }
 950
 951     if (useLjCombRule(nb->nbparam->vdwtype))
 952     {
 953         ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0,
 954                            natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 955     }
 956     else
 957     {
 958         ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0,
 959                            natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 960
 961     }
 962
 963     if (bDoTime)
 964     {
 965         timers->atdat.closeTimingRegion(ls);
 966     }
 967
 968     /* kick off the tasks enqueued above to ensure concurrency with the search */
 969     cl_error = clFlush(ls);
 970     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
 971                        ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 972 }
 973
 974 /*! \brief Releases an OpenCL kernel pointer */
 975 static void free_kernel(cl_kernel *kernel_ptr)
 976 {
 977     cl_int gmx_unused cl_error;
 978
 979     assert(nullptr != kernel_ptr);
 980
 981     if (*kernel_ptr)
 982     {
 983         cl_error = clReleaseKernel(*kernel_ptr);
 984         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseKernel failed: " + ocl_get_error_string(cl_error)).c_str());
 985
 986         *kernel_ptr = nullptr;
 987     }
 988 }
 989
 990 /*! \brief Releases a list of OpenCL kernel pointers */
 991 static void free_kernels(cl_kernel *kernels, int count)
 992 {
 993     int i;
 994
 995     for (i = 0; i < count; i++)
 996     {
 997         free_kernel(kernels + i);
 998     }
 999 }
1000
1001 /*! \brief Free the OpenCL runtime data (context and program).
1002  *
1003  *  The function releases the OpenCL context and program assuciated with the
1004  *  device that the calling PP rank is running on.
1005  *
1006  *  \param runData [in]  porinter to the structure with runtime data.
1007  */
1008 static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
1009 {
1010     if (runData == nullptr)
1011     {
1012         return;
1013     }
1014
1015     cl_int gmx_unused cl_error;
1016
1017     if (runData->context)
1018     {
1019         cl_error         = clReleaseContext(runData->context);
1020         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str());
1021         runData->context = nullptr;
1022     }
1023
1024     if (runData->program)
1025     {
1026         cl_error         = clReleaseProgram(runData->program);
1027         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str());
1028         runData->program = nullptr;
1029     }
1030
1031 }
1032
1033 //! This function is documented in the header file
1034 void gpu_free(gmx_nbnxn_ocl_t *nb)
1035 {
1036     if (nb == nullptr)
1037     {
1038         return;
1039     }
1040
1041     /* Free kernels */
1042     int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
1043     free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);
1044
1045     kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
1046     free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);
1047
1048     kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
1049     free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);
1050
1051     kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
1052     free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
1053
1054     free_kernel(&(nb->kernel_zero_e_fshift));
1055
1056     /* Free atdat */
1057     freeDeviceBuffer(&(nb->atdat->xq));
1058     freeDeviceBuffer(&(nb->atdat->f));
1059     freeDeviceBuffer(&(nb->atdat->e_lj));
1060     freeDeviceBuffer(&(nb->atdat->e_el));
1061     freeDeviceBuffer(&(nb->atdat->fshift));
1062     freeDeviceBuffer(&(nb->atdat->lj_comb));
1063     freeDeviceBuffer(&(nb->atdat->atom_types));
1064     freeDeviceBuffer(&(nb->atdat->shift_vec));
1065     sfree(nb->atdat);
1066
1067     /* Free nbparam */
1068     freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
1069     freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
1070     freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
1071     sfree(nb->nbparam);
1072
1073     /* Free plist */
1074     auto *plist = nb->plist[InteractionLocality::Local];
1075     freeDeviceBuffer(&plist->sci);
1076     freeDeviceBuffer(&plist->cj4);
1077     freeDeviceBuffer(&plist->imask);
1078     freeDeviceBuffer(&plist->excl);
1079     sfree(plist);
1080     if (nb->bUseTwoStreams)
1081     {
1082         auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
1083         freeDeviceBuffer(&plist_nl->sci);
1084         freeDeviceBuffer(&plist_nl->cj4);
1085         freeDeviceBuffer(&plist_nl->imask);
1086         freeDeviceBuffer(&plist_nl->excl);
1087         sfree(plist_nl);
1088     }
1089
1090     /* Free nbst */
1091     pfree(nb->nbst.e_lj);
1092     nb->nbst.e_lj = nullptr;
1093
1094     pfree(nb->nbst.e_el);
1095     nb->nbst.e_el = nullptr;
1096
1097     pfree(nb->nbst.fshift);
1098     nb->nbst.fshift = nullptr;
1099
1100     /* Free command queues */
1101     clReleaseCommandQueue(nb->stream[InteractionLocality::Local]);
1102     nb->stream[InteractionLocality::Local] = nullptr;
1103     if (nb->bUseTwoStreams)
1104     {
1105         clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]);
1106         nb->stream[InteractionLocality::NonLocal] = nullptr;
1107     }
1108     /* Free other events */
1109     if (nb->nonlocal_done)
1110     {
1111         clReleaseEvent(nb->nonlocal_done);
1112         nb->nonlocal_done = nullptr;
1113     }
1114     if (nb->misc_ops_and_local_H2D_done)
1115     {
1116         clReleaseEvent(nb->misc_ops_and_local_H2D_done);
1117         nb->misc_ops_and_local_H2D_done = nullptr;
1118     }
1119
1120     free_gpu_device_runtime_data(nb->dev_rundata);
1121     sfree(nb->dev_rundata);
1122
1123     /* Free timers and timings */
1124     delete nb->timers;
1125     sfree(nb->timings);
1126     sfree(nb);
1127
1128     if (debug)
1129     {
1130         fprintf(debug, "Cleaned up OpenCL data structures.\n");
1131     }
1132 }
1133
1134 //! This function is documented in the header file
1135 gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_ocl_t *nb)
1136 {
1137     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
1138 }
1139
1140 //! This function is documented in the header file
1141 void gpu_reset_timings(nonbonded_verlet_t* nbv)
1142 {
1143     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1144     {
1145         init_timings(nbv->gpu_nbv->timings);
1146     }
1147 }
1148
1149 //! This function is documented in the header file
1150 int gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
1151 {
1152     return nb != nullptr ?
1153            gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
1154 }
1155
1156 //! This function is documented in the header file
1157 gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
1158 {
1159     return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
1160             (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
1161 }
1162
1163 } // namespace Nbnxm