src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu_data_mgmt.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  40  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  41  *  \author Szilárd Páll <pall.szilard@gmail.com>
  42  */
  43 #include "gmxpre.h"
  44
  45 #include <assert.h>
  46 #include <math.h>
  47 #include <stdarg.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50 #include <string.h>
  51
  52 #include "gromacs/gpu_utils/gpu_utils.h"
  53 #include "gromacs/gpu_utils/oclutils.h"
  54 #include "gromacs/hardware/gpu_hw_info.h"
  55 #include "gromacs/math/vectypes.h"
  56 #include "gromacs/mdlib/force_flags.h"
  57 #include "gromacs/mdlib/nb_verlet.h"
  58 #include "gromacs/mdlib/nbnxn_consts.h"
  59 #include "gromacs/mdlib/nbnxn_gpu.h"
  60 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  61 #include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
  62 #include "gromacs/mdtypes/interaction_const.h"
  63 #include "gromacs/mdtypes/md_enums.h"
  64 #include "gromacs/pbcutil/ishift.h"
  65 #include "gromacs/timing/gpu_timing.h"
  66 #include "gromacs/utility/cstringutil.h"
  67 #include "gromacs/utility/fatalerror.h"
  68 #include "gromacs/utility/gmxassert.h"
  69 #include "gromacs/utility/real.h"
  70 #include "gromacs/utility/smalloc.h"
  71
  72 #include "nbnxn_ocl_internal.h"
  73 #include "nbnxn_ocl_types.h"
  74
  75 /*! \brief This parameter should be determined heuristically from the
  76  * kernel execution times
  77  *
  78  * This value is best for small systems on a single AMD Radeon R9 290X
  79  * (and about 5% faster than 40, which is the default for CUDA
  80  * devices). Larger simulation systems were quite insensitive to the
  81  * value of this parameter.
  82  */
  83 static unsigned int gpu_min_ci_balanced_factor = 50;
  84
  85
  86 /*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
  87  *
  88  * Full doc in nbnxn_ocl_internal.h */
  89 bool useLjCombRule(int vdwType)
  90 {
  91     return (vdwType == evdwOclCUTCOMBGEOM ||
  92             vdwType == evdwOclCUTCOMBLB);
  93 }
  94
  95 /*! \brief Free device buffers
  96  *
  97  * If the pointers to the size variables are NULL no resetting happens.
  98  */
  99 void ocl_free_buffered(cl_mem d_ptr, int *n, int *nalloc)
 100 {
 101     cl_int gmx_unused cl_error;
 102
 103     if (d_ptr)
 104     {
 105         cl_error = clReleaseMemObject(d_ptr);
 106         assert(cl_error == CL_SUCCESS);
 107         // TODO: handle errors
 108     }
 109
 110     if (n)
 111     {
 112         *n = -1;
 113     }
 114
 115     if (nalloc)
 116     {
 117         *nalloc = -1;
 118     }
 119 }
 120
 121 /*! \brief Reallocation device buffers
 122  *
 123  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 124  *  the location pointed by h_src host-side pointer is done. Allocation is
 125  *  buffered and therefore freeing is only needed if the previously allocated
 126  *  space is not enough.
 127  *  The H2D copy is launched in command queue s and can be done synchronously or
 128  *  asynchronously (the default is the latter).
 129  *  If copy_event is not NULL, on return it will contain an event object
 130  *  identifying the H2D copy. The event can further be used to queue a wait
 131  *  for this operation or to query profiling information.
 132  *  OpenCL equivalent of cu_realloc_buffered.
 133  */
 134 void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
 135                           size_t type_size,
 136                           int *curr_size, int *curr_alloc_size,
 137                           int req_size,
 138                           cl_context context,
 139                           cl_command_queue s,
 140                           bool bAsync = true,
 141                           cl_event *copy_event = NULL)
 142 {
 143     if (d_dest == NULL || req_size < 0)
 144     {
 145         return;
 146     }
 147
 148     /* reallocate only if the data does not fit = allocation size is smaller
 149        than the current requested size */
 150     if (req_size > *curr_alloc_size)
 151     {
 152         cl_int gmx_unused cl_error;
 153
 154         /* only free if the array has already been initialized */
 155         if (*curr_alloc_size >= 0)
 156         {
 157             ocl_free_buffered(*d_dest, curr_size, curr_alloc_size);
 158         }
 159
 160         *curr_alloc_size = over_alloc_large(req_size);
 161
 162         *d_dest = clCreateBuffer(context, CL_MEM_READ_WRITE, *curr_alloc_size * type_size, NULL, &cl_error);
 163         assert(cl_error == CL_SUCCESS);
 164         // TODO: handle errors, check clCreateBuffer flags
 165     }
 166
 167     /* size could have changed without actual reallocation */
 168     *curr_size = req_size;
 169
 170     /* upload to device */
 171     if (h_src)
 172     {
 173         if (bAsync)
 174         {
 175             ocl_copy_H2D_async(*d_dest, h_src, 0, *curr_size * type_size, s, copy_event);
 176         }
 177         else
 178         {
 179             ocl_copy_H2D(*d_dest, h_src,  0, *curr_size * type_size, s);
 180         }
 181     }
 182 }
 183
 184 /*! \brief Releases the input OpenCL buffer */
 185 static void free_ocl_buffer(cl_mem *buffer)
 186 {
 187     cl_int gmx_unused cl_error;
 188
 189     assert(NULL != buffer);
 190
 191     if (*buffer)
 192     {
 193         cl_error = clReleaseMemObject(*buffer);
 194         assert(CL_SUCCESS == cl_error);
 195         *buffer = NULL;
 196     }
 197 }
 198
 199 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
 200  * and the table GPU array.
 201  *
 202  * If called with an already allocated table, it just re-uploads the
 203  * table.
 204  */
 205 static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
 206                                            cl_nbparam_t                    *nbp,
 207                                            const gmx_device_runtime_data_t *runData)
 208 {
 209     cl_mem       coul_tab;
 210
 211     cl_int       cl_error;
 212
 213     if (nbp->coulomb_tab_climg2d != NULL)
 214     {
 215         free_ocl_buffer(&(nbp->coulomb_tab_climg2d));
 216     }
 217
 218     /* Switched from using textures to using buffers */
 219     // TODO: decide which alternative is most efficient - textures or buffers.
 220     /*
 221        cl_image_format array_format;
 222
 223        array_format.image_channel_data_type = CL_FLOAT;
 224        array_format.image_channel_order     = CL_R;
 225
 226        coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 227        &array_format, tabsize, 1, 0, ftmp, &cl_error);
 228      */
 229
 230     coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
 231     assert(cl_error == CL_SUCCESS);
 232     // TODO: handle errors, check clCreateBuffer flags
 233
 234     nbp->coulomb_tab_climg2d  = coul_tab;
 235     nbp->coulomb_tab_size     = ic->tabq_size;
 236     nbp->coulomb_tab_scale    = ic->tabq_scale;
 237 }
 238
 239
 240 /*! \brief Initializes the atomdata structure first time, it only gets filled at
 241     pair-search.
 242  */
 243 static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
 244 {
 245     cl_int cl_error;
 246
 247     ad->ntypes  = ntypes;
 248
 249     /* An element of the shift_vec device buffer has the same size as one element
 250        of the host side shift_vec buffer. */
 251     ad->shift_vec_elem_size = sizeof(*(((nbnxn_atomdata_t*)0)->shift_vec));
 252
 253     // TODO: handle errors, check clCreateBuffer flags
 254     ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, NULL, &cl_error);
 255     assert(cl_error == CL_SUCCESS);
 256     ad->bShiftVecUploaded = false;
 257
 258     /* An element of the fshift device buffer has the same size as one element
 259        of the host side fshift buffer. */
 260     ad->fshift_elem_size = sizeof(*(((cl_nb_staging_t*)0)->fshift));
 261
 262     ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->fshift_elem_size, NULL, &cl_error);
 263     assert(cl_error == CL_SUCCESS);
 264     // TODO: handle errors, check clCreateBuffer flags
 265
 266     ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
 267     assert(cl_error == CL_SUCCESS);
 268     // TODO: handle errors, check clCreateBuffer flags
 269
 270     ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
 271     assert(cl_error == CL_SUCCESS);
 272     // TODO: handle errors, check clCreateBuffer flags
 273
 274     /* initialize to NULL pointers to data that is not allocated here and will
 275        need reallocation in nbnxn_gpu_init_atomdata */
 276     ad->xq = NULL;
 277     ad->f  = NULL;
 278
 279     /* size -1 indicates that the respective array hasn't been initialized yet */
 280     ad->natoms = -1;
 281     ad->nalloc = -1;
 282 }
 283
 284 /*! \brief Copies all parameters related to the cut-off from ic to nbp
 285  */
 286 static void set_cutoff_parameters(cl_nbparam_t              *nbp,
 287                                   const interaction_const_t *ic,
 288                                   const NbnxnListParameters *listParams)
 289 {
 290     nbp->ewald_beta       = ic->ewaldcoeff_q;
 291     nbp->sh_ewald         = ic->sh_ewald;
 292     nbp->epsfac           = ic->epsfac;
 293     nbp->two_k_rf         = 2.0 * ic->k_rf;
 294     nbp->c_rf             = ic->c_rf;
 295     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
 296     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
 297     nbp->rlist_sq         = listParams->rlistOuter * listParams->rlistOuter;
 298
 299     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
 300     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
 301
 302     nbp->rvdw_switch      = ic->rvdw_switch;
 303     nbp->dispersion_shift = ic->dispersion_shift;
 304     nbp->repulsion_shift  = ic->repulsion_shift;
 305     nbp->vdw_switch       = ic->vdw_switch;
 306 }
 307
 308 /*! \brief Returns the kinds of electrostatics and Vdw OpenCL
 309  *  kernels that will be used.
 310  *
 311  * Respectively, these values are from enum eelOcl and enum
 312  * evdwOcl. */
 313 static void
 314 map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
 315                                             int                        combRule,
 316                                             int                       *gpu_eeltype,
 317                                             int                       *gpu_vdwtype)
 318 {
 319     if (ic->vdwtype == evdwCUT)
 320     {
 321         switch (ic->vdw_modifier)
 322         {
 323             case eintmodNONE:
 324             case eintmodPOTSHIFT:
 325                 switch (combRule)
 326                 {
 327                     case ljcrNONE:
 328                         *gpu_vdwtype = evdwOclCUT;
 329                         break;
 330                     case ljcrGEOM:
 331                         *gpu_vdwtype = evdwOclCUTCOMBGEOM;
 332                         break;
 333                     case ljcrLB:
 334                         *gpu_vdwtype = evdwOclCUTCOMBLB;
 335                         break;
 336                     default:
 337                         gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
 338                         break;
 339                 }
 340                 break;
 341             case eintmodFORCESWITCH:
 342                 *gpu_vdwtype = evdwOclFSWITCH;
 343                 break;
 344             case eintmodPOTSWITCH:
 345                 *gpu_vdwtype = evdwOclPSWITCH;
 346                 break;
 347             default:
 348                 gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
 349                 break;
 350         }
 351     }
 352     else if (ic->vdwtype == evdwPME)
 353     {
 354         if (ic->ljpme_comb_rule == ljcrGEOM)
 355         {
 356             *gpu_vdwtype = evdwOclEWALDGEOM;
 357         }
 358         else
 359         {
 360             *gpu_vdwtype = evdwOclEWALDLB;
 361         }
 362     }
 363     else
 364     {
 365         gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
 366     }
 367
 368     if (ic->eeltype == eelCUT)
 369     {
 370         *gpu_eeltype = eelOclCUT;
 371     }
 372     else if (EEL_RF(ic->eeltype))
 373     {
 374         *gpu_eeltype = eelOclRF;
 375     }
 376     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 377     {
 378         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 379         *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
 380     }
 381     else
 382     {
 383         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 384         gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
 385     }
 386 }
 387
 388 /*! \brief Initializes the nonbonded parameter data structure.
 389  */
 390 static void init_nbparam(cl_nbparam_t                    *nbp,
 391                          const interaction_const_t       *ic,
 392                          const NbnxnListParameters       *listParams,
 393                          const nbnxn_atomdata_t          *nbat,
 394                          const gmx_device_runtime_data_t *runData)
 395 {
 396     int         ntypes, nnbfp, nnbfp_comb;
 397     cl_int      cl_error;
 398
 399
 400     ntypes = nbat->ntype;
 401
 402     set_cutoff_parameters(nbp, ic, listParams);
 403
 404     map_interaction_types_to_gpu_kernel_flavors(ic,
 405                                                 nbat->comb_rule,
 406                                                 &(nbp->eeltype),
 407                                                 &(nbp->vdwtype));
 408
 409     if (ic->vdwtype == evdwPME)
 410     {
 411         if (ic->ljpme_comb_rule == ljcrGEOM)
 412         {
 413             assert(nbat->comb_rule == ljcrGEOM);
 414         }
 415         else
 416         {
 417             assert(nbat->comb_rule == ljcrLB);
 418         }
 419     }
 420     /* generate table for PME */
 421     nbp->coulomb_tab_climg2d = NULL;
 422     if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
 423     {
 424         init_ewald_coulomb_force_table(ic, nbp, runData);
 425     }
 426     else
 427     // TODO: improvement needed.
 428     // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
 429     // don't accept NULL values for image2D parameters.
 430     {
 431         /* Switched from using textures to using buffers */
 432         // TODO: decide which alternative is most efficient - textures or buffers.
 433         /*
 434            cl_image_format array_format;
 435
 436            array_format.image_channel_data_type = CL_FLOAT;
 437            array_format.image_channel_order     = CL_R;
 438
 439            nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 440             &array_format, 1, 1, 0, NULL, &cl_error);
 441          */
 442
 443         nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
 444         // TODO: handle errors
 445     }
 446
 447     nnbfp      = 2*ntypes*ntypes;
 448     nnbfp_comb = 2*ntypes;
 449
 450     {
 451         /* Switched from using textures to using buffers */
 452         // TODO: decide which alternative is most efficient - textures or buffers.
 453         /*
 454            cl_image_format array_format;
 455
 456            array_format.image_channel_data_type = CL_FLOAT;
 457            array_format.image_channel_order     = CL_R;
 458
 459            nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 460             &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
 461          */
 462
 463         nbp->nbfp_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error);
 464         assert(cl_error == CL_SUCCESS);
 465         // TODO: handle errors
 466
 467         if (ic->vdwtype == evdwPME)
 468         {
 469             /* Switched from using textures to using buffers */
 470             // TODO: decide which alternative is most efficient - textures or buffers.
 471             /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 472                 &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
 473             nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error);
 474
 475
 476             assert(cl_error == CL_SUCCESS);
 477             // TODO: handle errors
 478         }
 479         else
 480         {
 481             // TODO: improvement needed.
 482             // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
 483             // don't accept NULL values for image2D parameters.
 484             /* Switched from using textures to using buffers */
 485             // TODO: decide which alternative is most efficient - textures or buffers.
 486             /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 487                 &array_format, 1, 1, 0, NULL, &cl_error);*/
 488             nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
 489
 490
 491             assert(cl_error == CL_SUCCESS);
 492             // TODO: handle errors
 493         }
 494     }
 495 }
 496
 497 //! This function is documented in the header file
 498 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 499                                         const interaction_const_t   *ic,
 500                                         const NbnxnListParameters   *listParams)
 501 {
 502     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 503     {
 504         return;
 505     }
 506     gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
 507     cl_nbparam_t       *nbp = nb->nbparam;
 508
 509     set_cutoff_parameters(nbp, ic, listParams);
 510
 511     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
 512
 513     init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata);
 514 }
 515
 516 /*! \brief Initializes the pair list data structure.
 517  */
 518 static void init_plist(cl_plist_t *pl)
 519 {
 520     /* initialize to NULL pointers to data that is not allocated here and will
 521        need reallocation in nbnxn_gpu_init_pairlist */
 522     pl->sci     = NULL;
 523     pl->cj4     = NULL;
 524     pl->excl    = NULL;
 525
 526     /* size -1 indicates that the respective array hasn't been initialized yet */
 527     pl->na_c        = -1;
 528     pl->nsci        = -1;
 529     pl->sci_nalloc  = -1;
 530     pl->ncj4        = -1;
 531     pl->cj4_nalloc  = -1;
 532     pl->nexcl       = -1;
 533     pl->excl_nalloc = -1;
 534     pl->bDoPrune    = false;
 535 }
 536
 537 /*! \brief Initializes the timer data structure.
 538  */
 539 static void init_timers(cl_timers_t gmx_unused *t, bool gmx_unused bUseTwoStreams)
 540 {
 541     /* Nothing to initialize for OpenCL */
 542 }
 543
 544 /*! \brief Initializes the timings data structure.
 545  */
 546 static void init_timings(gmx_wallclock_gpu_t *t)
 547 {
 548     int i, j;
 549
 550     t->nb_h2d_t = 0.0;
 551     t->nb_d2h_t = 0.0;
 552     t->nb_c     = 0;
 553     t->pl_h2d_t = 0.0;
 554     t->pl_h2d_c = 0;
 555     for (i = 0; i < 2; i++)
 556     {
 557         for (j = 0; j < 2; j++)
 558         {
 559             t->ktime[i][j].t = 0.0;
 560             t->ktime[i][j].c = 0;
 561         }
 562     }
 563 }
 564
 565 /*! \brief Creates context for OpenCL GPU given by \p mygpu
 566  *
 567  * A fatal error results if creation fails.
 568  *
 569  * \param[inout] runtimeData runtime data including program and context
 570  * \param[in]    devInfo     device info struct
 571  * \param[in]    rank        MPI rank (for error reporting)
 572  */
 573 static void
 574 nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
 575                          const gmx_device_info_t   *devInfo,
 576                          int                        rank)
 577 {
 578     cl_context_properties     context_properties[3];
 579     cl_platform_id            platform_id;
 580     cl_device_id              device_id;
 581     cl_context                context;
 582     cl_int                    cl_error;
 583
 584     assert(runtimeData != NULL);
 585     assert(devInfo != NULL);
 586
 587     platform_id      = devInfo->ocl_gpu_id.ocl_platform_id;
 588     device_id        = devInfo->ocl_gpu_id.ocl_device_id;
 589
 590     context_properties[0] = CL_CONTEXT_PLATFORM;
 591     context_properties[1] = (cl_context_properties) platform_id;
 592     context_properties[2] = 0; /* Terminates the list of properties */
 593
 594     context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_error);
 595     if (CL_SUCCESS != cl_error)
 596     {
 597         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
 598                   rank,
 599                   devInfo->device_name,
 600                   cl_error, ocl_get_error_string(cl_error).c_str());
 601         return;
 602     }
 603
 604     runtimeData->context = context;
 605 }
 606
 607 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 608 static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
 609                                          const char      *kernel_name)
 610 {
 611     cl_kernel kernel;
 612     cl_int    cl_error;
 613
 614     kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
 615     if (CL_SUCCESS != cl_error)
 616     {
 617         gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
 618                   kernel_name,
 619                   nb->dev_info->device_name,
 620                   cl_error);
 621     }
 622
 623     return kernel;
 624 }
 625
 626 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
 627  */
 628 static void
 629 nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
 630 {
 631
 632     cl_int               cl_error;
 633     cl_atomdata_t *      adat     = nb->atdat;
 634     cl_command_queue     ls       = nb->stream[eintLocal];
 635
 636     size_t               local_work_size[3]   = {1, 1, 1};
 637     size_t               global_work_size[3]  = {1, 1, 1};
 638
 639     cl_int               shifts   = SHIFTS*3;
 640
 641     cl_int               arg_no;
 642
 643     cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
 644
 645     local_work_size[0]   = 64;
 646     // Round the total number of threads up from the array size
 647     global_work_size[0]  = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
 648
 649     arg_no    = 0;
 650     cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
 651     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 652     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
 653     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
 654     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 655
 656     cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
 657     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 658 }
 659
 660 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 661 static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
 662 {
 663     /* Init to 0 main kernel arrays */
 664     /* They will be later on initialized in select_nbnxn_kernel */
 665     memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
 666     memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
 667     memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
 668     memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
 669
 670     /* Init auxiliary kernels */
 671     nb->kernel_memset_f      = nbnxn_gpu_create_kernel(nb, "memset_f");
 672     nb->kernel_memset_f2     = nbnxn_gpu_create_kernel(nb, "memset_f2");
 673     nb->kernel_memset_f3     = nbnxn_gpu_create_kernel(nb, "memset_f3");
 674     nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
 675 }
 676
 677 /*! \brief Initializes simulation constant data.
 678  *
 679  *  Initializes members of the atomdata and nbparam structs and
 680  *  clears e/fshift output buffers.
 681  */
 682 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 683                                  const interaction_const_t      *ic,
 684                                  const NbnxnListParameters      *listParams,
 685                                  const nonbonded_verlet_group_t *nbv_group)
 686 {
 687     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata);
 688     init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_rundata);
 689 }
 690
 691
 692 //! This function is documented in the header file
 693 void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
 694                     const gmx_device_info_t   *deviceInfo,
 695                     const interaction_const_t *ic,
 696                     const NbnxnListParameters *listParams,
 697                     nonbonded_verlet_group_t  *nbv_grp,
 698                     int                        rank,
 699                     gmx_bool                   bLocalAndNonlocal)
 700 {
 701     gmx_nbnxn_ocl_t            *nb;
 702     cl_int                      cl_error;
 703     cl_command_queue_properties queue_properties;
 704
 705     assert(ic);
 706
 707     if (p_nb == NULL)
 708     {
 709         return;
 710     }
 711
 712     snew(nb, 1);
 713     snew(nb->atdat, 1);
 714     snew(nb->nbparam, 1);
 715     snew(nb->plist[eintLocal], 1);
 716     if (bLocalAndNonlocal)
 717     {
 718         snew(nb->plist[eintNonlocal], 1);
 719     }
 720
 721     nb->bUseTwoStreams = bLocalAndNonlocal;
 722
 723     snew(nb->timers, 1);
 724     snew(nb->timings, 1);
 725
 726     /* set device info, just point it to the right GPU among the detected ones */
 727     nb->dev_info = deviceInfo;
 728     snew(nb->dev_rundata, 1);
 729
 730     /* init to NULL the debug buffer */
 731     nb->debug_buffer = NULL;
 732
 733     /* init nbst */
 734     ocl_pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 735     ocl_pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 736     ocl_pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 737
 738     init_plist(nb->plist[eintLocal]);
 739
 740     /* OpenCL timing disabled if GMX_DISABLE_OCL_TIMING is defined. */
 741     /* TODO deprecate the first env var in the 2017 release. */
 742     nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL &&
 743                    getenv("GMX_DISABLE_GPU_TIMING") == NULL);
 744
 745     /* Create queues only after bDoTime has been initialized */
 746     if (nb->bDoTime)
 747     {
 748         queue_properties = CL_QUEUE_PROFILING_ENABLE;
 749     }
 750     else
 751     {
 752         queue_properties = 0;
 753     }
 754
 755     nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
 756
 757     /* local/non-local GPU streams */
 758     nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 759     if (CL_SUCCESS != cl_error)
 760     {
 761         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 762                   rank,
 763                   nb->dev_info->device_name,
 764                   cl_error);
 765         return;
 766     }
 767
 768     if (nb->bUseTwoStreams)
 769     {
 770         init_plist(nb->plist[eintNonlocal]);
 771
 772         nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 773         if (CL_SUCCESS != cl_error)
 774         {
 775             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 776                       rank,
 777                       nb->dev_info->device_name,
 778                       cl_error);
 779             return;
 780         }
 781     }
 782
 783     if (nb->bDoTime)
 784     {
 785         init_timers(nb->timers, nb->bUseTwoStreams);
 786         init_timings(nb->timings);
 787     }
 788
 789     nbnxn_ocl_init_const(nb, ic, listParams, nbv_grp);
 790
 791     /* Enable LJ param manual prefetch for AMD or if we request through env. var.
 792      * TODO: decide about NVIDIA
 793      */
 794     nb->bPrefetchLjParam =
 795         (getenv("GMX_OCL_DISABLE_I_PREFETCH") == NULL) &&
 796         ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != NULL));
 797
 798     /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
 799      * but sadly this is not supported in OpenCL (yet?). Consider adding it if
 800      * it becomes supported.
 801      */
 802     nbnxn_gpu_compile_kernels(nb);
 803     nbnxn_gpu_init_kernels(nb);
 804
 805     /* clear energy and shift force outputs */
 806     nbnxn_ocl_clear_e_fshift(nb);
 807
 808     *p_nb = nb;
 809
 810     if (debug)
 811     {
 812         fprintf(debug, "Initialized OpenCL data structures.\n");
 813     }
 814 }
 815
 816 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
 817  */
 818 static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
 819 {
 820     if (natoms_clear == 0)
 821     {
 822         return;
 823     }
 824
 825     cl_int               cl_error;
 826     cl_atomdata_t *      adat     = nb->atdat;
 827     cl_command_queue     ls       = nb->stream[eintLocal];
 828     cl_float             value    = 0.0f;
 829
 830     size_t               local_work_size[3]  = {1, 1, 1};
 831     size_t               global_work_size[3] = {1, 1, 1};
 832
 833     cl_int               arg_no;
 834
 835     cl_kernel            memset_f = nb->kernel_memset_f;
 836
 837     cl_uint              natoms_flat = natoms_clear * (sizeof(rvec)/sizeof(real));
 838
 839     local_work_size[0]  = 64;
 840     // Round the total number of threads up from the array size
 841     global_work_size[0] = ((natoms_flat + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
 842
 843
 844     arg_no    = 0;
 845     cl_error  = clSetKernelArg(memset_f, arg_no++, sizeof(cl_mem), &(adat->f));
 846     cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_float), &value);
 847     cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_uint), &natoms_flat);
 848     assert(cl_error == CL_SUCCESS);
 849
 850     cl_error = clEnqueueNDRangeKernel(ls, memset_f, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
 851     assert(cl_error == CL_SUCCESS);
 852 }
 853
 854 //! This function is documented in the header file
 855 void
 856 nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
 857                         int                flags)
 858 {
 859     nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
 860     /* clear shift force array and energies if the outputs were
 861        used in the current step */
 862     if (flags & GMX_FORCE_VIRIAL)
 863     {
 864         nbnxn_ocl_clear_e_fshift(nb);
 865     }
 866
 867     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
 868     cl_int gmx_unused cl_error;
 869     cl_error = clFlush(nb->stream[eintLocal]);
 870     assert(CL_SUCCESS == cl_error);
 871 }
 872
 873 //! This function is documented in the header file
 874 void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
 875                              const nbnxn_pairlist_t *h_plist,
 876                              int                     iloc)
 877 {
 878     char             sbuf[STRLEN];
 879     cl_command_queue stream     = nb->stream[iloc];
 880     cl_plist_t      *d_plist    = nb->plist[iloc];
 881
 882     if (d_plist->na_c < 0)
 883     {
 884         d_plist->na_c = h_plist->na_ci;
 885     }
 886     else
 887     {
 888         if (d_plist->na_c != h_plist->na_ci)
 889         {
 890             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 891                     d_plist->na_c, h_plist->na_ci);
 892             gmx_incons(sbuf);
 893         }
 894     }
 895
 896     ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
 897                          &d_plist->nsci, &d_plist->sci_nalloc,
 898                          h_plist->nsci,
 899                          nb->dev_rundata->context,
 900                          stream, true, &(nb->timers->pl_h2d_sci[iloc]));
 901
 902     ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
 903                          &d_plist->ncj4, &d_plist->cj4_nalloc,
 904                          h_plist->ncj4,
 905                          nb->dev_rundata->context,
 906                          stream, true, &(nb->timers->pl_h2d_cj4[iloc]));
 907
 908     ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
 909                          &d_plist->nexcl, &d_plist->excl_nalloc,
 910                          h_plist->nexcl,
 911                          nb->dev_rundata->context,
 912                          stream, true, &(nb->timers->pl_h2d_excl[iloc]));
 913
 914     /* need to prune the pair list during the next step */
 915     d_plist->bDoPrune = true;
 916 }
 917
 918 //! This function is documented in the header file
 919 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
 920                                const nbnxn_atomdata_t *nbatom)
 921 {
 922     cl_atomdata_t   *adat  = nb->atdat;
 923     cl_command_queue ls    = nb->stream[eintLocal];
 924
 925     /* only if we have a dynamic box */
 926     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 927     {
 928         ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0,
 929                            SHIFTS * adat->shift_vec_elem_size, ls, NULL);
 930         adat->bShiftVecUploaded = true;
 931     }
 932 }
 933
 934 //! This function is documented in the header file
 935 void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
 936                              const struct nbnxn_atomdata_t *nbat)
 937 {
 938     cl_int           cl_error;
 939     int              nalloc, natoms;
 940     bool             realloced;
 941     bool             bDoTime = nb->bDoTime;
 942     cl_timers_t     *timers  = nb->timers;
 943     cl_atomdata_t   *d_atdat = nb->atdat;
 944     cl_command_queue ls      = nb->stream[eintLocal];
 945
 946     natoms    = nbat->natoms;
 947     realloced = false;
 948
 949     /* need to reallocate if we have to copy more atoms than the amount of space
 950        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 951     if (natoms > d_atdat->nalloc)
 952     {
 953         nalloc = over_alloc_small(natoms);
 954
 955         /* free up first if the arrays have already been initialized */
 956         if (d_atdat->nalloc != -1)
 957         {
 958             ocl_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 959             ocl_free_buffered(d_atdat->xq, NULL, NULL);
 960             ocl_free_buffered(d_atdat->lj_comb, NULL, NULL);
 961             ocl_free_buffered(d_atdat->atom_types, NULL, NULL);
 962         }
 963
 964         d_atdat->f_elem_size = sizeof(rvec);
 965
 966         // TODO: handle errors, check clCreateBuffer flags
 967         d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, NULL, &cl_error);
 968         assert(CL_SUCCESS == cl_error);
 969
 970         // TODO: change the flag to read-only
 971         d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), NULL, &cl_error);
 972         assert(CL_SUCCESS == cl_error);
 973         // TODO: handle errors, check clCreateBuffer flags
 974
 975         if (useLjCombRule(nb->nbparam->vdwtype))
 976         {
 977             // TODO: change the flag to read-only
 978             d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float2), NULL, &cl_error);
 979             assert(CL_SUCCESS == cl_error);
 980             // TODO: handle errors, check clCreateBuffer flags
 981         }
 982         else
 983         {
 984             // TODO: change the flag to read-only
 985             d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), NULL, &cl_error);
 986             assert(CL_SUCCESS == cl_error);
 987             // TODO: handle errors, check clCreateBuffer flags
 988         }
 989
 990         d_atdat->nalloc = nalloc;
 991         realloced       = true;
 992     }
 993
 994     d_atdat->natoms       = natoms;
 995     d_atdat->natoms_local = nbat->natoms_local;
 996
 997     /* need to clear GPU f output if realloc happened */
 998     if (realloced)
 999     {
1000         nbnxn_ocl_clear_f(nb, nalloc);
1001     }
1002
1003     if (useLjCombRule(nb->nbparam->vdwtype))
1004     {
1005         ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0,
1006                            natoms*sizeof(cl_float2), ls, bDoTime ? &(timers->atdat) : NULL);
1007     }
1008     else
1009     {
1010         ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
1011                            natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL);
1012
1013     }
1014
1015     /* kick off the tasks enqueued above to ensure concurrency with the search */
1016     cl_error = clFlush(ls);
1017     assert(CL_SUCCESS == cl_error);
1018 }
1019
1020 /*! \brief Releases an OpenCL kernel pointer */
1021 void free_kernel(cl_kernel *kernel_ptr)
1022 {
1023     cl_int gmx_unused cl_error;
1024
1025     assert(NULL != kernel_ptr);
1026
1027     if (*kernel_ptr)
1028     {
1029         cl_error = clReleaseKernel(*kernel_ptr);
1030         assert(cl_error == CL_SUCCESS);
1031
1032         *kernel_ptr = NULL;
1033     }
1034 }
1035
1036 /*! \brief Releases a list of OpenCL kernel pointers */
1037 void free_kernels(cl_kernel *kernels, int count)
1038 {
1039     int i;
1040
1041     for (i = 0; i < count; i++)
1042     {
1043         free_kernel(kernels + i);
1044     }
1045 }
1046
1047 /*! \brief Free the OpenCL runtime data (context and program).
1048  *
1049  *  The function releases the OpenCL context and program assuciated with the
1050  *  device that the calling PP rank is running on.
1051  *
1052  *  \param runData [in]  porinter to the structure with runtime data.
1053  */
1054 static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
1055 {
1056     if (runData == NULL)
1057     {
1058         return;
1059     }
1060
1061     cl_int gmx_unused cl_error;
1062
1063     if (runData->context)
1064     {
1065         cl_error         = clReleaseContext(runData->context);
1066         runData->context = NULL;
1067         assert(CL_SUCCESS == cl_error);
1068     }
1069
1070     if (runData->program)
1071     {
1072         cl_error         = clReleaseProgram(runData->program);
1073         runData->program = NULL;
1074         assert(CL_SUCCESS == cl_error);
1075     }
1076
1077 }
1078
1079 //! This function is documented in the header file
1080 void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
1081 {
1082     int    kernel_count;
1083
1084     /* Free kernels */
1085     kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
1086     free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count);
1087
1088     kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
1089     free_kernels((cl_kernel*)nb->kernel_ener_prune_ptr, kernel_count);
1090
1091     kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
1092     free_kernels((cl_kernel*)nb->kernel_noener_noprune_ptr, kernel_count);
1093
1094     kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
1095     free_kernels((cl_kernel*)nb->kernel_noener_prune_ptr, kernel_count);
1096
1097     free_kernel(&(nb->kernel_memset_f));
1098     free_kernel(&(nb->kernel_memset_f2));
1099     free_kernel(&(nb->kernel_memset_f3));
1100     free_kernel(&(nb->kernel_zero_e_fshift));
1101
1102     /* Free atdat */
1103     free_ocl_buffer(&(nb->atdat->xq));
1104     free_ocl_buffer(&(nb->atdat->f));
1105     free_ocl_buffer(&(nb->atdat->e_lj));
1106     free_ocl_buffer(&(nb->atdat->e_el));
1107     free_ocl_buffer(&(nb->atdat->fshift));
1108     free_ocl_buffer(&(nb->atdat->lj_comb));
1109     free_ocl_buffer(&(nb->atdat->atom_types));
1110     free_ocl_buffer(&(nb->atdat->shift_vec));
1111     sfree(nb->atdat);
1112
1113     /* Free nbparam */
1114     free_ocl_buffer(&(nb->nbparam->nbfp_climg2d));
1115     free_ocl_buffer(&(nb->nbparam->nbfp_comb_climg2d));
1116     free_ocl_buffer(&(nb->nbparam->coulomb_tab_climg2d));
1117     sfree(nb->nbparam);
1118
1119     /* Free plist */
1120     free_ocl_buffer(&(nb->plist[eintLocal]->sci));
1121     free_ocl_buffer(&(nb->plist[eintLocal]->cj4));
1122     free_ocl_buffer(&(nb->plist[eintLocal]->excl));
1123     sfree(nb->plist[eintLocal]);
1124     if (nb->bUseTwoStreams)
1125     {
1126         free_ocl_buffer(&(nb->plist[eintNonlocal]->sci));
1127         free_ocl_buffer(&(nb->plist[eintNonlocal]->cj4));
1128         free_ocl_buffer(&(nb->plist[eintNonlocal]->excl));
1129         sfree(nb->plist[eintNonlocal]);
1130     }
1131
1132     /* Free nbst */
1133     ocl_pfree(nb->nbst.e_lj);
1134     nb->nbst.e_lj = NULL;
1135
1136     ocl_pfree(nb->nbst.e_el);
1137     nb->nbst.e_el = NULL;
1138
1139     ocl_pfree(nb->nbst.fshift);
1140     nb->nbst.fshift = NULL;
1141
1142     /* Free debug buffer */
1143     free_ocl_buffer(&nb->debug_buffer);
1144
1145     /* Free command queues */
1146     clReleaseCommandQueue(nb->stream[eintLocal]);
1147     nb->stream[eintLocal] = NULL;
1148     if (nb->bUseTwoStreams)
1149     {
1150         clReleaseCommandQueue(nb->stream[eintNonlocal]);
1151         nb->stream[eintNonlocal] = NULL;
1152     }
1153     /* Free other events */
1154     if (nb->nonlocal_done)
1155     {
1156         clReleaseEvent(nb->nonlocal_done);
1157         nb->nonlocal_done = NULL;
1158     }
1159     if (nb->misc_ops_and_local_H2D_done)
1160     {
1161         clReleaseEvent(nb->misc_ops_and_local_H2D_done);
1162         nb->misc_ops_and_local_H2D_done = NULL;
1163     }
1164
1165     free_gpu_device_runtime_data(nb->dev_rundata);
1166     sfree(nb->dev_rundata);
1167
1168     /* Free timers and timings */
1169     sfree(nb->timers);
1170     sfree(nb->timings);
1171     sfree(nb);
1172
1173     if (debug)
1174     {
1175         fprintf(debug, "Cleaned up OpenCL data structures.\n");
1176     }
1177 }
1178
1179
1180 //! This function is documented in the header file
1181 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
1182 {
1183     return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1184 }
1185
1186 //! This function is documented in the header file
1187 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1188 {
1189     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1190     {
1191         init_timings(nbv->gpu_nbv->timings);
1192     }
1193 }
1194
1195 //! This function is documented in the header file
1196 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
1197 {
1198     return nb != NULL ?
1199            gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
1200 }
1201
1202 //! This function is documented in the header file
1203 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
1204 {
1205     return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
1206             (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
1207 }