src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu_data_mgmt.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  40  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  41  *  \author Szilárd Páll <pall.szilard@gmail.com>
  42  */
  43 #include "gmxpre.h"
  44
  45 #include <assert.h>
  46 #include <math.h>
  47 #include <stdarg.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50 #include <string.h>
  51
  52 #include "gromacs/gpu_utils/gpu_utils.h"
  53 #include "gromacs/gpu_utils/oclutils.h"
  54 #include "gromacs/hardware/detecthardware.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/math/vectypes.h"
  57 #include "gromacs/mdlib/force_flags.h"
  58 #include "gromacs/mdlib/nb_verlet.h"
  59 #include "gromacs/mdlib/nbnxn_consts.h"
  60 #include "gromacs/mdlib/nbnxn_gpu.h"
  61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  62 #include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
  63 #include "gromacs/mdtypes/interaction_const.h"
  64 #include "gromacs/mdtypes/md_enums.h"
  65 #include "gromacs/pbcutil/ishift.h"
  66 #include "gromacs/timing/gpu_timing.h"
  67 #include "gromacs/utility/cstringutil.h"
  68 #include "gromacs/utility/fatalerror.h"
  69 #include "gromacs/utility/gmxassert.h"
  70 #include "gromacs/utility/real.h"
  71 #include "gromacs/utility/smalloc.h"
  72
  73 #include "nbnxn_ocl_internal.h"
  74 #include "nbnxn_ocl_types.h"
  75
  76 /*! \brief This parameter should be determined heuristically from the
  77  * kernel execution times
  78  *
  79  * This value is best for small systems on a single AMD Radeon R9 290X
  80  * (and about 5% faster than 40, which is the default for CUDA
  81  * devices). Larger simulation systems were quite insensitive to the
  82  * value of this parameter.
  83  */
  84 static unsigned int gpu_min_ci_balanced_factor = 50;
  85
  86
  87 /*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
  88  *
  89  * Full doc in nbnxn_ocl_internal.h */
  90 bool useLjCombRule(int vdwType)
  91 {
  92     return (vdwType == evdwOclCUTCOMBGEOM ||
  93             vdwType == evdwOclCUTCOMBLB);
  94 }
  95
  96 /*! \brief Free device buffers
  97  *
  98  * If the pointers to the size variables are NULL no resetting happens.
  99  */
 100 void ocl_free_buffered(cl_mem d_ptr, int *n, int *nalloc)
 101 {
 102     cl_int gmx_unused cl_error;
 103
 104     if (d_ptr)
 105     {
 106         cl_error = clReleaseMemObject(d_ptr);
 107         assert(cl_error == CL_SUCCESS);
 108         // TODO: handle errors
 109     }
 110
 111     if (n)
 112     {
 113         *n = -1;
 114     }
 115
 116     if (nalloc)
 117     {
 118         *nalloc = -1;
 119     }
 120 }
 121
 122 /*! \brief Reallocation device buffers
 123  *
 124  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 125  *  the location pointed by h_src host-side pointer is done. Allocation is
 126  *  buffered and therefore freeing is only needed if the previously allocated
 127  *  space is not enough.
 128  *  The H2D copy is launched in command queue s and can be done synchronously or
 129  *  asynchronously (the default is the latter).
 130  *  If copy_event is not NULL, on return it will contain an event object
 131  *  identifying the H2D copy. The event can further be used to queue a wait
 132  *  for this operation or to query profiling information.
 133  *  OpenCL equivalent of cu_realloc_buffered.
 134  */
 135 void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
 136                           size_t type_size,
 137                           int *curr_size, int *curr_alloc_size,
 138                           int req_size,
 139                           cl_context context,
 140                           cl_command_queue s,
 141                           bool bAsync = true,
 142                           cl_event *copy_event = NULL)
 143 {
 144     if (d_dest == NULL || req_size < 0)
 145     {
 146         return;
 147     }
 148
 149     /* reallocate only if the data does not fit = allocation size is smaller
 150        than the current requested size */
 151     if (req_size > *curr_alloc_size)
 152     {
 153         cl_int gmx_unused cl_error;
 154
 155         /* only free if the array has already been initialized */
 156         if (*curr_alloc_size >= 0)
 157         {
 158             ocl_free_buffered(*d_dest, curr_size, curr_alloc_size);
 159         }
 160
 161         *curr_alloc_size = over_alloc_large(req_size);
 162
 163         *d_dest = clCreateBuffer(context, CL_MEM_READ_WRITE, *curr_alloc_size * type_size, NULL, &cl_error);
 164         assert(cl_error == CL_SUCCESS);
 165         // TODO: handle errors, check clCreateBuffer flags
 166     }
 167
 168     /* size could have changed without actual reallocation */
 169     *curr_size = req_size;
 170
 171     /* upload to device */
 172     if (h_src)
 173     {
 174         if (bAsync)
 175         {
 176             ocl_copy_H2D_async(*d_dest, h_src, 0, *curr_size * type_size, s, copy_event);
 177         }
 178         else
 179         {
 180             ocl_copy_H2D(*d_dest, h_src,  0, *curr_size * type_size, s);
 181         }
 182     }
 183 }
 184
 185 /*! \brief Releases the input OpenCL buffer */
 186 static void free_ocl_buffer(cl_mem *buffer)
 187 {
 188     cl_int gmx_unused cl_error;
 189
 190     assert(NULL != buffer);
 191
 192     if (*buffer)
 193     {
 194         cl_error = clReleaseMemObject(*buffer);
 195         assert(CL_SUCCESS == cl_error);
 196         *buffer = NULL;
 197     }
 198 }
 199
 200 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
 201  * and the table GPU array.
 202  *
 203  * If called with an already allocated table, it just re-uploads the
 204  * table.
 205  */
 206 static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
 207                                            cl_nbparam_t                    *nbp,
 208                                            const gmx_device_runtime_data_t *runData)
 209 {
 210     cl_mem       coul_tab;
 211
 212     cl_int       cl_error;
 213
 214     if (nbp->coulomb_tab_climg2d != NULL)
 215     {
 216         free_ocl_buffer(&(nbp->coulomb_tab_climg2d));
 217     }
 218
 219     /* Switched from using textures to using buffers */
 220     // TODO: decide which alternative is most efficient - textures or buffers.
 221     /*
 222        cl_image_format array_format;
 223
 224        array_format.image_channel_data_type = CL_FLOAT;
 225        array_format.image_channel_order     = CL_R;
 226
 227        coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 228        &array_format, tabsize, 1, 0, ftmp, &cl_error);
 229      */
 230
 231     coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
 232     assert(cl_error == CL_SUCCESS);
 233     // TODO: handle errors, check clCreateBuffer flags
 234
 235     nbp->coulomb_tab_climg2d  = coul_tab;
 236     nbp->coulomb_tab_size     = ic->tabq_size;
 237     nbp->coulomb_tab_scale    = ic->tabq_scale;
 238 }
 239
 240
 241 /*! \brief Initializes the atomdata structure first time, it only gets filled at
 242     pair-search.
 243  */
 244 static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
 245 {
 246     cl_int cl_error;
 247
 248     ad->ntypes  = ntypes;
 249
 250     /* An element of the shift_vec device buffer has the same size as one element
 251        of the host side shift_vec buffer. */
 252     ad->shift_vec_elem_size = sizeof(*(((nbnxn_atomdata_t*)0)->shift_vec));
 253
 254     // TODO: handle errors, check clCreateBuffer flags
 255     ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, NULL, &cl_error);
 256     assert(cl_error == CL_SUCCESS);
 257     ad->bShiftVecUploaded = false;
 258
 259     /* An element of the fshift device buffer has the same size as one element
 260        of the host side fshift buffer. */
 261     ad->fshift_elem_size = sizeof(*(((cl_nb_staging_t*)0)->fshift));
 262
 263     ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->fshift_elem_size, NULL, &cl_error);
 264     assert(cl_error == CL_SUCCESS);
 265     // TODO: handle errors, check clCreateBuffer flags
 266
 267     ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
 268     assert(cl_error == CL_SUCCESS);
 269     // TODO: handle errors, check clCreateBuffer flags
 270
 271     ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
 272     assert(cl_error == CL_SUCCESS);
 273     // TODO: handle errors, check clCreateBuffer flags
 274
 275     /* initialize to NULL pointers to data that is not allocated here and will
 276        need reallocation in nbnxn_gpu_init_atomdata */
 277     ad->xq = NULL;
 278     ad->f  = NULL;
 279
 280     /* size -1 indicates that the respective array hasn't been initialized yet */
 281     ad->natoms = -1;
 282     ad->nalloc = -1;
 283 }
 284
 285 /*! \brief Copies all parameters related to the cut-off from ic to nbp
 286  */
 287 static void set_cutoff_parameters(cl_nbparam_t              *nbp,
 288                                   const interaction_const_t *ic)
 289 {
 290     nbp->ewald_beta       = ic->ewaldcoeff_q;
 291     nbp->sh_ewald         = ic->sh_ewald;
 292     nbp->epsfac           = ic->epsfac;
 293     nbp->two_k_rf         = 2.0 * ic->k_rf;
 294     nbp->c_rf             = ic->c_rf;
 295     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
 296     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
 297     nbp->rlist_sq         = ic->rlist * ic->rlist;
 298
 299     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
 300     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
 301
 302     nbp->rvdw_switch      = ic->rvdw_switch;
 303     nbp->dispersion_shift = ic->dispersion_shift;
 304     nbp->repulsion_shift  = ic->repulsion_shift;
 305     nbp->vdw_switch       = ic->vdw_switch;
 306 }
 307
 308 /*! \brief Returns the kinds of electrostatics and Vdw OpenCL
 309  *  kernels that will be used.
 310  *
 311  * Respectively, these values are from enum eelOcl and enum
 312  * evdwOcl. */
 313 static void
 314 map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
 315                                             int                        combRule,
 316                                             int                       *gpu_eeltype,
 317                                             int                       *gpu_vdwtype)
 318 {
 319     if (ic->vdwtype == evdwCUT)
 320     {
 321         switch (ic->vdw_modifier)
 322         {
 323             case eintmodNONE:
 324             case eintmodPOTSHIFT:
 325                 switch (combRule)
 326                 {
 327                     case ljcrNONE:
 328                         *gpu_vdwtype = evdwOclCUT;
 329                         break;
 330                     case ljcrGEOM:
 331                         *gpu_vdwtype = evdwOclCUTCOMBGEOM;
 332                         break;
 333                     case ljcrLB:
 334                         *gpu_vdwtype = evdwOclCUTCOMBLB;
 335                         break;
 336                     default:
 337                         gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
 338                         break;
 339                 }
 340                 break;
 341             case eintmodFORCESWITCH:
 342                 *gpu_vdwtype = evdwOclFSWITCH;
 343                 break;
 344             case eintmodPOTSWITCH:
 345                 *gpu_vdwtype = evdwOclPSWITCH;
 346                 break;
 347             default:
 348                 gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
 349                 break;
 350         }
 351     }
 352     else if (ic->vdwtype == evdwPME)
 353     {
 354         if (ic->ljpme_comb_rule == ljcrGEOM)
 355         {
 356             *gpu_vdwtype = evdwOclEWALDGEOM;
 357         }
 358         else
 359         {
 360             *gpu_vdwtype = evdwOclEWALDLB;
 361         }
 362     }
 363     else
 364     {
 365         gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
 366     }
 367
 368     if (ic->eeltype == eelCUT)
 369     {
 370         *gpu_eeltype = eelOclCUT;
 371     }
 372     else if (EEL_RF(ic->eeltype))
 373     {
 374         *gpu_eeltype = eelOclRF;
 375     }
 376     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 377     {
 378         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 379         *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
 380     }
 381     else
 382     {
 383         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 384         gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
 385     }
 386 }
 387
 388 /*! \brief Initializes the nonbonded parameter data structure.
 389  */
 390 static void init_nbparam(cl_nbparam_t                    *nbp,
 391                          const interaction_const_t       *ic,
 392                          const nbnxn_atomdata_t          *nbat,
 393                          const gmx_device_runtime_data_t *runData)
 394 {
 395     int         ntypes, nnbfp, nnbfp_comb;
 396     cl_int      cl_error;
 397
 398
 399     ntypes = nbat->ntype;
 400
 401     set_cutoff_parameters(nbp, ic);
 402
 403     map_interaction_types_to_gpu_kernel_flavors(ic,
 404                                                 nbat->comb_rule,
 405                                                 &(nbp->eeltype),
 406                                                 &(nbp->vdwtype));
 407
 408     if (ic->vdwtype == evdwPME)
 409     {
 410         if (ic->ljpme_comb_rule == ljcrGEOM)
 411         {
 412             assert(nbat->comb_rule == ljcrGEOM);
 413         }
 414         else
 415         {
 416             assert(nbat->comb_rule == ljcrLB);
 417         }
 418     }
 419     /* generate table for PME */
 420     nbp->coulomb_tab_climg2d = NULL;
 421     if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
 422     {
 423         init_ewald_coulomb_force_table(ic, nbp, runData);
 424     }
 425     else
 426     // TODO: improvement needed.
 427     // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
 428     // don't accept NULL values for image2D parameters.
 429     {
 430         /* Switched from using textures to using buffers */
 431         // TODO: decide which alternative is most efficient - textures or buffers.
 432         /*
 433            cl_image_format array_format;
 434
 435            array_format.image_channel_data_type = CL_FLOAT;
 436            array_format.image_channel_order     = CL_R;
 437
 438            nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 439             &array_format, 1, 1, 0, NULL, &cl_error);
 440          */
 441
 442         nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
 443         // TODO: handle errors
 444     }
 445
 446     nnbfp      = 2*ntypes*ntypes;
 447     nnbfp_comb = 2*ntypes;
 448
 449     {
 450         /* Switched from using textures to using buffers */
 451         // TODO: decide which alternative is most efficient - textures or buffers.
 452         /*
 453            cl_image_format array_format;
 454
 455            array_format.image_channel_data_type = CL_FLOAT;
 456            array_format.image_channel_order     = CL_R;
 457
 458            nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 459             &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
 460          */
 461
 462         nbp->nbfp_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error);
 463         assert(cl_error == CL_SUCCESS);
 464         // TODO: handle errors
 465
 466         if (ic->vdwtype == evdwPME)
 467         {
 468             /* Switched from using textures to using buffers */
 469             // TODO: decide which alternative is most efficient - textures or buffers.
 470             /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 471                 &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
 472             nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error);
 473
 474
 475             assert(cl_error == CL_SUCCESS);
 476             // TODO: handle errors
 477         }
 478         else
 479         {
 480             // TODO: improvement needed.
 481             // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
 482             // don't accept NULL values for image2D parameters.
 483             /* Switched from using textures to using buffers */
 484             // TODO: decide which alternative is most efficient - textures or buffers.
 485             /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
 486                 &array_format, 1, 1, 0, NULL, &cl_error);*/
 487             nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
 488
 489
 490             assert(cl_error == CL_SUCCESS);
 491             // TODO: handle errors
 492         }
 493     }
 494 }
 495
 496 //! This function is documented in the header file
 497 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 498                                         const interaction_const_t   *ic)
 499 {
 500     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 501     {
 502         return;
 503     }
 504     gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
 505     cl_nbparam_t       *nbp = nb->nbparam;
 506
 507     set_cutoff_parameters(nbp, ic);
 508
 509     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
 510
 511     init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata);
 512 }
 513
 514 /*! \brief Initializes the pair list data structure.
 515  */
 516 static void init_plist(cl_plist_t *pl)
 517 {
 518     /* initialize to NULL pointers to data that is not allocated here and will
 519        need reallocation in nbnxn_gpu_init_pairlist */
 520     pl->sci     = NULL;
 521     pl->cj4     = NULL;
 522     pl->excl    = NULL;
 523
 524     /* size -1 indicates that the respective array hasn't been initialized yet */
 525     pl->na_c        = -1;
 526     pl->nsci        = -1;
 527     pl->sci_nalloc  = -1;
 528     pl->ncj4        = -1;
 529     pl->cj4_nalloc  = -1;
 530     pl->nexcl       = -1;
 531     pl->excl_nalloc = -1;
 532     pl->bDoPrune    = false;
 533 }
 534
 535 /*! \brief Initializes the timer data structure.
 536  */
 537 static void init_timers(cl_timers_t gmx_unused *t, bool gmx_unused bUseTwoStreams)
 538 {
 539     /* Nothing to initialize for OpenCL */
 540 }
 541
 542 /*! \brief Initializes the timings data structure.
 543  */
 544 static void init_timings(gmx_wallclock_gpu_t *t)
 545 {
 546     int i, j;
 547
 548     t->nb_h2d_t = 0.0;
 549     t->nb_d2h_t = 0.0;
 550     t->nb_c     = 0;
 551     t->pl_h2d_t = 0.0;
 552     t->pl_h2d_c = 0;
 553     for (i = 0; i < 2; i++)
 554     {
 555         for (j = 0; j < 2; j++)
 556         {
 557             t->ktime[i][j].t = 0.0;
 558             t->ktime[i][j].c = 0;
 559         }
 560     }
 561 }
 562
 563 /*! \brief Creates context for OpenCL GPU given by \p mygpu
 564  *
 565  * A fatal error results if creation fails.
 566  *
 567  * \param[inout] runtimeData runtime data including program and context
 568  * \param[in]    devInfo     device info struct
 569  * \param[in]    rank        MPI rank (for error reporting)
 570  */
 571 static void
 572 nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
 573                          const gmx_device_info_t   *devInfo,
 574                          int                        rank)
 575 {
 576     cl_context_properties     context_properties[3];
 577     cl_platform_id            platform_id;
 578     cl_device_id              device_id;
 579     cl_context                context;
 580     cl_int                    cl_error;
 581
 582     assert(runtimeData != NULL);
 583     assert(devInfo != NULL);
 584
 585     platform_id      = devInfo->ocl_gpu_id.ocl_platform_id;
 586     device_id        = devInfo->ocl_gpu_id.ocl_device_id;
 587
 588     context_properties[0] = CL_CONTEXT_PLATFORM;
 589     context_properties[1] = (cl_context_properties) platform_id;
 590     context_properties[2] = 0; /* Terminates the list of properties */
 591
 592     context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_error);
 593     if (CL_SUCCESS != cl_error)
 594     {
 595         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
 596                   rank,
 597                   devInfo->device_name,
 598                   cl_error, ocl_get_error_string(cl_error).c_str());
 599         return;
 600     }
 601
 602     runtimeData->context = context;
 603 }
 604
 605 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 606 static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
 607                                          const char      *kernel_name)
 608 {
 609     cl_kernel kernel;
 610     cl_int    cl_error;
 611
 612     kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
 613     if (CL_SUCCESS != cl_error)
 614     {
 615         gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
 616                   kernel_name,
 617                   nb->dev_info->device_name,
 618                   cl_error);
 619     }
 620
 621     return kernel;
 622 }
 623
 624 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
 625  */
 626 static void
 627 nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
 628 {
 629
 630     cl_int               cl_error;
 631     cl_atomdata_t *      adat     = nb->atdat;
 632     cl_command_queue     ls       = nb->stream[eintLocal];
 633
 634     size_t               local_work_size[3]   = {1, 1, 1};
 635     size_t               global_work_size[3]  = {1, 1, 1};
 636
 637     cl_int               shifts   = SHIFTS*3;
 638
 639     cl_int               arg_no;
 640
 641     cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
 642
 643     local_work_size[0]   = 64;
 644     // Round the total number of threads up from the array size
 645     global_work_size[0]  = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
 646
 647     arg_no    = 0;
 648     cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
 649     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 650     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
 651     cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
 652     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 653
 654     cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
 655     GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
 656 }
 657
 658 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 659 static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
 660 {
 661     /* Init to 0 main kernel arrays */
 662     /* They will be later on initialized in select_nbnxn_kernel */
 663     memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
 664     memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
 665     memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
 666     memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
 667
 668     /* Init auxiliary kernels */
 669     nb->kernel_memset_f      = nbnxn_gpu_create_kernel(nb, "memset_f");
 670     nb->kernel_memset_f2     = nbnxn_gpu_create_kernel(nb, "memset_f2");
 671     nb->kernel_memset_f3     = nbnxn_gpu_create_kernel(nb, "memset_f3");
 672     nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
 673 }
 674
 675 /*! \brief Initializes simulation constant data.
 676  *
 677  *  Initializes members of the atomdata and nbparam structs and
 678  *  clears e/fshift output buffers.
 679  */
 680 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 681                                  const interaction_const_t      *ic,
 682                                  const nonbonded_verlet_group_t *nbv_group)
 683 {
 684     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata);
 685     init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_rundata);
 686 }
 687
 688
 689 //! This function is documented in the header file
 690 void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
 691                     const gmx_gpu_info_t      *gpu_info,
 692                     const gmx_gpu_opt_t       *gpu_opt,
 693                     const interaction_const_t *ic,
 694                     nonbonded_verlet_group_t  *nbv_grp,
 695                     int                        my_gpu_index,
 696                     int                        rank,
 697                     gmx_bool                   bLocalAndNonlocal)
 698 {
 699     gmx_nbnxn_ocl_t            *nb;
 700     cl_int                      cl_error;
 701     cl_command_queue_properties queue_properties;
 702
 703     assert(gpu_info);
 704     assert(gpu_opt);
 705     assert(ic);
 706
 707     if (p_nb == NULL)
 708     {
 709         return;
 710     }
 711
 712     snew(nb, 1);
 713     snew(nb->atdat, 1);
 714     snew(nb->nbparam, 1);
 715     snew(nb->plist[eintLocal], 1);
 716     if (bLocalAndNonlocal)
 717     {
 718         snew(nb->plist[eintNonlocal], 1);
 719     }
 720
 721     nb->bUseTwoStreams = bLocalAndNonlocal;
 722
 723     snew(nb->timers, 1);
 724     snew(nb->timings, 1);
 725
 726     /* set device info, just point it to the right GPU among the detected ones */
 727     nb->dev_info = gpu_info->gpu_dev + gpu_opt->dev_use[my_gpu_index];
 728     snew(nb->dev_rundata, 1);
 729
 730     /* init to NULL the debug buffer */
 731     nb->debug_buffer = NULL;
 732
 733     /* init nbst */
 734     ocl_pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 735     ocl_pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 736     ocl_pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 737
 738     init_plist(nb->plist[eintLocal]);
 739
 740     /* OpenCL timing disabled if GMX_DISABLE_OCL_TIMING is defined. */
 741     nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL);
 742
 743     /* Create queues only after bDoTime has been initialized */
 744     if (nb->bDoTime)
 745     {
 746         queue_properties = CL_QUEUE_PROFILING_ENABLE;
 747     }
 748     else
 749     {
 750         queue_properties = 0;
 751     }
 752
 753     nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
 754
 755     /* local/non-local GPU streams */
 756     nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 757     if (CL_SUCCESS != cl_error)
 758     {
 759         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 760                   rank,
 761                   nb->dev_info->device_name,
 762                   cl_error);
 763         return;
 764     }
 765
 766     if (nb->bUseTwoStreams)
 767     {
 768         init_plist(nb->plist[eintNonlocal]);
 769
 770         nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
 771         if (CL_SUCCESS != cl_error)
 772         {
 773             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
 774                       rank,
 775                       nb->dev_info->device_name,
 776                       cl_error);
 777             return;
 778         }
 779     }
 780
 781     if (nb->bDoTime)
 782     {
 783         init_timers(nb->timers, nb->bUseTwoStreams);
 784         init_timings(nb->timings);
 785     }
 786
 787     nbnxn_ocl_init_const(nb, ic, nbv_grp);
 788
 789     /* Enable LJ param manual prefetch for AMD or if we request through env. var.
 790      * TODO: decide about NVIDIA
 791      */
 792     nb->bPrefetchLjParam =
 793         (getenv("GMX_OCL_DISABLE_I_PREFETCH") == NULL) &&
 794         ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != NULL));
 795
 796     /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
 797      * but sadly this is not supported in OpenCL (yet?). Consider adding it if
 798      * it becomes supported.
 799      */
 800     nbnxn_gpu_compile_kernels(nb);
 801     nbnxn_gpu_init_kernels(nb);
 802
 803     /* clear energy and shift force outputs */
 804     nbnxn_ocl_clear_e_fshift(nb);
 805
 806     *p_nb = nb;
 807
 808     if (debug)
 809     {
 810         fprintf(debug, "Initialized OpenCL data structures.\n");
 811     }
 812 }
 813
 814 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
 815  */
 816 static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
 817 {
 818     if (natoms_clear == 0)
 819     {
 820         return;
 821     }
 822
 823     cl_int               cl_error;
 824     cl_atomdata_t *      adat     = nb->atdat;
 825     cl_command_queue     ls       = nb->stream[eintLocal];
 826     cl_float             value    = 0.0f;
 827
 828     size_t               local_work_size[3]  = {1, 1, 1};
 829     size_t               global_work_size[3] = {1, 1, 1};
 830
 831     cl_int               arg_no;
 832
 833     cl_kernel            memset_f = nb->kernel_memset_f;
 834
 835     cl_uint              natoms_flat = natoms_clear * (sizeof(rvec)/sizeof(real));
 836
 837     local_work_size[0]  = 64;
 838     // Round the total number of threads up from the array size
 839     global_work_size[0] = ((natoms_flat + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
 840
 841
 842     arg_no    = 0;
 843     cl_error  = clSetKernelArg(memset_f, arg_no++, sizeof(cl_mem), &(adat->f));
 844     cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_float), &value);
 845     cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_uint), &natoms_flat);
 846     assert(cl_error == CL_SUCCESS);
 847
 848     cl_error = clEnqueueNDRangeKernel(ls, memset_f, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
 849     assert(cl_error == CL_SUCCESS);
 850 }
 851
 852 //! This function is documented in the header file
 853 void
 854 nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
 855                         int                flags)
 856 {
 857     nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
 858     /* clear shift force array and energies if the outputs were
 859        used in the current step */
 860     if (flags & GMX_FORCE_VIRIAL)
 861     {
 862         nbnxn_ocl_clear_e_fshift(nb);
 863     }
 864
 865     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
 866     cl_int gmx_unused cl_error;
 867     cl_error = clFlush(nb->stream[eintLocal]);
 868     assert(CL_SUCCESS == cl_error);
 869 }
 870
 871 //! This function is documented in the header file
 872 void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
 873                              const nbnxn_pairlist_t *h_plist,
 874                              int                     iloc)
 875 {
 876     char             sbuf[STRLEN];
 877     cl_command_queue stream     = nb->stream[iloc];
 878     cl_plist_t      *d_plist    = nb->plist[iloc];
 879
 880     if (d_plist->na_c < 0)
 881     {
 882         d_plist->na_c = h_plist->na_ci;
 883     }
 884     else
 885     {
 886         if (d_plist->na_c != h_plist->na_ci)
 887         {
 888             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 889                     d_plist->na_c, h_plist->na_ci);
 890             gmx_incons(sbuf);
 891         }
 892     }
 893
 894     ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
 895                          &d_plist->nsci, &d_plist->sci_nalloc,
 896                          h_plist->nsci,
 897                          nb->dev_rundata->context,
 898                          stream, true, &(nb->timers->pl_h2d_sci[iloc]));
 899
 900     ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
 901                          &d_plist->ncj4, &d_plist->cj4_nalloc,
 902                          h_plist->ncj4,
 903                          nb->dev_rundata->context,
 904                          stream, true, &(nb->timers->pl_h2d_cj4[iloc]));
 905
 906     ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
 907                          &d_plist->nexcl, &d_plist->excl_nalloc,
 908                          h_plist->nexcl,
 909                          nb->dev_rundata->context,
 910                          stream, true, &(nb->timers->pl_h2d_excl[iloc]));
 911
 912     /* need to prune the pair list during the next step */
 913     d_plist->bDoPrune = true;
 914 }
 915
 916 //! This function is documented in the header file
 917 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
 918                                const nbnxn_atomdata_t *nbatom)
 919 {
 920     cl_atomdata_t   *adat  = nb->atdat;
 921     cl_command_queue ls    = nb->stream[eintLocal];
 922
 923     /* only if we have a dynamic box */
 924     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 925     {
 926         ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0,
 927                            SHIFTS * adat->shift_vec_elem_size, ls, NULL);
 928         adat->bShiftVecUploaded = true;
 929     }
 930 }
 931
 932 //! This function is documented in the header file
 933 void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
 934                              const struct nbnxn_atomdata_t *nbat)
 935 {
 936     cl_int           cl_error;
 937     int              nalloc, natoms;
 938     bool             realloced;
 939     bool             bDoTime = nb->bDoTime;
 940     cl_timers_t     *timers  = nb->timers;
 941     cl_atomdata_t   *d_atdat = nb->atdat;
 942     cl_command_queue ls      = nb->stream[eintLocal];
 943
 944     natoms    = nbat->natoms;
 945     realloced = false;
 946
 947     /* need to reallocate if we have to copy more atoms than the amount of space
 948        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 949     if (natoms > d_atdat->nalloc)
 950     {
 951         nalloc = over_alloc_small(natoms);
 952
 953         /* free up first if the arrays have already been initialized */
 954         if (d_atdat->nalloc != -1)
 955         {
 956             ocl_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 957             ocl_free_buffered(d_atdat->xq, NULL, NULL);
 958             ocl_free_buffered(d_atdat->lj_comb, NULL, NULL);
 959             ocl_free_buffered(d_atdat->atom_types, NULL, NULL);
 960         }
 961
 962         d_atdat->f_elem_size = sizeof(rvec);
 963
 964         // TODO: handle errors, check clCreateBuffer flags
 965         d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, NULL, &cl_error);
 966         assert(CL_SUCCESS == cl_error);
 967
 968         // TODO: change the flag to read-only
 969         d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), NULL, &cl_error);
 970         assert(CL_SUCCESS == cl_error);
 971         // TODO: handle errors, check clCreateBuffer flags
 972
 973         if (useLjCombRule(nb->nbparam->vdwtype))
 974         {
 975             // TODO: change the flag to read-only
 976             d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float2), NULL, &cl_error);
 977             assert(CL_SUCCESS == cl_error);
 978             // TODO: handle errors, check clCreateBuffer flags
 979         }
 980         else
 981         {
 982             // TODO: change the flag to read-only
 983             d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), NULL, &cl_error);
 984             assert(CL_SUCCESS == cl_error);
 985             // TODO: handle errors, check clCreateBuffer flags
 986         }
 987
 988         d_atdat->nalloc = nalloc;
 989         realloced       = true;
 990     }
 991
 992     d_atdat->natoms       = natoms;
 993     d_atdat->natoms_local = nbat->natoms_local;
 994
 995     /* need to clear GPU f output if realloc happened */
 996     if (realloced)
 997     {
 998         nbnxn_ocl_clear_f(nb, nalloc);
 999     }
1000
1001     if (useLjCombRule(nb->nbparam->vdwtype))
1002     {
1003         ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0,
1004                            natoms*sizeof(cl_float2), ls, bDoTime ? &(timers->atdat) : NULL);
1005     }
1006     else
1007     {
1008         ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
1009                            natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL);
1010
1011     }
1012
1013     /* kick off the tasks enqueued above to ensure concurrency with the search */
1014     cl_error = clFlush(ls);
1015     assert(CL_SUCCESS == cl_error);
1016 }
1017
1018 /*! \brief Releases an OpenCL kernel pointer */
1019 void free_kernel(cl_kernel *kernel_ptr)
1020 {
1021     cl_int gmx_unused cl_error;
1022
1023     assert(NULL != kernel_ptr);
1024
1025     if (*kernel_ptr)
1026     {
1027         cl_error = clReleaseKernel(*kernel_ptr);
1028         assert(cl_error == CL_SUCCESS);
1029
1030         *kernel_ptr = NULL;
1031     }
1032 }
1033
1034 /*! \brief Releases a list of OpenCL kernel pointers */
1035 void free_kernels(cl_kernel *kernels, int count)
1036 {
1037     int i;
1038
1039     for (i = 0; i < count; i++)
1040     {
1041         free_kernel(kernels + i);
1042     }
1043 }
1044
1045 /*! \brief Free the OpenCL runtime data (context and program).
1046  *
1047  *  The function releases the OpenCL context and program assuciated with the
1048  *  device that the calling PP rank is running on.
1049  *
1050  *  \param runData [in]  porinter to the structure with runtime data.
1051  */
1052 static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
1053 {
1054     if (runData == NULL)
1055     {
1056         return;
1057     }
1058
1059     cl_int gmx_unused cl_error;
1060
1061     if (runData->context)
1062     {
1063         cl_error         = clReleaseContext(runData->context);
1064         runData->context = NULL;
1065         assert(CL_SUCCESS == cl_error);
1066     }
1067
1068     if (runData->program)
1069     {
1070         cl_error         = clReleaseProgram(runData->program);
1071         runData->program = NULL;
1072         assert(CL_SUCCESS == cl_error);
1073     }
1074
1075 }
1076
1077 //! This function is documented in the header file
1078 void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
1079 {
1080     int    kernel_count;
1081
1082     /* Free kernels */
1083     kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
1084     free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count);
1085
1086     kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
1087     free_kernels((cl_kernel*)nb->kernel_ener_prune_ptr, kernel_count);
1088
1089     kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
1090     free_kernels((cl_kernel*)nb->kernel_noener_noprune_ptr, kernel_count);
1091
1092     kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
1093     free_kernels((cl_kernel*)nb->kernel_noener_prune_ptr, kernel_count);
1094
1095     free_kernel(&(nb->kernel_memset_f));
1096     free_kernel(&(nb->kernel_memset_f2));
1097     free_kernel(&(nb->kernel_memset_f3));
1098     free_kernel(&(nb->kernel_zero_e_fshift));
1099
1100     /* Free atdat */
1101     free_ocl_buffer(&(nb->atdat->xq));
1102     free_ocl_buffer(&(nb->atdat->f));
1103     free_ocl_buffer(&(nb->atdat->e_lj));
1104     free_ocl_buffer(&(nb->atdat->e_el));
1105     free_ocl_buffer(&(nb->atdat->fshift));
1106     free_ocl_buffer(&(nb->atdat->lj_comb));
1107     free_ocl_buffer(&(nb->atdat->atom_types));
1108     free_ocl_buffer(&(nb->atdat->shift_vec));
1109     sfree(nb->atdat);
1110
1111     /* Free nbparam */
1112     free_ocl_buffer(&(nb->nbparam->nbfp_climg2d));
1113     free_ocl_buffer(&(nb->nbparam->nbfp_comb_climg2d));
1114     free_ocl_buffer(&(nb->nbparam->coulomb_tab_climg2d));
1115     sfree(nb->nbparam);
1116
1117     /* Free plist */
1118     free_ocl_buffer(&(nb->plist[eintLocal]->sci));
1119     free_ocl_buffer(&(nb->plist[eintLocal]->cj4));
1120     free_ocl_buffer(&(nb->plist[eintLocal]->excl));
1121     sfree(nb->plist[eintLocal]);
1122     if (nb->bUseTwoStreams)
1123     {
1124         free_ocl_buffer(&(nb->plist[eintNonlocal]->sci));
1125         free_ocl_buffer(&(nb->plist[eintNonlocal]->cj4));
1126         free_ocl_buffer(&(nb->plist[eintNonlocal]->excl));
1127         sfree(nb->plist[eintNonlocal]);
1128     }
1129
1130     /* Free nbst */
1131     ocl_pfree(nb->nbst.e_lj);
1132     nb->nbst.e_lj = NULL;
1133
1134     ocl_pfree(nb->nbst.e_el);
1135     nb->nbst.e_el = NULL;
1136
1137     ocl_pfree(nb->nbst.fshift);
1138     nb->nbst.fshift = NULL;
1139
1140     /* Free debug buffer */
1141     free_ocl_buffer(&nb->debug_buffer);
1142
1143     /* Free command queues */
1144     clReleaseCommandQueue(nb->stream[eintLocal]);
1145     nb->stream[eintLocal] = NULL;
1146     if (nb->bUseTwoStreams)
1147     {
1148         clReleaseCommandQueue(nb->stream[eintNonlocal]);
1149         nb->stream[eintNonlocal] = NULL;
1150     }
1151     /* Free other events */
1152     if (nb->nonlocal_done)
1153     {
1154         clReleaseEvent(nb->nonlocal_done);
1155         nb->nonlocal_done = NULL;
1156     }
1157     if (nb->misc_ops_and_local_H2D_done)
1158     {
1159         clReleaseEvent(nb->misc_ops_and_local_H2D_done);
1160         nb->misc_ops_and_local_H2D_done = NULL;
1161     }
1162
1163     free_gpu_device_runtime_data(nb->dev_rundata);
1164     sfree(nb->dev_rundata);
1165
1166     /* Free timers and timings */
1167     sfree(nb->timers);
1168     sfree(nb->timings);
1169     sfree(nb);
1170
1171     if (debug)
1172     {
1173         fprintf(debug, "Cleaned up OpenCL data structures.\n");
1174     }
1175 }
1176
1177
1178 //! This function is documented in the header file
1179 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
1180 {
1181     return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1182 }
1183
1184 //! This function is documented in the header file
1185 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1186 {
1187     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1188     {
1189         init_timings(nbv->gpu_nbv->timings);
1190     }
1191 }
1192
1193 //! This function is documented in the header file
1194 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
1195 {
1196     return nb != NULL ?
1197            gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
1198 }
1199
1200 //! This function is documented in the header file
1201 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
1202 {
1203     return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
1204             (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
1205 }