src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \ingroup module_mdlib
  42  */
  43 #include "gmxpre.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdlib.h>
  49
  50 #if defined(_MSVC)
  51 #include <limits>
  52 #endif
  53
  54 #include "gromacs/gpu_utils/oclutils.h"
  55 #include "gromacs/hardware/hw_info.h"
  56 #include "gromacs/mdlib/force_flags.h"
  57 #include "gromacs/mdlib/nb_verlet.h"
  58 #include "gromacs/mdlib/nbnxn_consts.h"
  59 #include "gromacs/mdlib/nbnxn_pairlist.h"
  60 #include "gromacs/timing/gpu_timing.h"
  61
  62 #ifdef TMPI_ATOMICS
  63 #include "thread_mpi/atomic.h"
  64 #endif
  65
  66 #include "gromacs/mdlib/nbnxn_gpu.h"
  67 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  68 #include "gromacs/pbcutil/ishift.h"
  69 #include "gromacs/utility/cstringutil.h"
  70 #include "gromacs/utility/fatalerror.h"
  71 #include "gromacs/utility/gmxassert.h"
  72
  73 #include "nbnxn_ocl_types.h"
  74
  75 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  76 #define USE_TEXOBJ
  77 #endif
  78
  79 /*! \brief Convenience defines */
  80 //@{
  81 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  82 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  83 //@}
  84
  85 /*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
  86 //@{
  87 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
  88 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
  89 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
  90 //@}
  91
  92 /* Uncomment this define to enable kernel debugging */
  93 //#define DEBUG_OCL
  94
  95 /*! \brief Specifies which kernel run to debug */
  96 #define DEBUG_RUN_STEP 2
  97
  98 /*! \brief Validates the input global work size parameter.
  99  */
 100 static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
 101 {
 102     cl_uint device_size_t_size_bits;
 103     cl_uint host_size_t_size_bits;
 104
 105     assert(dinfo);
 106
 107     /* Each component of a global_work_size must not exceed the range given by the
 108        sizeof(device size_t) for the device on which the kernel execution will
 109        be enqueued. See:
 110        https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 111      */
 112     device_size_t_size_bits = dinfo->adress_bits;
 113     host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
 114
 115     /* If sizeof(host size_t) <= sizeof(device size_t)
 116             => global_work_size components will always be valid
 117        else
 118             => get device limit for global work size and
 119             compare it against each component of global_work_size.
 120      */
 121     if (host_size_t_size_bits > device_size_t_size_bits)
 122     {
 123         size_t device_limit;
 124
 125         device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
 126
 127         for (int i = 0; i < work_dim; i++)
 128         {
 129             if (global_work_size[i] > device_limit)
 130             {
 131                 gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 132                           "The number of nonbonded work units (=number of super-clusters) exceeds the"
 133                           "device capabilities. Global work size limit exceeded (%d > %d)!",
 134                           global_work_size[i], device_limit);
 135             }
 136         }
 137     }
 138 }
 139
 140 /* Constant arrays listing non-bonded kernel function names. The arrays are
 141  * organized in 2-dim arrays by: electrostatics and VDW type.
 142  *
 143  *  Note that the row- and column-order of function pointers has to match the
 144  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 145  *  defined in nbnxn_cuda_types.h.
 146  */
 147
 148 /*! \brief Force-only kernel function names. */
 149 static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
 150 {
 151     { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
 152     { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
 153     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
 154     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
 155     { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
 156     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
 157 };
 158
 159 /*! \brief Force + energy kernel function pointers. */
 160 static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
 161 {
 162     { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"              },
 163     { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"               },
 164     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"          },
 165     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl"     },
 166     { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"               },
 167     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"        }
 168 };
 169
 170 /*! \brief Force + pruning kernel function pointers. */
 171 static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
 172 {
 173     { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"            },
 174     { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"             },
 175     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",         "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"        },
 176     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",  "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
 177     { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"             },
 178     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",       "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"      }
 179 };
 180
 181 /*! \brief Force + energy + pruning kernel function pointers. */
 182 static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
 183 {
 184     { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
 185     { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
 186     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
 187     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
 188     { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
 189     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
 190 };
 191
 192 /*! \brief Return a pointer to the kernel version to be executed at the current step.
 193  *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 194  *  found in the cache, it will be created and the cache will be updated.
 195  */
 196 static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
 197                                             int                eeltype,
 198                                             int                evdwtype,
 199                                             bool               bDoEne,
 200                                             bool               bDoPrune)
 201 {
 202     const char* kernel_name_to_run;
 203     cl_kernel  *kernel_ptr;
 204     cl_int      cl_error;
 205
 206     assert(eeltype < eelOclNR);
 207     assert(evdwtype < eelOclNR);
 208
 209     if (bDoEne)
 210     {
 211         if (bDoPrune)
 212         {
 213             kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 214             kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
 215         }
 216         else
 217         {
 218             kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 219             kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
 220         }
 221     }
 222     else
 223     {
 224         if (bDoPrune)
 225         {
 226             kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 227             kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
 228         }
 229         else
 230         {
 231             kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 232             kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
 233         }
 234     }
 235
 236     if (NULL == kernel_ptr[0])
 237     {
 238         *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
 239         assert(cl_error == CL_SUCCESS);
 240     }
 241     // TODO: handle errors
 242
 243     return *kernel_ptr;
 244 }
 245
 246 /*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
 247  */
 248 static inline int calc_shmem_required()
 249 {
 250     int shmem;
 251
 252     /* size of shmem (force-buffers/xq/atom type preloading) */
 253     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 254     /* i-atom x+q in shared memory */
 255     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
 256     /* cj in shared memory, for both warps separately */
 257     shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);       /* cjs  */
 258 #ifdef IATYPE_SHMEM
 259     /* FIXME: this should not be compile-time decided but rather at runtime.
 260      * This issue propagated from the CUDA code where due to the source to source
 261      * compilation there was confusion the way to set up arch-dependent launch parameters.
 262      * Here too this should be converted to a hardware/arch/generation dependent
 263      * conditional when re-evaluating the need for i atom type preloading.
 264      */
 265     /* i-atom types in shared memory */
 266     #pragma error "Should not be defined"
 267     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);       /* atib */
 268 #endif
 269     /* force reduction buffers in shared memory */
 270     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
 271     /* Warp vote. In fact it must be * number of warps in block.. */
 272     shmem += sizeof(cl_uint) * 2;                   /* warp_any */
 273     return shmem;
 274 }
 275
 276 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 277  *
 278  *  The device can't use the same data structures as the host for two main reasons:
 279  *  - OpenCL restrictions (pointers are not accepted inside data structures)
 280  *  - some host side fields are not needed for the OpenCL kernels.
 281  */
 282 static void fillin_ocl_structures(cl_nbparam_t        *nbp,
 283                                   cl_nbparam_params_t *nbparams_params)
 284 {
 285     nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
 286     nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
 287     nbparams_params->c_rf              = nbp->c_rf;
 288     nbparams_params->dispersion_shift  = nbp->dispersion_shift;
 289     nbparams_params->eeltype           = nbp->eeltype;
 290     nbparams_params->epsfac            = nbp->epsfac;
 291     nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
 292     nbparams_params->ewald_beta        = nbp->ewald_beta;
 293     nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
 294     nbparams_params->repulsion_shift   = nbp->repulsion_shift;
 295     nbparams_params->rlist_sq          = nbp->rlist_sq;
 296     nbparams_params->rvdw_sq           = nbp->rvdw_sq;
 297     nbparams_params->rvdw_switch       = nbp->rvdw_switch;
 298     nbparams_params->sh_ewald          = nbp->sh_ewald;
 299     nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
 300     nbparams_params->two_k_rf          = nbp->two_k_rf;
 301     nbparams_params->vdwtype           = nbp->vdwtype;
 302     nbparams_params->vdw_switch        = nbp->vdw_switch;
 303 }
 304
 305 /*! \brief Waits for the commands associated with the input event to finish.
 306  * Then it releases the event and sets it to 0.
 307  * Don't use this function when more than one wait will be issued for the event.
 308  */
 309 void wait_ocl_event(cl_event *ocl_event)
 310 {
 311     cl_int gmx_unused cl_error;
 312
 313     /* Blocking wait for the event */
 314     cl_error = clWaitForEvents(1, ocl_event);
 315     assert(CL_SUCCESS == cl_error);
 316
 317     /* Release event and reset it to 0 */
 318     cl_error = clReleaseEvent(*ocl_event);
 319     assert(CL_SUCCESS == cl_error);
 320     *ocl_event = 0;
 321 }
 322
 323 /*! \brief Enqueues a wait for event completion.
 324  *
 325  * Then it releases the event and sets it to 0.
 326  * Don't use this function when more than one wait will be issued for the event.
 327  * Equivalent to Cuda Stream Sync. */
 328 void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 329 {
 330     cl_int gmx_unused cl_error;
 331
 332     /* Enqueue wait */
 333 #ifdef CL_VERSION_1_2
 334     cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
 335 #else
 336     cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
 337 #endif
 338
 339     GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error));
 340
 341     /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
 342     cl_error = clReleaseEvent(*ocl_event);
 343     assert(CL_SUCCESS == cl_error);
 344     *ocl_event = 0;
 345 }
 346
 347 /*! \brief Returns the duration in milliseconds for the command associated with the event.
 348  *
 349  * It then releases the event and sets it to 0.
 350  * Before calling this function, make sure the command has finished either by
 351  * calling clFinish or clWaitForEvents.
 352  * The function returns 0.0 if the input event, *ocl_event, is 0.
 353  * Don't use this function when more than one wait will be issued for the event.
 354  */
 355 double ocl_event_elapsed_ms(cl_event *ocl_event)
 356 {
 357     cl_int gmx_unused cl_error;
 358     cl_ulong          start_ns, end_ns;
 359     double            elapsed_ms;
 360
 361     elapsed_ms = 0.0;
 362     assert(NULL != ocl_event);
 363
 364     if (*ocl_event)
 365     {
 366         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
 367                                            sizeof(cl_ulong), &start_ns, NULL);
 368         assert(CL_SUCCESS == cl_error);
 369
 370         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
 371                                            sizeof(cl_ulong), &end_ns, NULL);
 372         assert(CL_SUCCESS == cl_error);
 373
 374         clReleaseEvent(*ocl_event);
 375         *ocl_event = 0;
 376
 377         elapsed_ms = (end_ns - start_ns) / 1000000.0;
 378     }
 379
 380     return elapsed_ms;
 381 }
 382
 383 /*! \brief Launch GPU kernel
 384
 385    As we execute nonbonded workload in separate queues, before launching
 386    the kernel we need to make sure that he following operations have completed:
 387    - atomdata allocation and related H2D transfers (every nstlist step);
 388    - pair list H2D transfer (every nstlist step);
 389    - shift vector H2D transfer (every nstlist step);
 390    - force (+shift force and energy) output clearing (every step).
 391
 392    These operations are issued in the local queue at the beginning of the step
 393    and therefore always complete before the local kernel launch. The non-local
 394    kernel is launched after the local on the same device/context, so this is
 395    inherently scheduled after the operations in the local stream (including the
 396    above "misc_ops").
 397    However, for the sake of having a future-proof implementation, we use the
 398    misc_ops_done event to record the point in time when the above  operations
 399    are finished and synchronize with this event in the non-local stream.
 400  */
 401 void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
 402                              const struct nbnxn_atomdata_t *nbatom,
 403                              int                            flags,
 404                              int                            iloc)
 405 {
 406     cl_int               cl_error;
 407     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 408     /* OpenCL kernel launch-related stuff */
 409     int                  shmem;
 410     size_t               local_work_size[3], global_work_size[3];
 411     cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 412
 413     cl_atomdata_t       *adat    = nb->atdat;
 414     cl_nbparam_t        *nbp     = nb->nbparam;
 415     cl_plist_t          *plist   = nb->plist[iloc];
 416     cl_timers_t         *t       = nb->timers;
 417     cl_command_queue     stream  = nb->stream[iloc];
 418
 419     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 420     int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
 421     bool                 bDoTime     = nb->bDoTime;
 422     cl_uint              arg_no;
 423
 424     cl_nbparam_params_t  nbparams_params;
 425 #ifdef DEBUG_OCL
 426     float              * debug_buffer_h;
 427     size_t               debug_buffer_size;
 428 #endif
 429
 430     /* turn energy calculation always on/off (for debugging/testing only) */
 431     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 432
 433     /* Don't launch the non-local kernel if there is no work to do.
 434        Doing the same for the local kernel is more complicated, since the
 435        local part of the force array also depends on the non-local kernel.
 436        So to avoid complicating the code and to reduce the risk of bugs,
 437        we always call the local kernel, the local x+q copy and later (not in
 438        this function) the stream wait, local f copyback and the f buffer
 439        clearing. All these operations, except for the local interaction kernel,
 440        are needed for the non-local interactions. The skip of the local kernel
 441        call is taken care of later in this function. */
 442     if (iloc == eintNonlocal && plist->nsci == 0)
 443     {
 444         return;
 445     }
 446
 447     /* calculate the atom data index range based on locality */
 448     if (LOCAL_I(iloc))
 449     {
 450         adat_begin  = 0;
 451         adat_len    = adat->natoms_local;
 452     }
 453     else
 454     {
 455         adat_begin  = adat->natoms_local;
 456         adat_len    = adat->natoms - adat->natoms_local;
 457     }
 458
 459     /* beginning of timed HtoD section */
 460
 461     /* HtoD x, q */
 462     ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
 463                        adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
 464
 465     /* When we get here all misc operations issues in the local stream as well as
 466        the local xq H2D are done,
 467        so we record that in the local stream and wait for it in the nonlocal one. */
 468     if (nb->bUseTwoStreams)
 469     {
 470         if (iloc == eintLocal)
 471         {
 472 #ifdef CL_VERSION_1_2
 473             cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
 474 #else
 475             cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
 476 #endif
 477             assert(CL_SUCCESS == cl_error);
 478
 479             /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
 480              * in the local stream in order to be able to sync with the above event
 481              * from the non-local stream.
 482              */
 483             cl_error = clFlush(stream);
 484             assert(CL_SUCCESS == cl_error);
 485         }
 486         else
 487         {
 488             sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
 489         }
 490     }
 491
 492     if (plist->nsci == 0)
 493     {
 494         /* Don't launch an empty local kernel (is not allowed with OpenCL).
 495          * TODO: Separate H2D and kernel launch into separate functions.
 496          */
 497         return;
 498     }
 499
 500     /* beginning of timed nonbonded calculation section */
 501
 502     /* get the pointer to the kernel flavor we need to use */
 503     nb_kernel = select_nbnxn_kernel(nb,
 504                                     nbp->eeltype,
 505                                     nbp->vdwtype,
 506                                     bCalcEner,
 507                                     plist->bDoPrune || always_prune);
 508
 509     /* kernel launch config */
 510     local_work_size[0] = CL_SIZE;
 511     local_work_size[1] = CL_SIZE;
 512     local_work_size[2] = 1;
 513
 514     global_work_size[0] = plist->nsci * local_work_size[0];
 515     global_work_size[1] = 1 * local_work_size[1];
 516     global_work_size[2] = 1 * local_work_size[2];
 517
 518     validate_global_work_size(global_work_size, 3, nb->dev_info);
 519
 520     shmem     = calc_shmem_required();
 521
 522 #ifdef DEBUG_OCL
 523     {
 524         static int run_step = 1;
 525
 526         if (DEBUG_RUN_STEP == run_step)
 527         {
 528             debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
 529             debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
 530             assert(NULL != debug_buffer_h);
 531
 532             if (NULL == nb->debug_buffer)
 533             {
 534                 nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 535                                                   debug_buffer_size, debug_buffer_h, &cl_error);
 536
 537                 assert(CL_SUCCESS == cl_error);
 538             }
 539         }
 540
 541         run_step++;
 542     }
 543 #endif
 544     if (debug)
 545     {
 546         fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
 547                 "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 548                 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
 549                 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
 550                 NCL_PER_SUPERCL, plist->na_c);
 551     }
 552
 553     fillin_ocl_structures(nbp, &nbparams_params);
 554
 555     arg_no    = 0;
 556     cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
 557     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
 558     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
 559     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
 560     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 561     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
 562     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
 563     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
 564     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
 565     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
 566     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
 567     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
 568     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
 569     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
 570     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
 571     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
 572     cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
 573     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
 574
 575     assert(cl_error == CL_SUCCESS);
 576
 577     if (cl_error)
 578     {
 579         printf("ClERROR! %d\n", cl_error);
 580     }
 581     cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
 582     assert(cl_error == CL_SUCCESS);
 583
 584 #ifdef DEBUG_OCL
 585     {
 586         static int run_step = 1;
 587
 588         if (DEBUG_RUN_STEP == run_step)
 589         {
 590             FILE *pf;
 591             char  file_name[256] = {0};
 592
 593             ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
 594                                debug_buffer_size, stream, NULL);
 595
 596             // Make sure all data has been transfered back from device
 597             clFinish(stream);
 598
 599             printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
 600
 601             sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
 602             pf = fopen(file_name, "wt");
 603             assert(pf != NULL);
 604
 605             fprintf(pf, "%20s", "");
 606             for (int j = 0; j < global_work_size[0]; j++)
 607             {
 608                 char label[20];
 609                 sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
 610                 fprintf(pf, "%20s", label);
 611             }
 612
 613             for (int i = 0; i < global_work_size[1]; i++)
 614             {
 615                 char label[20];
 616                 sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
 617                 fprintf(pf, "\n%20s", label);
 618
 619                 for (int j = 0; j < global_work_size[0]; j++)
 620                 {
 621                     fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
 622                 }
 623
 624                 //fprintf(pf, "\n");
 625             }
 626
 627             fclose(pf);
 628
 629             printf(" done.\n");
 630
 631
 632             free(debug_buffer_h);
 633             debug_buffer_h = NULL;
 634         }
 635
 636         run_step++;
 637     }
 638 #endif
 639 }
 640
 641 /*! \brief Debugging helper function */
 642 void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
 643 {
 644     FILE *pf;
 645
 646     pf = fopen(out_file, "wt");
 647     assert(pf != NULL);
 648
 649     fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
 650             "cj[0]", "cj[1]", "cj[2]", "cj[3]",
 651             "imei[0].excl_ind", "imei[0].imask",
 652             "imei[1].excl_ind", "imei[1].imask");
 653
 654     for (int index = 0; index < cnt; index++)
 655     {
 656         fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
 657                 results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
 658                 results[index].imei[0].excl_ind, results[index].imei[0].imask,
 659                 results[index].imei[1].excl_ind, results[index].imei[1].imask);
 660     }
 661
 662     fclose(pf);
 663
 664     printf("\nWrote results to %s", out_file);
 665
 666     pf = fopen(ref_file, "rt");
 667     if (pf)
 668     {
 669         char c;
 670         int  diff = 0;
 671         printf("\n%s file found. Comparing results...", ref_file);
 672
 673         /* Skip the first line */
 674         c = 0;
 675         while (c != '\n')
 676         {
 677             if (1 != fscanf(pf, "%c", &c))
 678             {
 679                 break;
 680             }
 681         }
 682
 683         for (int index = 0; index < cnt; index++)
 684         {
 685             int          ref_val;
 686             unsigned int u_ref_val;
 687
 688             for (int j = 0; j < 4; j++)
 689             {
 690                 if (1 != fscanf(pf, "%20d", &ref_val))
 691                 {
 692                     break;
 693                 }
 694
 695                 if (ref_val != results[index].cj[j])
 696                 {
 697                     printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
 698                            j, index, results[index].cj[j], ref_val);
 699
 700                     diff++;
 701                 }
 702             }
 703
 704             for (int j = 0; j < 2; j++)
 705             {
 706                 if (1 != fscanf(pf, "%20d", &ref_val))
 707                 {
 708                     break;
 709                 }
 710
 711                 if (ref_val != results[index].imei[j].excl_ind)
 712                 {
 713                     printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
 714                            j, index, results[index].imei[j].excl_ind, ref_val);
 715
 716                     diff++;
 717                 }
 718
 719                 if (1 != fscanf(pf, "%20u", &u_ref_val))
 720                 {
 721                     break;
 722                 }
 723
 724                 if (u_ref_val != results[index].imei[j].imask)
 725                 {
 726                     printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
 727                            j, index, results[index].imei[j].imask, u_ref_val);
 728
 729                     diff++;
 730                 }
 731
 732             }
 733         }
 734
 735         printf("\nFinished comparing results. Total number of differences: %d", diff);
 736         fclose(pf);
 737     }
 738     else
 739     {
 740         printf("\n%s file not found. No comparison performed.", ref_file);
 741     }
 742 }
 743
 744 /*! \brief Debugging helper function */
 745 void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
 746 {
 747     FILE *pf;
 748     float cmp_eps = 0.001f;
 749
 750     pf = fopen(out_file, "wt");
 751     assert(pf != NULL);
 752
 753     for (int index = 0; index < cnt; index++)
 754     {
 755         fprintf(pf, "%15.5f\n", results[index]);
 756     }
 757
 758     fclose(pf);
 759
 760     printf("\nWrote results to %s", out_file);
 761
 762     pf = fopen(ref_file, "rt");
 763     if (pf)
 764     {
 765         int diff = 0;
 766         printf("\n%s file found. Comparing results...", ref_file);
 767         for (int index = 0; index < cnt; index++)
 768         {
 769             float ref_val;
 770             if (1 != fscanf(pf, "%20f", &ref_val))
 771             {
 772                 break;
 773             }
 774
 775             if (((ref_val - results[index]) > cmp_eps) ||
 776                 ((ref_val - results[index]) < -cmp_eps))
 777             {
 778                 printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
 779                        index, results[index], ref_val);
 780
 781                 diff++;
 782             }
 783         }
 784
 785         printf("\nFinished comparing results. Total number of differences: %d", diff);
 786         fclose(pf);
 787     }
 788     else
 789     {
 790         printf("\n%s file not found. No comparison performed.", ref_file);
 791     }
 792 }
 793
 794 /*! \brief
 795  * Debug function for dumping cj4, f and fshift buffers.
 796  * By default this function does nothing. To enable debugging for any of these
 797  * buffers, uncomment the corresponding definition inside the function:
 798  * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
 799  */
 800 static
 801 void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t               gmx_unused *nb,
 802                              const struct nbnxn_atomdata_t gmx_unused *nbatom,
 803                              cl_command_queue              gmx_unused  stream,
 804                              int                           gmx_unused  adat_begin,
 805                              int                           gmx_unused  adat_len)
 806 {
 807 /* Uncomment this define to enable cj4 debugging for the first kernel run */
 808 //#define DEBUG_DUMP_CJ4_OCL
 809 #ifdef DEBUG_DUMP_CJ4_OCL
 810     {
 811         static int run_step = 1;
 812
 813         if (DEBUG_RUN_STEP == run_step)
 814         {
 815             nbnxn_cj4_t *temp_cj4;
 816             int          cnt;
 817             size_t       size;
 818             char         ocl_file_name[256]  = {0};
 819             char         cuda_file_name[256] = {0};
 820
 821             cnt      = nb->plist[0]->ncj4;
 822             size     = cnt * sizeof(nbnxn_cj4_t);
 823             temp_cj4 = (nbnxn_cj4_t*)malloc(size);
 824
 825             ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
 826                                size, stream, NULL);
 827
 828             // Make sure all data has been transfered back from device
 829             clFinish(stream);
 830
 831             sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
 832             sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
 833             dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
 834
 835             free(temp_cj4);
 836         }
 837
 838         run_step++;
 839     }
 840 #endif
 841
 842 /* Uncomment this define to enable f debugging for the first kernel run */
 843 //#define DEBUG_DUMP_F_OCL
 844 #ifdef DEBUG_DUMP_F_OCL
 845     {
 846         static int run_step = 1;
 847
 848         if (DEBUG_RUN_STEP == run_step)
 849         {
 850             char ocl_file_name[256]  = {0};
 851             char cuda_file_name[256] = {0};
 852
 853             // Make sure all data has been transfered back from device
 854             clFinish(stream);
 855
 856             sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
 857             sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
 858
 859             dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
 860                                    ocl_file_name, cuda_file_name);
 861         }
 862
 863         run_step++;
 864     }
 865 #endif
 866
 867 /* Uncomment this define to enable fshift debugging for the first kernel run */
 868 //#define DEBUG_DUMP_FSHIFT_OCL
 869 #ifdef DEBUG_DUMP_FSHIFT_OCL
 870     {
 871         static int run_step = 1;
 872
 873         if (DEBUG_RUN_STEP == run_step)
 874         {
 875             char ocl_file_name[256]  = {0};
 876             char cuda_file_name[256] = {0};
 877
 878             // Make sure all data has been transfered back from device
 879             clFinish(stream);
 880
 881             sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
 882             sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
 883
 884             dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
 885                                    ocl_file_name, cuda_file_name);
 886         }
 887
 888         run_step++;
 889     }
 890 #endif
 891 }
 892
 893 /*! \brief
 894  * Launch asynchronously the download of nonbonded forces from the GPU
 895  * (and energies/shift forces if required).
 896  */
 897 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
 898                               const struct nbnxn_atomdata_t *nbatom,
 899                               int                            flags,
 900                               int                            aloc)
 901 {
 902     cl_int gmx_unused cl_error;
 903     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 904     int               iloc = -1;
 905
 906     /* determine interaction locality from atom locality */
 907     if (LOCAL_A(aloc))
 908     {
 909         iloc = eintLocal;
 910     }
 911     else if (NONLOCAL_A(aloc))
 912     {
 913         iloc = eintNonlocal;
 914     }
 915     else
 916     {
 917         char stmp[STRLEN];
 918         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 919                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 920
 921         gmx_incons(stmp);
 922     }
 923
 924     cl_atomdata_t   *adat    = nb->atdat;
 925     cl_timers_t     *t       = nb->timers;
 926     bool             bDoTime = nb->bDoTime;
 927     cl_command_queue stream  = nb->stream[iloc];
 928
 929     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 930     int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
 931
 932
 933     /* don't launch non-local copy-back if there was no non-local work to do */
 934     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 935     {
 936         /* TODO An alternative way to signal that non-local work is
 937            complete is to use a clEnqueueMarker+clEnqueueBarrier
 938            pair. However, the use of bNonLocalStreamActive has the
 939            advantage of being local to the host, so probably minimizes
 940            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
 941            test case, overall simulation performance was higher with
 942            the API calls, but this has not been tested on AMD OpenCL,
 943            so could be worth considering in future. */
 944         nb->bNonLocalStreamActive = false;
 945         return;
 946     }
 947
 948     /* calculate the atom data index range based on locality */
 949     if (LOCAL_A(aloc))
 950     {
 951         adat_begin  = 0;
 952         adat_len    = adat->natoms_local;
 953     }
 954     else
 955     {
 956         adat_begin  = adat->natoms_local;
 957         adat_len    = adat->natoms - adat->natoms_local;
 958     }
 959
 960     /* beginning of timed D2H section */
 961
 962     /* With DD the local D2H transfer can only start after the non-local
 963        has been launched. */
 964     if (iloc == eintLocal && nb->bNonLocalStreamActive)
 965     {
 966         sync_ocl_event(stream, &(nb->nonlocal_done));
 967     }
 968
 969     /* DtoH f */
 970     ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
 971                        (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
 972
 973     /* kick off work */
 974     cl_error = clFlush(stream);
 975     assert(CL_SUCCESS == cl_error);
 976
 977     /* After the non-local D2H is launched the nonlocal_done event can be
 978        recorded which signals that the local D2H can proceed. This event is not
 979        placed after the non-local kernel because we first need the non-local
 980        data back first. */
 981     if (iloc == eintNonlocal)
 982     {
 983 #ifdef CL_VERSION_1_2
 984         cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
 985 #else
 986         cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
 987 #endif
 988         assert(CL_SUCCESS == cl_error);
 989         nb->bNonLocalStreamActive = true;
 990     }
 991
 992     /* only transfer energies in the local stream */
 993     if (LOCAL_I(iloc))
 994     {
 995         /* DtoH fshift */
 996         if (bCalcFshift)
 997         {
 998             ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
 999                                SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
1000         }
1001
1002         /* DtoH energies */
1003         if (bCalcEner)
1004         {
1005             ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
1006                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
1007
1008             ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
1009                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
1010         }
1011     }
1012
1013     debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
1014 }
1015
1016 /*! \brief
1017  * Wait for the asynchronously launched nonbonded calculations and data
1018  * transfers to finish.
1019  */
1020 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
1021                             int flags, int aloc,
1022                             real *e_lj, real *e_el, rvec *fshift)
1023 {
1024     /* NOTE:  only implemented for single-precision at this time */
1025     cl_int gmx_unused      cl_error;
1026     int                    i, iloc = -1;
1027
1028     /* determine interaction locality from atom locality */
1029     if (LOCAL_A(aloc))
1030     {
1031         iloc = eintLocal;
1032     }
1033     else if (NONLOCAL_A(aloc))
1034     {
1035         iloc = eintNonlocal;
1036     }
1037     else
1038     {
1039         char stmp[STRLEN];
1040         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
1041                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
1042         gmx_incons(stmp);
1043     }
1044
1045     cl_plist_t                 *plist    = nb->plist[iloc];
1046     cl_timers_t                *timers   = nb->timers;
1047     struct gmx_wallclock_gpu_t *timings  = nb->timings;
1048     cl_nb_staging               nbst     = nb->nbst;
1049
1050     bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
1051     int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;
1052
1053     /* turn energy calculation always on/off (for debugging/testing only) */
1054     bCalcEner = (bCalcEner || always_ener) && !never_ener;
1055
1056     /* Launch wait/update timers & counters, unless doing the non-local phase
1057        when there is not actually work to do. This is consistent with
1058        nbnxn_gpu_launch_kernel.
1059
1060        NOTE: if timing with multiple GPUs (streams) becomes possible, the
1061        counters could end up being inconsistent due to not being incremented
1062        on some of the nodes! */
1063     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
1064     {
1065         return;
1066     }
1067
1068     /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */
1069     cl_error = clFinish(nb->stream[iloc]);
1070     assert(CL_SUCCESS == cl_error);
1071
1072     /* timing data accumulation */
1073     if (nb->bDoTime)
1074     {
1075         /* only increase counter once (at local F wait) */
1076         if (LOCAL_I(iloc))
1077         {
1078             timings->nb_c++;
1079             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
1080         }
1081
1082         /* kernel timings */
1083
1084         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
1085             ocl_event_elapsed_ms(timers->nb_k + iloc);
1086
1087         /* X/q H2D and F D2H timings */
1088         timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
1089         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
1090         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
1091         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
1092         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
1093
1094         /* only count atdat and pair-list H2D at pair-search step */
1095         if (plist->bDoPrune)
1096         {
1097             /* atdat transfer timing (add only once, at local F wait) */
1098             if (LOCAL_A(aloc))
1099             {
1100                 timings->pl_h2d_c++;
1101                 timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
1102             }
1103
1104             timings->pl_h2d_t +=
1105                 ocl_event_elapsed_ms(timers->pl_h2d_sci     + iloc) +
1106                 ocl_event_elapsed_ms(timers->pl_h2d_cj4     + iloc) +
1107                 ocl_event_elapsed_ms(timers->pl_h2d_excl    + iloc);
1108
1109         }
1110     }
1111
1112     /* add up energies and shift forces (only once at local F wait) */
1113     if (LOCAL_I(iloc))
1114     {
1115         if (bCalcEner)
1116         {
1117             *e_lj += *nbst.e_lj;
1118             *e_el += *nbst.e_el;
1119         }
1120
1121         if (bCalcFshift)
1122         {
1123             for (i = 0; i < SHIFTS; i++)
1124             {
1125                 fshift[i][0] += (nbst.fshift)[i][0];
1126                 fshift[i][1] += (nbst.fshift)[i][1];
1127                 fshift[i][2] += (nbst.fshift)[i][2];
1128             }
1129         }
1130     }
1131
1132     /* turn off pruning (doesn't matter if this is pair-search step or not) */
1133     plist->bDoPrune = false;
1134
1135 }
1136
1137 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
1138 int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
1139 {
1140     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
1141     int  kernel_type;
1142
1143     /* Benchmarking/development environment variables to force the use of
1144        analytical or tabulated Ewald kernel. */
1145     bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
1146     bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
1147
1148     if (bForceAnalyticalEwald && bForceTabulatedEwald)
1149     {
1150         gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
1151                    "requested through environment variables.");
1152     }
1153
1154     /* OpenCL: By default, use analytical Ewald
1155      * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
1156      *
1157      * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
1158      *
1159      */
1160     //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1161     if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1162     {
1163         bUseAnalyticalEwald = true;
1164
1165         if (debug)
1166         {
1167             fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
1168         }
1169     }
1170     else
1171     {
1172         bUseAnalyticalEwald = false;
1173
1174         if (debug)
1175         {
1176             fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
1177         }
1178     }
1179
1180     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
1181        forces it (use it for debugging/benchmarking only). */
1182     if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
1183     {
1184         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
1185     }
1186     else
1187     {
1188         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
1189     }
1190
1191     return kernel_type;
1192 }