src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \ingroup module_mdlib
  42  */
  43 #include "gmxpre.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdlib.h>
  49
  50 #if defined(_MSVC)
  51 #include <limits>
  52 #endif
  53
  54 #include "gromacs/gmxlib/ocl_tools/oclutils.h"
  55 #include "gromacs/legacyheaders/types/force_flags.h"
  56 #include "gromacs/legacyheaders/types/hw_info.h"
  57 #include "gromacs/legacyheaders/types/simple.h"
  58 #include "gromacs/mdlib/nb_verlet.h"
  59 #include "gromacs/mdlib/nbnxn_consts.h"
  60 #include "gromacs/mdlib/nbnxn_pairlist.h"
  61 #include "gromacs/timing/gpu_timing.h"
  62
  63 #ifdef TMPI_ATOMICS
  64 #include "thread_mpi/atomic.h"
  65 #endif
  66
  67 #include "gromacs/mdlib/nbnxn_gpu.h"
  68 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  69 #include "gromacs/pbcutil/ishift.h"
  70 #include "gromacs/utility/cstringutil.h"
  71 #include "gromacs/utility/fatalerror.h"
  72
  73 #include "nbnxn_ocl_types.h"
  74
  75 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  76 #define USE_TEXOBJ
  77 #endif
  78
  79 /*! \brief Convenience defines */
  80 //@{
  81 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  82 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  83 //@}
  84
  85 /*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
  86 //@{
  87 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
  88 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
  89 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
  90 //@}
  91
  92 /* Uncomment this define to enable kernel debugging */
  93 //#define DEBUG_OCL
  94
  95 /*! \brief Specifies which kernel run to debug */
  96 #define DEBUG_RUN_STEP 2
  97
  98 /*! \brief Validates the input global work size parameter.
  99  */
 100 static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
 101 {
 102     cl_uint device_size_t_size_bits;
 103     cl_uint host_size_t_size_bits;
 104
 105     assert(dinfo);
 106
 107     /* Each component of a global_work_size must not exceed the range given by the
 108        sizeof(device size_t) for the device on which the kernel execution will
 109        be enqueued. See:
 110        https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 111      */
 112     device_size_t_size_bits = dinfo->adress_bits;
 113     host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
 114
 115     /* If sizeof(host size_t) <= sizeof(device size_t)
 116             => global_work_size components will always be valid
 117        else
 118             => get device limit for global work size and
 119             compare it against each component of global_work_size.
 120      */
 121     if (host_size_t_size_bits > device_size_t_size_bits)
 122     {
 123         size_t device_limit;
 124
 125         device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
 126
 127         for (int i = 0; i < work_dim; i++)
 128         {
 129             if (global_work_size[i] > device_limit)
 130             {
 131                 gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 132                           "The number of nonbonded work units (=number of super-clusters) exceeds the"
 133                           "device capabilities. Global work size limit exceeded (%d > %d)!",
 134                           global_work_size[i], device_limit);
 135             }
 136         }
 137     }
 138 }
 139
 140 /* Constant arrays listing non-bonded kernel function names. The arrays are
 141  * organized in 2-dim arrays by: electrostatics and VDW type.
 142  *
 143  *  Note that the row- and column-order of function pointers has to match the
 144  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 145  *  defined in nbnxn_cuda_types.h.
 146  */
 147
 148 /*! \brief Force-only kernel function names. */
 149 static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
 150 {
 151     { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
 152     { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
 153     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
 154     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
 155     { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
 156     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
 157 };
 158
 159 /*! \brief Force + energy kernel function pointers. */
 160 static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
 161 {
 162     { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"              },
 163     { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"               },
 164     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"          },
 165     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl"     },
 166     { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"               },
 167     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"        }
 168 };
 169
 170 /*! \brief Force + pruning kernel function pointers. */
 171 static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
 172 {
 173     { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"            },
 174     { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"             },
 175     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",         "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"        },
 176     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",  "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
 177     { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"             },
 178     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",       "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"      }
 179 };
 180
 181 /*! \brief Force + energy + pruning kernel function pointers. */
 182 static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
 183 {
 184     { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
 185     { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
 186     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
 187     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
 188     { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
 189     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
 190 };
 191
 192 /*! \brief Return a pointer to the kernel version to be executed at the current step.
 193  *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 194  *  found in the cache, it will be created and the cache will be updated.
 195  */
 196 static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
 197                                             int                eeltype,
 198                                             int                evdwtype,
 199                                             bool               bDoEne,
 200                                             bool               bDoPrune)
 201 {
 202     const char* kernel_name_to_run;
 203     cl_kernel  *kernel_ptr;
 204     cl_int      cl_error;
 205
 206     assert(eeltype < eelOclNR);
 207     assert(evdwtype < eelOclNR);
 208
 209     if (bDoEne)
 210     {
 211         if (bDoPrune)
 212         {
 213             kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 214             kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
 215         }
 216         else
 217         {
 218             kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 219             kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
 220         }
 221     }
 222     else
 223     {
 224         if (bDoPrune)
 225         {
 226             kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 227             kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
 228         }
 229         else
 230         {
 231             kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 232             kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
 233         }
 234     }
 235
 236     if (NULL == kernel_ptr[0])
 237     {
 238         *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
 239         assert(cl_error == CL_SUCCESS);
 240     }
 241     // TODO: handle errors
 242
 243     return *kernel_ptr;
 244 }
 245
 246 /*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
 247  */
 248 static inline int calc_shmem_required()
 249 {
 250     int shmem;
 251
 252     /* size of shmem (force-buffers/xq/atom type preloading) */
 253     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 254     /* i-atom x+q in shared memory */
 255     //shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 256     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
 257     /* cj in shared memory, for both warps separately */
 258     shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);       /* cjs  */
 259 #ifdef IATYPE_SHMEM                                         // CUDA ARCH >= 300
 260     /* i-atom types in shared memory */
 261     #pragma error "Should not be defined"
 262     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);       /* atib */
 263 #endif
 264     /* force reduction buffers in shared memory */
 265     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
 266     /* Warp vote. In fact it must be * number of warps in block.. */
 267     shmem += sizeof(cl_uint) * 2;                   /* warp_any */
 268     return shmem;
 269 }
 270
 271 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 272  *
 273  *  The device can't use the same data structures as the host for two main reasons:
 274  *  - OpenCL restrictions (pointers are not accepted inside data structures)
 275  *  - some host side fields are not needed for the OpenCL kernels.
 276  */
 277 static void fillin_ocl_structures(cl_nbparam_t        *nbp,
 278                                   cl_nbparam_params_t *nbparams_params)
 279 {
 280     nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
 281     nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
 282     nbparams_params->c_rf              = nbp->c_rf;
 283     nbparams_params->dispersion_shift  = nbp->dispersion_shift;
 284     nbparams_params->eeltype           = nbp->eeltype;
 285     nbparams_params->epsfac            = nbp->epsfac;
 286     nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
 287     nbparams_params->ewald_beta        = nbp->ewald_beta;
 288     nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
 289     nbparams_params->repulsion_shift   = nbp->repulsion_shift;
 290     nbparams_params->rlist_sq          = nbp->rlist_sq;
 291     nbparams_params->rvdw_sq           = nbp->rvdw_sq;
 292     nbparams_params->rvdw_switch       = nbp->rvdw_switch;
 293     nbparams_params->sh_ewald          = nbp->sh_ewald;
 294     nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
 295     nbparams_params->two_k_rf          = nbp->two_k_rf;
 296     nbparams_params->vdwtype           = nbp->vdwtype;
 297     nbparams_params->vdw_switch        = nbp->vdw_switch;
 298 }
 299
 300 /*! \brief Waits for the commands associated with the input event to finish.
 301  * Then it releases the event and sets it to 0.
 302  * Don't use this function when more than one wait will be issued for the event.
 303  */
 304 void wait_ocl_event(cl_event *ocl_event)
 305 {
 306     cl_int gmx_unused cl_error;
 307
 308     /* Blocking wait for the event */
 309     cl_error = clWaitForEvents(1, ocl_event);
 310     assert(CL_SUCCESS == cl_error);
 311
 312     /* Release event and reset it to 0 */
 313     cl_error = clReleaseEvent(*ocl_event);
 314     assert(CL_SUCCESS == cl_error);
 315     *ocl_event = 0;
 316 }
 317
 318 /*! \brief Enqueues a wait for event completion.
 319  *
 320  * Then it releases the event and sets it to 0.
 321  * Don't use this function when more than one wait will be issued for the event.
 322  * Equivalent to Cuda Stream Sync. */
 323 void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 324 {
 325     cl_int gmx_unused cl_error;
 326
 327     /* Enqueue wait */
 328 #ifdef CL_VERSION_1_2
 329     cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
 330 #else
 331     cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
 332 #endif
 333
 334     assert(CL_SUCCESS == cl_error);
 335
 336     /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
 337     cl_error = clReleaseEvent(*ocl_event);
 338     assert(CL_SUCCESS == cl_error);
 339     *ocl_event = 0;
 340 }
 341
 342 /*! \brief Returns the duration in miliseconds for the command associated with the event.
 343  *
 344  * It then releases the event and sets it to 0.
 345  * Before calling this function, make sure the command has finished either by
 346  * calling clFinish or clWaitForEvents.
 347  * The function returns 0.0 if the input event, *ocl_event, is 0.
 348  * Don't use this function when more than one wait will be issued for the event.
 349  */
 350 double ocl_event_elapsed_ms(cl_event *ocl_event)
 351 {
 352     cl_int gmx_unused cl_error;
 353     cl_ulong          start_ns, end_ns;
 354     double            elapsed_ms;
 355
 356     elapsed_ms = 0.0;
 357     assert(NULL != ocl_event);
 358
 359     if (*ocl_event)
 360     {
 361         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
 362                                            sizeof(cl_ulong), &start_ns, NULL);
 363         assert(CL_SUCCESS == cl_error);
 364
 365         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
 366                                            sizeof(cl_ulong), &end_ns, NULL);
 367         assert(CL_SUCCESS == cl_error);
 368
 369         clReleaseEvent(*ocl_event);
 370         *ocl_event = 0;
 371
 372         elapsed_ms = (end_ns - start_ns) / 1000000.0;
 373     }
 374
 375     return elapsed_ms;
 376 }
 377
 378 /*! \brief Launch GPU kernel
 379
 380    As we execute nonbonded workload in separate queues, before launching
 381    the kernel we need to make sure that he following operations have completed:
 382    - atomdata allocation and related H2D transfers (every nstlist step);
 383    - pair list H2D transfer (every nstlist step);
 384    - shift vector H2D transfer (every nstlist step);
 385    - force (+shift force and energy) output clearing (every step).
 386
 387    These operations are issued in the local queue at the beginning of the step
 388    and therefore always complete before the local kernel launch. The non-local
 389    kernel is launched after the local on the same device/context, so this is
 390    inherently scheduled after the operations in the local stream (including the
 391    above "misc_ops").
 392    However, for the sake of having a future-proof implementation, we use the
 393    misc_ops_done event to record the point in time when the above  operations
 394    are finished and synchronize with this event in the non-local stream.
 395  */
 396 void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
 397                              const struct nbnxn_atomdata_t *nbatom,
 398                              int                            flags,
 399                              int                            iloc)
 400 {
 401     cl_int               cl_error;
 402     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 403     /* OpenCL kernel launch-related stuff */
 404     int                  shmem;
 405     size_t               local_work_size[3], global_work_size[3];
 406     cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 407
 408     cl_atomdata_t       *adat    = nb->atdat;
 409     cl_nbparam_t        *nbp     = nb->nbparam;
 410     cl_plist_t          *plist   = nb->plist[iloc];
 411     cl_timers_t         *t       = nb->timers;
 412     cl_command_queue     stream  = nb->stream[iloc];
 413
 414     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 415     int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
 416     bool                 bDoTime     = nb->bDoTime;
 417     cl_uint              arg_no;
 418
 419     cl_nbparam_params_t  nbparams_params;
 420 #ifdef DEBUG_OCL
 421     float              * debug_buffer_h;
 422     size_t               debug_buffer_size;
 423 #endif
 424
 425     /* turn energy calculation always on/off (for debugging/testing only) */
 426     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 427
 428     /* Don't launch the non-local kernel if there is no work to do.
 429        Doing the same for the local kernel is more complicated, since the
 430        local part of the force array also depends on the non-local kernel.
 431        So to avoid complicating the code and to reduce the risk of bugs,
 432        we always call the local kernel, the local x+q copy and later (not in
 433        this function) the stream wait, local f copyback and the f buffer
 434        clearing. All these operations, except for the local interaction kernel,
 435        are needed for the non-local interactions. The skip of the local kernel
 436        call is taken care of later in this function. */
 437     if (iloc == eintNonlocal && plist->nsci == 0)
 438     {
 439         return;
 440     }
 441
 442     /* calculate the atom data index range based on locality */
 443     if (LOCAL_I(iloc))
 444     {
 445         adat_begin  = 0;
 446         adat_len    = adat->natoms_local;
 447     }
 448     else
 449     {
 450         adat_begin  = adat->natoms_local;
 451         adat_len    = adat->natoms - adat->natoms_local;
 452     }
 453
 454     /* beginning of timed HtoD section */
 455
 456     /* HtoD x, q */
 457     ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
 458                        adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
 459
 460     /* When we get here all misc operations issues in the local stream as well as
 461        the local xq H2D are done,
 462        so we record that in the local stream and wait for it in the nonlocal one. */
 463     if (nb->bUseTwoStreams)
 464     {
 465         if (iloc == eintLocal)
 466         {
 467 #ifdef CL_VERSION_1_2
 468             cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
 469 #else
 470             cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
 471 #endif
 472             assert(CL_SUCCESS == cl_error);
 473         }
 474         else
 475         {
 476             sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
 477         }
 478     }
 479
 480     if (plist->nsci == 0)
 481     {
 482         /* Don't launch an empty local kernel (is not allowed with OpenCL).
 483          * TODO: Separate H2D and kernel launch into separate functions.
 484          */
 485         return;
 486     }
 487
 488     /* beginning of timed nonbonded calculation section */
 489
 490     /* get the pointer to the kernel flavor we need to use */
 491     nb_kernel = select_nbnxn_kernel(nb,
 492                                     nbp->eeltype,
 493                                     nbp->vdwtype,
 494                                     bCalcEner,
 495                                     plist->bDoPrune || always_prune);
 496
 497     /* kernel launch config */
 498     local_work_size[0] = CL_SIZE;
 499     local_work_size[1] = CL_SIZE;
 500     local_work_size[2] = 1;
 501
 502     global_work_size[0] = plist->nsci * local_work_size[0];
 503     global_work_size[1] = 1 * local_work_size[1];
 504     global_work_size[2] = 1 * local_work_size[2];
 505
 506     validate_global_work_size(global_work_size, 3, nb->dev_info);
 507
 508     shmem     = calc_shmem_required();
 509
 510 #ifdef DEBUG_OCL
 511     {
 512         static int run_step = 1;
 513
 514         if (DEBUG_RUN_STEP == run_step)
 515         {
 516             debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
 517             debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
 518             assert(NULL != debug_buffer_h);
 519
 520             if (NULL == nb->debug_buffer)
 521             {
 522                 nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 523                                                   debug_buffer_size, debug_buffer_h, &cl_error);
 524
 525                 assert(CL_SUCCESS == cl_error);
 526             }
 527         }
 528
 529         run_step++;
 530     }
 531 #endif
 532     if (debug)
 533     {
 534         fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
 535                 "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 536                 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
 537                 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
 538                 NCL_PER_SUPERCL, plist->na_c);
 539     }
 540
 541     fillin_ocl_structures(nbp, &nbparams_params);
 542
 543     arg_no    = 0;
 544     cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
 545     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
 546     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
 547     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
 548     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 549     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
 550     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
 551     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
 552     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
 553     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
 554     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
 555     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
 556     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
 557     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
 558     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
 559     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
 560     cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
 561     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
 562
 563     assert(cl_error == CL_SUCCESS);
 564
 565     if (cl_error)
 566     {
 567         printf("ClERROR! %d\n", cl_error);
 568     }
 569     cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
 570     assert(cl_error == CL_SUCCESS);
 571
 572 #ifdef DEBUG_OCL
 573     {
 574         static int run_step = 1;
 575
 576         if (DEBUG_RUN_STEP == run_step)
 577         {
 578             FILE *pf;
 579             char  file_name[256] = {0};
 580
 581             ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
 582                                debug_buffer_size, stream, NULL);
 583
 584             // Make sure all data has been transfered back from device
 585             clFinish(stream);
 586
 587             printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
 588
 589             sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
 590             pf = fopen(file_name, "wt");
 591             assert(pf != NULL);
 592
 593             fprintf(pf, "%20s", "");
 594             for (int j = 0; j < global_work_size[0]; j++)
 595             {
 596                 char label[20];
 597                 sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
 598                 fprintf(pf, "%20s", label);
 599             }
 600
 601             for (int i = 0; i < global_work_size[1]; i++)
 602             {
 603                 char label[20];
 604                 sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
 605                 fprintf(pf, "\n%20s", label);
 606
 607                 for (int j = 0; j < global_work_size[0]; j++)
 608                 {
 609                     fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
 610                 }
 611
 612                 //fprintf(pf, "\n");
 613             }
 614
 615             fclose(pf);
 616
 617             printf(" done.\n");
 618
 619
 620             free(debug_buffer_h);
 621             debug_buffer_h = NULL;
 622         }
 623
 624         run_step++;
 625     }
 626 #endif
 627 }
 628
 629 /*! \brief Debugging helper function */
 630 void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
 631 {
 632     FILE *pf;
 633
 634     pf = fopen(out_file, "wt");
 635     assert(pf != NULL);
 636
 637     fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
 638             "cj[0]", "cj[1]", "cj[2]", "cj[3]",
 639             "imei[0].excl_ind", "imei[0].imask",
 640             "imei[1].excl_ind", "imei[1].imask");
 641
 642     for (int index = 0; index < cnt; index++)
 643     {
 644         fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
 645                 results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
 646                 results[index].imei[0].excl_ind, results[index].imei[0].imask,
 647                 results[index].imei[1].excl_ind, results[index].imei[1].imask);
 648     }
 649
 650     fclose(pf);
 651
 652     printf("\nWrote results to %s", out_file);
 653
 654     pf = fopen(ref_file, "rt");
 655     if (pf)
 656     {
 657         char c;
 658         int  diff = 0;
 659         printf("\n%s file found. Comparing results...", ref_file);
 660
 661         /* Skip the first line */
 662         c = 0;
 663         while (c != '\n')
 664         {
 665             if (1 != fscanf(pf, "%c", &c))
 666             {
 667                 break;
 668             }
 669         }
 670
 671         for (int index = 0; index < cnt; index++)
 672         {
 673             int          ref_val;
 674             unsigned int u_ref_val;
 675
 676             for (int j = 0; j < 4; j++)
 677             {
 678                 if (1 != fscanf(pf, "%20d", &ref_val))
 679                 {
 680                     break;
 681                 }
 682
 683                 if (ref_val != results[index].cj[j])
 684                 {
 685                     printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
 686                            j, index, results[index].cj[j], ref_val);
 687
 688                     diff++;
 689                 }
 690             }
 691
 692             for (int j = 0; j < 2; j++)
 693             {
 694                 if (1 != fscanf(pf, "%20d", &ref_val))
 695                 {
 696                     break;
 697                 }
 698
 699                 if (ref_val != results[index].imei[j].excl_ind)
 700                 {
 701                     printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
 702                            j, index, results[index].imei[j].excl_ind, ref_val);
 703
 704                     diff++;
 705                 }
 706
 707                 if (1 != fscanf(pf, "%20u", &u_ref_val))
 708                 {
 709                     break;
 710                 }
 711
 712                 if (u_ref_val != results[index].imei[j].imask)
 713                 {
 714                     printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
 715                            j, index, results[index].imei[j].imask, u_ref_val);
 716
 717                     diff++;
 718                 }
 719
 720             }
 721         }
 722
 723         printf("\nFinished comparing results. Total number of differences: %d", diff);
 724         fclose(pf);
 725     }
 726     else
 727     {
 728         printf("\n%s file not found. No comparison performed.", ref_file);
 729     }
 730 }
 731
 732 /*! \brief Debugging helper function */
 733 void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
 734 {
 735     FILE *pf;
 736     float cmp_eps = 0.001f;
 737
 738     pf = fopen(out_file, "wt");
 739     assert(pf != NULL);
 740
 741     for (int index = 0; index < cnt; index++)
 742     {
 743         fprintf(pf, "%15.5f\n", results[index]);
 744     }
 745
 746     fclose(pf);
 747
 748     printf("\nWrote results to %s", out_file);
 749
 750     pf = fopen(ref_file, "rt");
 751     if (pf)
 752     {
 753         int diff = 0;
 754         printf("\n%s file found. Comparing results...", ref_file);
 755         for (int index = 0; index < cnt; index++)
 756         {
 757             float ref_val;
 758             if (1 != fscanf(pf, "%20f", &ref_val))
 759             {
 760                 break;
 761             }
 762
 763             if (((ref_val - results[index]) > cmp_eps) ||
 764                 ((ref_val - results[index]) < -cmp_eps))
 765             {
 766                 printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
 767                        index, results[index], ref_val);
 768
 769                 diff++;
 770             }
 771         }
 772
 773         printf("\nFinished comparing results. Total number of differences: %d", diff);
 774         fclose(pf);
 775     }
 776     else
 777     {
 778         printf("\n%s file not found. No comparison performed.", ref_file);
 779     }
 780 }
 781
 782 /*! \brief
 783  * Debug function for dumping cj4, f and fshift buffers.
 784  * By default this function does nothing. To enable debugging for any of these
 785  * buffers, uncomment the corresponding definition inside the function:
 786  * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
 787  */
 788 static
 789 void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t               gmx_unused *nb,
 790                              const struct nbnxn_atomdata_t gmx_unused *nbatom,
 791                              cl_command_queue              gmx_unused  stream,
 792                              int                           gmx_unused  adat_begin,
 793                              int                           gmx_unused  adat_len)
 794 {
 795 /* Uncomment this define to enable cj4 debugging for the first kernel run */
 796 //#define DEBUG_DUMP_CJ4_OCL
 797 #ifdef DEBUG_DUMP_CJ4_OCL
 798     {
 799         static int run_step = 1;
 800
 801         if (DEBUG_RUN_STEP == run_step)
 802         {
 803             nbnxn_cj4_t *temp_cj4;
 804             int          cnt;
 805             size_t       size;
 806             char         ocl_file_name[256]  = {0};
 807             char         cuda_file_name[256] = {0};
 808
 809             cnt      = nb->plist[0]->ncj4;
 810             size     = cnt * sizeof(nbnxn_cj4_t);
 811             temp_cj4 = (nbnxn_cj4_t*)malloc(size);
 812
 813             ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
 814                                size, stream, NULL);
 815
 816             // Make sure all data has been transfered back from device
 817             clFinish(stream);
 818
 819             sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
 820             sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
 821             dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
 822
 823             free(temp_cj4);
 824         }
 825
 826         run_step++;
 827     }
 828 #endif
 829
 830 /* Uncomment this define to enable f debugging for the first kernel run */
 831 //#define DEBUG_DUMP_F_OCL
 832 #ifdef DEBUG_DUMP_F_OCL
 833     {
 834         static int run_step = 1;
 835
 836         if (DEBUG_RUN_STEP == run_step)
 837         {
 838             char ocl_file_name[256]  = {0};
 839             char cuda_file_name[256] = {0};
 840
 841             // Make sure all data has been transfered back from device
 842             clFinish(stream);
 843
 844             sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
 845             sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
 846
 847             dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
 848                                    ocl_file_name, cuda_file_name);
 849         }
 850
 851         run_step++;
 852     }
 853 #endif
 854
 855 /* Uncomment this define to enable fshift debugging for the first kernel run */
 856 //#define DEBUG_DUMP_FSHIFT_OCL
 857 #ifdef DEBUG_DUMP_FSHIFT_OCL
 858     {
 859         static int run_step = 1;
 860
 861         if (DEBUG_RUN_STEP == run_step)
 862         {
 863             char ocl_file_name[256]  = {0};
 864             char cuda_file_name[256] = {0};
 865
 866             // Make sure all data has been transfered back from device
 867             clFinish(stream);
 868
 869             sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
 870             sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
 871
 872             dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
 873                                    ocl_file_name, cuda_file_name);
 874         }
 875
 876         run_step++;
 877     }
 878 #endif
 879 }
 880
 881 /*! \brief
 882  * Launch asynchronously the download of nonbonded forces from the GPU
 883  * (and energies/shift forces if required).
 884  */
 885 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
 886                               const struct nbnxn_atomdata_t *nbatom,
 887                               int                            flags,
 888                               int                            aloc)
 889 {
 890     cl_int gmx_unused cl_error;
 891     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 892     int               iloc = -1;
 893
 894     /* determine interaction locality from atom locality */
 895     if (LOCAL_A(aloc))
 896     {
 897         iloc = eintLocal;
 898     }
 899     else if (NONLOCAL_A(aloc))
 900     {
 901         iloc = eintNonlocal;
 902     }
 903     else
 904     {
 905         char stmp[STRLEN];
 906         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 907                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 908
 909         gmx_incons(stmp);
 910     }
 911
 912     cl_atomdata_t   *adat    = nb->atdat;
 913     cl_timers_t     *t       = nb->timers;
 914     bool             bDoTime = nb->bDoTime;
 915     cl_command_queue stream  = nb->stream[iloc];
 916
 917     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 918     int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
 919
 920
 921     /* don't launch non-local copy-back if there was no non-local work to do */
 922     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 923     {
 924         return;
 925     }
 926
 927     /* calculate the atom data index range based on locality */
 928     if (LOCAL_A(aloc))
 929     {
 930         adat_begin  = 0;
 931         adat_len    = adat->natoms_local;
 932     }
 933     else
 934     {
 935         adat_begin  = adat->natoms_local;
 936         adat_len    = adat->natoms - adat->natoms_local;
 937     }
 938
 939     /* beginning of timed D2H section */
 940
 941     /* With DD the local D2H transfer can only start after the non-local
 942        has been launched. */
 943     if (iloc == eintLocal && nb->bUseTwoStreams)
 944     {
 945         sync_ocl_event(stream, &(nb->nonlocal_done));
 946     }
 947
 948     /* DtoH f */
 949     ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
 950                        (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
 951
 952     /* After the non-local D2H is launched the nonlocal_done event can be
 953        recorded which signals that the local D2H can proceed. This event is not
 954        placed after the non-local kernel because we first need the non-local
 955        data back first. */
 956     if (iloc == eintNonlocal)
 957     {
 958 #ifdef CL_VERSION_1_2
 959         cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
 960 #else
 961         cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
 962 #endif
 963         assert(CL_SUCCESS == cl_error);
 964     }
 965
 966     /* only transfer energies in the local stream */
 967     if (LOCAL_I(iloc))
 968     {
 969         /* DtoH fshift */
 970         if (bCalcFshift)
 971         {
 972             ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
 973                                SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
 974         }
 975
 976         /* DtoH energies */
 977         if (bCalcEner)
 978         {
 979             ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
 980                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
 981
 982             ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
 983                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
 984         }
 985     }
 986
 987     debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
 988 }
 989
 990 /*! \brief
 991  * Wait for the asynchronously launched nonbonded calculations and data
 992  * transfers to finish.
 993  */
 994 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
 995                             const nbnxn_atomdata_t gmx_unused *nbatom,
 996                             int flags, int aloc,
 997                             real *e_lj, real *e_el, rvec *fshift)
 998 {
 999     /* NOTE:  only implemented for single-precision at this time */
1000     cl_int gmx_unused      cl_error;
1001     int                    i, iloc = -1;
1002
1003     /* determine interaction locality from atom locality */
1004     if (LOCAL_A(aloc))
1005     {
1006         iloc = eintLocal;
1007     }
1008     else if (NONLOCAL_A(aloc))
1009     {
1010         iloc = eintNonlocal;
1011     }
1012     else
1013     {
1014         char stmp[STRLEN];
1015         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
1016                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
1017         gmx_incons(stmp);
1018     }
1019
1020     cl_plist_t                 *plist    = nb->plist[iloc];
1021     cl_timers_t                *timers   = nb->timers;
1022     struct gmx_wallclock_gpu_t *timings  = nb->timings;
1023     cl_nb_staging               nbst     = nb->nbst;
1024
1025     bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
1026     int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;
1027
1028     /* turn energy calculation always on/off (for debugging/testing only) */
1029     bCalcEner = (bCalcEner || always_ener) && !never_ener;
1030
1031     /* Launch wait/update timers & counters, unless doing the non-local phase
1032        when there is not actually work to do. This is consistent with
1033        nbnxn_gpu_launch_kernel.
1034
1035        NOTE: if timing with multiple GPUs (streams) becomes possible, the
1036        counters could end up being inconsistent due to not being incremented
1037        on some of the nodes! */
1038     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
1039     {
1040         return;
1041     }
1042
1043     /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */
1044     cl_error = clFinish(nb->stream[iloc]);
1045     assert(CL_SUCCESS == cl_error);
1046
1047     /* timing data accumulation */
1048     if (nb->bDoTime)
1049     {
1050         /* only increase counter once (at local F wait) */
1051         if (LOCAL_I(iloc))
1052         {
1053             timings->nb_c++;
1054             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
1055         }
1056
1057         /* kernel timings */
1058
1059         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
1060             ocl_event_elapsed_ms(timers->nb_k + iloc);
1061
1062         /* X/q H2D and F D2H timings */
1063         timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
1064         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
1065         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
1066         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
1067         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
1068
1069         /* only count atdat and pair-list H2D at pair-search step */
1070         if (plist->bDoPrune)
1071         {
1072             /* atdat transfer timing (add only once, at local F wait) */
1073             if (LOCAL_A(aloc))
1074             {
1075                 timings->pl_h2d_c++;
1076                 timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
1077             }
1078
1079             timings->pl_h2d_t +=
1080                 ocl_event_elapsed_ms(timers->pl_h2d_sci     + iloc) +
1081                 ocl_event_elapsed_ms(timers->pl_h2d_cj4     + iloc) +
1082                 ocl_event_elapsed_ms(timers->pl_h2d_excl    + iloc);
1083
1084         }
1085     }
1086
1087     /* add up energies and shift forces (only once at local F wait) */
1088     if (LOCAL_I(iloc))
1089     {
1090         if (bCalcEner)
1091         {
1092             *e_lj += *nbst.e_lj;
1093             *e_el += *nbst.e_el;
1094         }
1095
1096         if (bCalcFshift)
1097         {
1098             for (i = 0; i < SHIFTS; i++)
1099             {
1100                 fshift[i][0] += (nbst.fshift)[i][0];
1101                 fshift[i][1] += (nbst.fshift)[i][1];
1102                 fshift[i][2] += (nbst.fshift)[i][2];
1103             }
1104         }
1105     }
1106
1107     /* turn off pruning (doesn't matter if this is pair-search step or not) */
1108     plist->bDoPrune = false;
1109
1110 }
1111
1112 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
1113 int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
1114 {
1115     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
1116     int  kernel_type;
1117
1118     /* Benchmarking/development environment variables to force the use of
1119        analytical or tabulated Ewald kernel. */
1120     bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
1121     bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
1122
1123     if (bForceAnalyticalEwald && bForceTabulatedEwald)
1124     {
1125         gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
1126                    "requested through environment variables.");
1127     }
1128
1129     /* CUDA: By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
1130     /* OpenCL: By default, use analytical Ewald, on earlier tabulated. */
1131     // TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
1132     //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1133     if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1134     {
1135         bUseAnalyticalEwald = true;
1136
1137         if (debug)
1138         {
1139             fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
1140         }
1141     }
1142     else
1143     {
1144         bUseAnalyticalEwald = false;
1145
1146         if (debug)
1147         {
1148             fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
1149         }
1150     }
1151
1152     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
1153        forces it (use it for debugging/benchmarking only). */
1154     if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
1155     {
1156         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
1157     }
1158     else
1159     {
1160         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
1161     }
1162
1163     return kernel_type;
1164 }