src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \ingroup module_mdlib
  42  */
  43 #include "gmxpre.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdlib.h>
  49
  50 #if defined(_MSVC)
  51 #include <limits>
  52 #endif
  53
  54 #include "gromacs/gmxlib/ocl_tools/oclutils.h"
  55 #include "gromacs/legacyheaders/types/force_flags.h"
  56 #include "gromacs/legacyheaders/types/hw_info.h"
  57 #include "gromacs/legacyheaders/types/simple.h"
  58 #include "gromacs/mdlib/nb_verlet.h"
  59 #include "gromacs/mdlib/nbnxn_consts.h"
  60 #include "gromacs/mdlib/nbnxn_pairlist.h"
  61 #include "gromacs/timing/gpu_timing.h"
  62
  63 #ifdef TMPI_ATOMICS
  64 #include "thread_mpi/atomic.h"
  65 #endif
  66
  67 #include "gromacs/mdlib/nbnxn_gpu.h"
  68 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  69 #include "gromacs/pbcutil/ishift.h"
  70 #include "gromacs/utility/cstringutil.h"
  71 #include "gromacs/utility/fatalerror.h"
  72
  73 #include "nbnxn_ocl_types.h"
  74
  75 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  76 #define USE_TEXOBJ
  77 #endif
  78
  79 /*! \brief Convenience defines */
  80 //@{
  81 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  82 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  83 //@}
  84
  85 /*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
  86 //@{
  87 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
  88 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
  89 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
  90 //@}
  91
  92 /* Uncomment this define to enable kernel debugging */
  93 //#define DEBUG_OCL
  94
  95 /*! \brief Specifies which kernel run to debug */
  96 #define DEBUG_RUN_STEP 2
  97
  98 /*! \brief Validates the input global work size parameter.
  99  */
 100 static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
 101 {
 102     cl_uint device_size_t_size_bits;
 103     cl_uint host_size_t_size_bits;
 104
 105     assert(dinfo);
 106
 107     /* Each component of a global_work_size must not exceed the range given by the
 108        sizeof(device size_t) for the device on which the kernel execution will
 109        be enqueued. See:
 110        https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 111      */
 112     device_size_t_size_bits = dinfo->adress_bits;
 113     host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
 114
 115     /* If sizeof(host size_t) <= sizeof(device size_t)
 116             => global_work_size components will always be valid
 117        else
 118             => get device limit for global work size and
 119             compare it against each component of global_work_size.
 120      */
 121     if (host_size_t_size_bits > device_size_t_size_bits)
 122     {
 123         size_t device_limit;
 124
 125         device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
 126
 127         for (int i = 0; i < work_dim; i++)
 128         {
 129             if (global_work_size[i] > device_limit)
 130             {
 131                 gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 132                           "The number of nonbonded work units (=number of super-clusters) exceeds the"
 133                           "device capabilities. Global work size limit exceeded (%d > %d)!",
 134                           global_work_size[i], device_limit);
 135             }
 136         }
 137     }
 138 }
 139
 140 /* Constant arrays listing non-bonded kernel function names. The arrays are
 141  * organized in 2-dim arrays by: electrostatics and VDW type.
 142  *
 143  *  Note that the row- and column-order of function pointers has to match the
 144  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 145  *  defined in nbnxn_cuda_types.h.
 146  */
 147
 148 /*! \brief Force-only kernel function names. */
 149 static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
 150 {
 151     { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
 152     { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
 153     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
 154     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
 155     { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
 156     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
 157 };
 158
 159 /*! \brief Force + energy kernel function pointers. */
 160 static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
 161 {
 162     { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"              },
 163     { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"               },
 164     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"          },
 165     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl"     },
 166     { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"               },
 167     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"        }
 168 };
 169
 170 /*! \brief Force + pruning kernel function pointers. */
 171 static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
 172 {
 173     { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"            },
 174     { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"             },
 175     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",         "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"        },
 176     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",  "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
 177     { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"             },
 178     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",       "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"      }
 179 };
 180
 181 /*! \brief Force + energy + pruning kernel function pointers. */
 182 static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
 183 {
 184     { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
 185     { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
 186     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
 187     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
 188     { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
 189     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
 190 };
 191
 192 /*! \brief Return a pointer to the kernel version to be executed at the current step.
 193  *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 194  *  found in the cache, it will be created and the cache will be updated.
 195  */
 196 static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
 197                                             int                eeltype,
 198                                             int                evdwtype,
 199                                             bool               bDoEne,
 200                                             bool               bDoPrune)
 201 {
 202     const char* kernel_name_to_run;
 203     cl_kernel  *kernel_ptr;
 204     cl_int      cl_error;
 205
 206     assert(eeltype < eelOclNR);
 207     assert(evdwtype < eelOclNR);
 208
 209     if (bDoEne)
 210     {
 211         if (bDoPrune)
 212         {
 213             kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 214             kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
 215         }
 216         else
 217         {
 218             kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 219             kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
 220         }
 221     }
 222     else
 223     {
 224         if (bDoPrune)
 225         {
 226             kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 227             kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
 228         }
 229         else
 230         {
 231             kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 232             kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
 233         }
 234     }
 235
 236     if (NULL == kernel_ptr[0])
 237     {
 238         *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
 239         assert(cl_error == CL_SUCCESS);
 240     }
 241     // TODO: handle errors
 242
 243     return *kernel_ptr;
 244 }
 245
 246 /*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
 247  */
 248 static inline int calc_shmem_required()
 249 {
 250     int shmem;
 251
 252     /* size of shmem (force-buffers/xq/atom type preloading) */
 253     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 254     /* i-atom x+q in shared memory */
 255     //shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 256     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
 257     /* cj in shared memory, for both warps separately */
 258     shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);       /* cjs  */
 259 #ifdef IATYPE_SHMEM                                         // CUDA ARCH >= 300
 260     /* i-atom types in shared memory */
 261     #pragma error "Should not be defined"
 262     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);       /* atib */
 263 #endif
 264     /* force reduction buffers in shared memory */
 265     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
 266     /* Warp vote. In fact it must be * number of warps in block.. */
 267     shmem += sizeof(cl_uint) * 2;                   /* warp_any */
 268     return shmem;
 269 }
 270
 271 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 272  *
 273  *  The device can't use the same data structures as the host for two main reasons:
 274  *  - OpenCL restrictions (pointers are not accepted inside data structures)
 275  *  - some host side fields are not needed for the OpenCL kernels.
 276  */
 277 static void fillin_ocl_structures(cl_nbparam_t        *nbp,
 278                                   cl_nbparam_params_t *nbparams_params)
 279 {
 280     nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
 281     nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
 282     nbparams_params->c_rf              = nbp->c_rf;
 283     nbparams_params->dispersion_shift  = nbp->dispersion_shift;
 284     nbparams_params->eeltype           = nbp->eeltype;
 285     nbparams_params->epsfac            = nbp->epsfac;
 286     nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
 287     nbparams_params->ewald_beta        = nbp->ewald_beta;
 288     nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
 289     nbparams_params->repulsion_shift   = nbp->repulsion_shift;
 290     nbparams_params->rlist_sq          = nbp->rlist_sq;
 291     nbparams_params->rvdw_sq           = nbp->rvdw_sq;
 292     nbparams_params->rvdw_switch       = nbp->rvdw_switch;
 293     nbparams_params->sh_ewald          = nbp->sh_ewald;
 294     nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
 295     nbparams_params->two_k_rf          = nbp->two_k_rf;
 296     nbparams_params->vdwtype           = nbp->vdwtype;
 297     nbparams_params->vdw_switch        = nbp->vdw_switch;
 298 }
 299
 300 /*! \brief Waits for the commands associated with the input event to finish.
 301  * Then it releases the event and sets it to 0.
 302  * Don't use this function when more than one wait will be issued for the event.
 303  */
 304 void wait_ocl_event(cl_event *ocl_event)
 305 {
 306     cl_int gmx_unused cl_error;
 307
 308     /* Blocking wait for the event */
 309     cl_error = clWaitForEvents(1, ocl_event);
 310     assert(CL_SUCCESS == cl_error);
 311
 312     /* Release event and reset it to 0 */
 313     cl_error = clReleaseEvent(*ocl_event);
 314     assert(CL_SUCCESS == cl_error);
 315     *ocl_event = 0;
 316 }
 317
 318 /*! \brief Enqueues a wait for event completion.
 319  *
 320  * Then it releases the event and sets it to 0.
 321  * Don't use this function when more than one wait will be issued for the event.
 322  * Equivalent to Cuda Stream Sync. */
 323 void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 324 {
 325     cl_int gmx_unused cl_error;
 326
 327     /* Enqueue wait */
 328 #ifdef CL_VERSION_1_2
 329     cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
 330 #else
 331     cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
 332 #endif
 333
 334     assert(CL_SUCCESS == cl_error);
 335
 336     /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
 337     cl_error = clReleaseEvent(*ocl_event);
 338     assert(CL_SUCCESS == cl_error);
 339     *ocl_event = 0;
 340 }
 341
 342 /*! \brief Returns the duration in miliseconds for the command associated with the event.
 343  *
 344  * It then releases the event and sets it to 0.
 345  * Before calling this function, make sure the command has finished either by
 346  * calling clFinish or clWaitForEvents.
 347  * The function returns 0.0 if the input event, *ocl_event, is 0.
 348  * Don't use this function when more than one wait will be issued for the event.
 349  */
 350 double ocl_event_elapsed_ms(cl_event *ocl_event)
 351 {
 352     cl_int gmx_unused cl_error;
 353     cl_ulong          start_ns, end_ns;
 354     double            elapsed_ms;
 355
 356     elapsed_ms = 0.0;
 357     assert(NULL != ocl_event);
 358
 359     if (*ocl_event)
 360     {
 361         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
 362                                            sizeof(cl_ulong), &start_ns, NULL);
 363         assert(CL_SUCCESS == cl_error);
 364
 365         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
 366                                            sizeof(cl_ulong), &end_ns, NULL);
 367         assert(CL_SUCCESS == cl_error);
 368
 369         clReleaseEvent(*ocl_event);
 370         *ocl_event = 0;
 371
 372         elapsed_ms = (end_ns - start_ns) / 1000000.0;
 373     }
 374
 375     return elapsed_ms;
 376 }
 377
 378 /*! \brief Launch GPU kernel
 379
 380    As we execute nonbonded workload in separate queues, before launching
 381    the kernel we need to make sure that he following operations have completed:
 382    - atomdata allocation and related H2D transfers (every nstlist step);
 383    - pair list H2D transfer (every nstlist step);
 384    - shift vector H2D transfer (every nstlist step);
 385    - force (+shift force and energy) output clearing (every step).
 386
 387    These operations are issued in the local queue at the beginning of the step
 388    and therefore always complete before the local kernel launch. The non-local
 389    kernel is launched after the local on the same device/context, so this is
 390    inherently scheduled after the operations in the local stream (including the
 391    above "misc_ops").
 392    However, for the sake of having a future-proof implementation, we use the
 393    misc_ops_done event to record the point in time when the above  operations
 394    are finished and synchronize with this event in the non-local stream.
 395  */
 396 void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
 397                              const struct nbnxn_atomdata_t *nbatom,
 398                              int                            flags,
 399                              int                            iloc)
 400 {
 401     cl_int               cl_error;
 402     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 403     /* OpenCL kernel launch-related stuff */
 404     int                  shmem;
 405     size_t               local_work_size[3], global_work_size[3];
 406     cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 407
 408     cl_atomdata_t       *adat    = nb->atdat;
 409     cl_nbparam_t        *nbp     = nb->nbparam;
 410     cl_plist_t          *plist   = nb->plist[iloc];
 411     cl_timers_t         *t       = nb->timers;
 412     cl_command_queue     stream  = nb->stream[iloc];
 413
 414     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 415     int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
 416     bool                 bDoTime     = nb->bDoTime;
 417     cl_uint              arg_no;
 418
 419     cl_nbparam_params_t  nbparams_params;
 420 #ifdef DEBUG_OCL
 421     float              * debug_buffer_h;
 422     size_t               debug_buffer_size;
 423 #endif
 424
 425     /* turn energy calculation always on/off (for debugging/testing only) */
 426     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 427
 428     /* Don't launch the non-local kernel if there is no work to do.
 429        Doing the same for the local kernel is more complicated, since the
 430        local part of the force array also depends on the non-local kernel.
 431        So to avoid complicating the code and to reduce the risk of bugs,
 432        we always call the local kernel, the local x+q copy and later (not in
 433        this function) the stream wait, local f copyback and the f buffer
 434        clearing. All these operations, except for the local interaction kernel,
 435        are needed for the non-local interactions. The skip of the local kernel
 436        call is taken care of later in this function. */
 437     if (iloc == eintNonlocal && plist->nsci == 0)
 438     {
 439         return;
 440     }
 441
 442     /* calculate the atom data index range based on locality */
 443     if (LOCAL_I(iloc))
 444     {
 445         adat_begin  = 0;
 446         adat_len    = adat->natoms_local;
 447     }
 448     else
 449     {
 450         adat_begin  = adat->natoms_local;
 451         adat_len    = adat->natoms - adat->natoms_local;
 452     }
 453
 454     /* When we get here all misc operations issues in the local stream are done,
 455        so we record that in the local stream and wait for it in the nonlocal one. */
 456     if (nb->bUseTwoStreams)
 457     {
 458         if (iloc == eintLocal)
 459         {
 460 #ifdef CL_VERSION_1_2
 461             cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_done));
 462 #else
 463             cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done));
 464 #endif
 465             assert(CL_SUCCESS == cl_error);
 466         }
 467         else
 468         {
 469             sync_ocl_event(stream, &(nb->misc_ops_done));
 470         }
 471     }
 472
 473     /* beginning of timed HtoD section */
 474
 475     /* HtoD x, q */
 476     ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
 477                        adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
 478
 479     if (plist->nsci == 0)
 480     {
 481         /* Don't launch an empty local kernel (is not allowed with OpenCL).
 482          * TODO: Separate H2D and kernel launch into separate functions.
 483          */
 484         return;
 485     }
 486
 487     /* beginning of timed nonbonded calculation section */
 488
 489     /* get the pointer to the kernel flavor we need to use */
 490     nb_kernel = select_nbnxn_kernel(nb,
 491                                     nbp->eeltype,
 492                                     nbp->vdwtype,
 493                                     bCalcEner,
 494                                     plist->bDoPrune || always_prune);
 495
 496     /* kernel launch config */
 497     local_work_size[0] = CL_SIZE;
 498     local_work_size[1] = CL_SIZE;
 499     local_work_size[2] = 1;
 500
 501     global_work_size[0] = plist->nsci * local_work_size[0];
 502     global_work_size[1] = 1 * local_work_size[1];
 503     global_work_size[2] = 1 * local_work_size[2];
 504
 505     validate_global_work_size(global_work_size, 3, nb->dev_info);
 506
 507     shmem     = calc_shmem_required();
 508
 509 #ifdef DEBUG_OCL
 510     {
 511         static int run_step = 1;
 512
 513         if (DEBUG_RUN_STEP == run_step)
 514         {
 515             debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
 516             debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
 517             assert(NULL != debug_buffer_h);
 518
 519             if (NULL == nb->debug_buffer)
 520             {
 521                 nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 522                                                   debug_buffer_size, debug_buffer_h, &cl_error);
 523
 524                 assert(CL_SUCCESS == cl_error);
 525             }
 526         }
 527
 528         run_step++;
 529     }
 530 #endif
 531     if (debug)
 532     {
 533         fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
 534                 "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 535                 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
 536                 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
 537                 NCL_PER_SUPERCL, plist->na_c);
 538     }
 539
 540     fillin_ocl_structures(nbp, &nbparams_params);
 541
 542     arg_no    = 0;
 543     cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
 544     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
 545     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
 546     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
 547     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 548     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
 549     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
 550     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
 551     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
 552     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
 553     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
 554     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
 555     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
 556     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
 557     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
 558     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
 559     cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
 560     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
 561
 562     assert(cl_error == CL_SUCCESS);
 563
 564     if (cl_error)
 565     {
 566         printf("ClERROR! %d\n", cl_error);
 567     }
 568     cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
 569     assert(cl_error == CL_SUCCESS);
 570
 571 #ifdef DEBUG_OCL
 572     {
 573         static int run_step = 1;
 574
 575         if (DEBUG_RUN_STEP == run_step)
 576         {
 577             FILE *pf;
 578             char  file_name[256] = {0};
 579
 580             ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
 581                                debug_buffer_size, stream, NULL);
 582
 583             // Make sure all data has been transfered back from device
 584             clFinish(stream);
 585
 586             printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
 587
 588             sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
 589             pf = fopen(file_name, "wt");
 590             assert(pf != NULL);
 591
 592             fprintf(pf, "%20s", "");
 593             for (int j = 0; j < global_work_size[0]; j++)
 594             {
 595                 char label[20];
 596                 sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
 597                 fprintf(pf, "%20s", label);
 598             }
 599
 600             for (int i = 0; i < global_work_size[1]; i++)
 601             {
 602                 char label[20];
 603                 sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
 604                 fprintf(pf, "\n%20s", label);
 605
 606                 for (int j = 0; j < global_work_size[0]; j++)
 607                 {
 608                     fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
 609                 }
 610
 611                 //fprintf(pf, "\n");
 612             }
 613
 614             fclose(pf);
 615
 616             printf(" done.\n");
 617
 618
 619             free(debug_buffer_h);
 620             debug_buffer_h = NULL;
 621         }
 622
 623         run_step++;
 624     }
 625 #endif
 626 }
 627
 628 /*! \brief Debugging helper function */
 629 void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
 630 {
 631     FILE *pf;
 632
 633     pf = fopen(out_file, "wt");
 634     assert(pf != NULL);
 635
 636     fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
 637             "cj[0]", "cj[1]", "cj[2]", "cj[3]",
 638             "imei[0].excl_ind", "imei[0].imask",
 639             "imei[1].excl_ind", "imei[1].imask");
 640
 641     for (int index = 0; index < cnt; index++)
 642     {
 643         fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
 644                 results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
 645                 results[index].imei[0].excl_ind, results[index].imei[0].imask,
 646                 results[index].imei[1].excl_ind, results[index].imei[1].imask);
 647     }
 648
 649     fclose(pf);
 650
 651     printf("\nWrote results to %s", out_file);
 652
 653     pf = fopen(ref_file, "rt");
 654     if (pf)
 655     {
 656         char c;
 657         int  diff = 0;
 658         printf("\n%s file found. Comparing results...", ref_file);
 659
 660         /* Skip the first line */
 661         c = 0;
 662         while (c != '\n')
 663         {
 664             if (1 != fscanf(pf, "%c", &c))
 665             {
 666                 break;
 667             }
 668         }
 669
 670         for (int index = 0; index < cnt; index++)
 671         {
 672             int          ref_val;
 673             unsigned int u_ref_val;
 674
 675             for (int j = 0; j < 4; j++)
 676             {
 677                 if (1 != fscanf(pf, "%20d", &ref_val))
 678                 {
 679                     break;
 680                 }
 681
 682                 if (ref_val != results[index].cj[j])
 683                 {
 684                     printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
 685                            j, index, results[index].cj[j], ref_val);
 686
 687                     diff++;
 688                 }
 689             }
 690
 691             for (int j = 0; j < 2; j++)
 692             {
 693                 if (1 != fscanf(pf, "%20d", &ref_val))
 694                 {
 695                     break;
 696                 }
 697
 698                 if (ref_val != results[index].imei[j].excl_ind)
 699                 {
 700                     printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
 701                            j, index, results[index].imei[j].excl_ind, ref_val);
 702
 703                     diff++;
 704                 }
 705
 706                 if (1 != fscanf(pf, "%20u", &u_ref_val))
 707                 {
 708                     break;
 709                 }
 710
 711                 if (u_ref_val != results[index].imei[j].imask)
 712                 {
 713                     printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
 714                            j, index, results[index].imei[j].imask, u_ref_val);
 715
 716                     diff++;
 717                 }
 718
 719             }
 720         }
 721
 722         printf("\nFinished comparing results. Total number of differences: %d", diff);
 723         fclose(pf);
 724     }
 725     else
 726     {
 727         printf("\n%s file not found. No comparison performed.", ref_file);
 728     }
 729 }
 730
 731 /*! \brief Debugging helper function */
 732 void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
 733 {
 734     FILE *pf;
 735     float cmp_eps = 0.001f;
 736
 737     pf = fopen(out_file, "wt");
 738     assert(pf != NULL);
 739
 740     for (int index = 0; index < cnt; index++)
 741     {
 742         fprintf(pf, "%15.5f\n", results[index]);
 743     }
 744
 745     fclose(pf);
 746
 747     printf("\nWrote results to %s", out_file);
 748
 749     pf = fopen(ref_file, "rt");
 750     if (pf)
 751     {
 752         int diff = 0;
 753         printf("\n%s file found. Comparing results...", ref_file);
 754         for (int index = 0; index < cnt; index++)
 755         {
 756             float ref_val;
 757             if (1 != fscanf(pf, "%20f", &ref_val))
 758             {
 759                 break;
 760             }
 761
 762             if (((ref_val - results[index]) > cmp_eps) ||
 763                 ((ref_val - results[index]) < -cmp_eps))
 764             {
 765                 printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
 766                        index, results[index], ref_val);
 767
 768                 diff++;
 769             }
 770         }
 771
 772         printf("\nFinished comparing results. Total number of differences: %d", diff);
 773         fclose(pf);
 774     }
 775     else
 776     {
 777         printf("\n%s file not found. No comparison performed.", ref_file);
 778     }
 779 }
 780
 781 /*! \brief
 782  * Debug function for dumping cj4, f and fshift buffers.
 783  * By default this function does nothing. To enable debugging for any of these
 784  * buffers, uncomment the corresponding definition inside the function:
 785  * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
 786  */
 787 static
 788 void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t               gmx_unused *nb,
 789                              const struct nbnxn_atomdata_t gmx_unused *nbatom,
 790                              cl_command_queue              gmx_unused  stream,
 791                              int                           gmx_unused  adat_begin,
 792                              int                           gmx_unused  adat_len)
 793 {
 794 /* Uncomment this define to enable cj4 debugging for the first kernel run */
 795 //#define DEBUG_DUMP_CJ4_OCL
 796 #ifdef DEBUG_DUMP_CJ4_OCL
 797     {
 798         static int run_step = 1;
 799
 800         if (DEBUG_RUN_STEP == run_step)
 801         {
 802             nbnxn_cj4_t *temp_cj4;
 803             int          cnt;
 804             size_t       size;
 805             char         ocl_file_name[256]  = {0};
 806             char         cuda_file_name[256] = {0};
 807
 808             cnt      = nb->plist[0]->ncj4;
 809             size     = cnt * sizeof(nbnxn_cj4_t);
 810             temp_cj4 = (nbnxn_cj4_t*)malloc(size);
 811
 812             ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
 813                                size, stream, NULL);
 814
 815             // Make sure all data has been transfered back from device
 816             clFinish(stream);
 817
 818             sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
 819             sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
 820             dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
 821
 822             free(temp_cj4);
 823         }
 824
 825         run_step++;
 826     }
 827 #endif
 828
 829 /* Uncomment this define to enable f debugging for the first kernel run */
 830 //#define DEBUG_DUMP_F_OCL
 831 #ifdef DEBUG_DUMP_F_OCL
 832     {
 833         static int run_step = 1;
 834
 835         if (DEBUG_RUN_STEP == run_step)
 836         {
 837             char ocl_file_name[256]  = {0};
 838             char cuda_file_name[256] = {0};
 839
 840             // Make sure all data has been transfered back from device
 841             clFinish(stream);
 842
 843             sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
 844             sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
 845
 846             dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
 847                                    ocl_file_name, cuda_file_name);
 848         }
 849
 850         run_step++;
 851     }
 852 #endif
 853
 854 /* Uncomment this define to enable fshift debugging for the first kernel run */
 855 //#define DEBUG_DUMP_FSHIFT_OCL
 856 #ifdef DEBUG_DUMP_FSHIFT_OCL
 857     {
 858         static int run_step = 1;
 859
 860         if (DEBUG_RUN_STEP == run_step)
 861         {
 862             char ocl_file_name[256]  = {0};
 863             char cuda_file_name[256] = {0};
 864
 865             // Make sure all data has been transfered back from device
 866             clFinish(stream);
 867
 868             sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
 869             sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
 870
 871             dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
 872                                    ocl_file_name, cuda_file_name);
 873         }
 874
 875         run_step++;
 876     }
 877 #endif
 878 }
 879
 880 /*! \brief
 881  * Launch asynchronously the download of nonbonded forces from the GPU
 882  * (and energies/shift forces if required).
 883  */
 884 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
 885                               const struct nbnxn_atomdata_t *nbatom,
 886                               int                            flags,
 887                               int                            aloc)
 888 {
 889     cl_int gmx_unused cl_error;
 890     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 891     int               iloc = -1;
 892
 893     /* determine interaction locality from atom locality */
 894     if (LOCAL_A(aloc))
 895     {
 896         iloc = eintLocal;
 897     }
 898     else if (NONLOCAL_A(aloc))
 899     {
 900         iloc = eintNonlocal;
 901     }
 902     else
 903     {
 904         char stmp[STRLEN];
 905         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 906                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 907
 908         gmx_incons(stmp);
 909     }
 910
 911     cl_atomdata_t   *adat    = nb->atdat;
 912     cl_timers_t     *t       = nb->timers;
 913     bool             bDoTime = nb->bDoTime;
 914     cl_command_queue stream  = nb->stream[iloc];
 915
 916     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 917     int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
 918
 919
 920     /* don't launch non-local copy-back if there was no non-local work to do */
 921     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 922     {
 923         return;
 924     }
 925
 926     /* calculate the atom data index range based on locality */
 927     if (LOCAL_A(aloc))
 928     {
 929         adat_begin  = 0;
 930         adat_len    = adat->natoms_local;
 931     }
 932     else
 933     {
 934         adat_begin  = adat->natoms_local;
 935         adat_len    = adat->natoms - adat->natoms_local;
 936     }
 937
 938     /* beginning of timed D2H section */
 939
 940     /* With DD the local D2H transfer can only start after the non-local
 941        has been launched. */
 942     if (iloc == eintLocal && nb->bUseTwoStreams)
 943     {
 944         sync_ocl_event(stream, &(nb->nonlocal_done));
 945     }
 946
 947     /* DtoH f */
 948     ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
 949                        (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
 950
 951     /* After the non-local D2H is launched the nonlocal_done event can be
 952        recorded which signals that the local D2H can proceed. This event is not
 953        placed after the non-local kernel because we first need the non-local
 954        data back first. */
 955     if (iloc == eintNonlocal)
 956     {
 957 #ifdef CL_VERSION_1_2
 958         cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
 959 #else
 960         cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
 961 #endif
 962         assert(CL_SUCCESS == cl_error);
 963     }
 964
 965     /* only transfer energies in the local stream */
 966     if (LOCAL_I(iloc))
 967     {
 968         /* DtoH fshift */
 969         if (bCalcFshift)
 970         {
 971             ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
 972                                SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
 973         }
 974
 975         /* DtoH energies */
 976         if (bCalcEner)
 977         {
 978             ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
 979                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
 980
 981             ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
 982                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
 983         }
 984     }
 985
 986     debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
 987 }
 988
 989 /*! \brief
 990  * Wait for the asynchronously launched nonbonded calculations and data
 991  * transfers to finish.
 992  */
 993 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
 994                             const nbnxn_atomdata_t gmx_unused *nbatom,
 995                             int flags, int aloc,
 996                             real *e_lj, real *e_el, rvec *fshift)
 997 {
 998     /* NOTE:  only implemented for single-precision at this time */
 999     cl_int gmx_unused      cl_error;
1000     int                    i, iloc = -1;
1001
1002     /* determine interaction locality from atom locality */
1003     if (LOCAL_A(aloc))
1004     {
1005         iloc = eintLocal;
1006     }
1007     else if (NONLOCAL_A(aloc))
1008     {
1009         iloc = eintNonlocal;
1010     }
1011     else
1012     {
1013         char stmp[STRLEN];
1014         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
1015                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
1016         gmx_incons(stmp);
1017     }
1018
1019     cl_plist_t                 *plist    = nb->plist[iloc];
1020     cl_timers_t                *timers   = nb->timers;
1021     struct gmx_wallclock_gpu_t *timings  = nb->timings;
1022     cl_nb_staging               nbst     = nb->nbst;
1023
1024     bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
1025     int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;
1026
1027     /* turn energy calculation always on/off (for debugging/testing only) */
1028     bCalcEner = (bCalcEner || always_ener) && !never_ener;
1029
1030     /* Launch wait/update timers & counters, unless doing the non-local phase
1031        when there is not actually work to do. This is consistent with
1032        nbnxn_gpu_launch_kernel.
1033
1034        NOTE: if timing with multiple GPUs (streams) becomes possible, the
1035        counters could end up being inconsistent due to not being incremented
1036        on some of the nodes! */
1037     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
1038     {
1039         return;
1040     }
1041
1042     /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */
1043     cl_error = clFinish(nb->stream[iloc]);
1044     assert(CL_SUCCESS == cl_error);
1045
1046     /* timing data accumulation */
1047     if (nb->bDoTime)
1048     {
1049         /* only increase counter once (at local F wait) */
1050         if (LOCAL_I(iloc))
1051         {
1052             timings->nb_c++;
1053             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
1054         }
1055
1056         /* kernel timings */
1057
1058         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
1059             ocl_event_elapsed_ms(timers->nb_k + iloc);
1060
1061         /* X/q H2D and F D2H timings */
1062         timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
1063         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
1064         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
1065         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
1066         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
1067
1068         /* only count atdat and pair-list H2D at pair-search step */
1069         if (plist->bDoPrune)
1070         {
1071             /* atdat transfer timing (add only once, at local F wait) */
1072             if (LOCAL_A(aloc))
1073             {
1074                 timings->pl_h2d_c++;
1075                 timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
1076             }
1077
1078             timings->pl_h2d_t +=
1079                 ocl_event_elapsed_ms(timers->pl_h2d_sci     + iloc) +
1080                 ocl_event_elapsed_ms(timers->pl_h2d_cj4     + iloc) +
1081                 ocl_event_elapsed_ms(timers->pl_h2d_excl    + iloc);
1082
1083         }
1084     }
1085
1086     /* add up energies and shift forces (only once at local F wait) */
1087     if (LOCAL_I(iloc))
1088     {
1089         if (bCalcEner)
1090         {
1091             *e_lj += *nbst.e_lj;
1092             *e_el += *nbst.e_el;
1093         }
1094
1095         if (bCalcFshift)
1096         {
1097             for (i = 0; i < SHIFTS; i++)
1098             {
1099                 fshift[i][0] += (nbst.fshift)[i][0];
1100                 fshift[i][1] += (nbst.fshift)[i][1];
1101                 fshift[i][2] += (nbst.fshift)[i][2];
1102             }
1103         }
1104     }
1105
1106     /* turn off pruning (doesn't matter if this is pair-search step or not) */
1107     plist->bDoPrune = false;
1108
1109 }
1110
1111 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
1112 int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
1113 {
1114     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
1115     int  kernel_type;
1116
1117     /* Benchmarking/development environment variables to force the use of
1118        analytical or tabulated Ewald kernel. */
1119     bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
1120     bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
1121
1122     if (bForceAnalyticalEwald && bForceTabulatedEwald)
1123     {
1124         gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
1125                    "requested through environment variables.");
1126     }
1127
1128     /* CUDA: By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
1129     /* OpenCL: By default, use analytical Ewald, on earlier tabulated. */
1130     // TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
1131     //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1132     if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1133     {
1134         bUseAnalyticalEwald = true;
1135
1136         if (debug)
1137         {
1138             fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
1139         }
1140     }
1141     else
1142     {
1143         bUseAnalyticalEwald = false;
1144
1145         if (debug)
1146         {
1147             fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
1148         }
1149     }
1150
1151     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
1152        forces it (use it for debugging/benchmarking only). */
1153     if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
1154     {
1155         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
1156     }
1157     else
1158     {
1159         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
1160     }
1161
1162     return kernel_type;
1163 }