src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \ingroup module_mdlib
  42  */
  43 #include "gmxpre.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdlib.h>
  49
  50 #if defined(_MSVC)
  51 #include <limits>
  52 #endif
  53
  54 #include "gromacs/gmxlib/ocl_tools/oclutils.h"
  55 #include "gromacs/legacyheaders/types/force_flags.h"
  56 #include "gromacs/legacyheaders/types/hw_info.h"
  57 #include "gromacs/legacyheaders/types/simple.h"
  58 #include "gromacs/mdlib/nb_verlet.h"
  59 #include "gromacs/mdlib/nbnxn_consts.h"
  60 #include "gromacs/mdlib/nbnxn_pairlist.h"
  61 #include "gromacs/timing/gpu_timing.h"
  62
  63 #ifdef TMPI_ATOMICS
  64 #include "thread_mpi/atomic.h"
  65 #endif
  66
  67 #include "gromacs/mdlib/nbnxn_gpu.h"
  68 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  69 #include "gromacs/pbcutil/ishift.h"
  70 #include "gromacs/utility/cstringutil.h"
  71 #include "gromacs/utility/fatalerror.h"
  72
  73 #include "nbnxn_ocl_types.h"
  74
  75 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  76 #define USE_TEXOBJ
  77 #endif
  78
  79 /*! \brief Convenience defines */
  80 //@{
  81 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  82 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  83 //@}
  84
  85 /*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
  86 //@{
  87 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
  88 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
  89 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
  90 //@}
  91
  92 /* Uncomment this define to enable kernel debugging */
  93 //#define DEBUG_OCL
  94
  95 /*! \brief Specifies which kernel run to debug */
  96 #define DEBUG_RUN_STEP 2
  97
  98 /*! \brief Validates the input global work size parameter.
  99  */
 100 static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
 101 {
 102     cl_uint device_size_t_size_bits;
 103     cl_uint host_size_t_size_bits;
 104
 105     assert(dinfo);
 106
 107     /* Each component of a global_work_size must not exceed the range given by the
 108        sizeof(device size_t) for the device on which the kernel execution will
 109        be enqueued. See:
 110        https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 111      */
 112     device_size_t_size_bits = dinfo->adress_bits;
 113     host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
 114
 115     /* If sizeof(host size_t) <= sizeof(device size_t)
 116             => global_work_size components will always be valid
 117        else
 118             => get device limit for global work size and
 119             compare it against each component of global_work_size.
 120      */
 121     if (host_size_t_size_bits > device_size_t_size_bits)
 122     {
 123         size_t device_limit;
 124
 125         device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
 126
 127         for (int i = 0; i < work_dim; i++)
 128         {
 129             if (global_work_size[i] > device_limit)
 130             {
 131                 gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 132                           "The number of nonbonded work units (=number of super-clusters) exceeds the"
 133                           "device capabilities. Global work size limit exceeded (%d > %d)!",
 134                           global_work_size[i], device_limit);
 135             }
 136         }
 137     }
 138 }
 139
 140 /* Constant arrays listing non-bonded kernel function names. The arrays are
 141  * organized in 2-dim arrays by: electrostatics and VDW type.
 142  *
 143  *  Note that the row- and column-order of function pointers has to match the
 144  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 145  *  defined in nbnxn_cuda_types.h.
 146  */
 147
 148 /*! \brief Force-only kernel function names. */
 149 static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
 150 {
 151     { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
 152     { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
 153     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
 154     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
 155     { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
 156     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
 157 };
 158
 159 /*! \brief Force + energy kernel function pointers. */
 160 static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
 161 {
 162     { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"              },
 163     { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"               },
 164     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"          },
 165     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl"     },
 166     { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"               },
 167     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"        }
 168 };
 169
 170 /*! \brief Force + pruning kernel function pointers. */
 171 static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
 172 {
 173     { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"            },
 174     { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"             },
 175     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",         "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"        },
 176     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",  "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
 177     { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"             },
 178     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",       "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"      }
 179 };
 180
 181 /*! \brief Force + energy + pruning kernel function pointers. */
 182 static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
 183 {
 184     { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
 185     { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
 186     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
 187     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
 188     { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
 189     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
 190 };
 191
 192 /*! \brief Return a pointer to the kernel version to be executed at the current step.
 193  *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 194  *  found in the cache, it will be created and the cache will be updated.
 195  */
 196 static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
 197                                             int                eeltype,
 198                                             int                evdwtype,
 199                                             bool               bDoEne,
 200                                             bool               bDoPrune)
 201 {
 202     const char* kernel_name_to_run;
 203     cl_kernel  *kernel_ptr;
 204     cl_int      cl_error;
 205
 206     assert(eeltype < eelOclNR);
 207     assert(evdwtype < eelOclNR);
 208
 209     if (bDoEne)
 210     {
 211         if (bDoPrune)
 212         {
 213             kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 214             kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
 215         }
 216         else
 217         {
 218             kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 219             kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
 220         }
 221     }
 222     else
 223     {
 224         if (bDoPrune)
 225         {
 226             kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 227             kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
 228         }
 229         else
 230         {
 231             kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 232             kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
 233         }
 234     }
 235
 236     if (NULL == kernel_ptr[0])
 237     {
 238         *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
 239         assert(cl_error == CL_SUCCESS);
 240     }
 241     // TODO: handle errors
 242
 243     return *kernel_ptr;
 244 }
 245
 246 /*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
 247  */
 248 static inline int calc_shmem_required()
 249 {
 250     int shmem;
 251
 252     /* size of shmem (force-buffers/xq/atom type preloading) */
 253     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 254     /* i-atom x+q in shared memory */
 255     //shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 256     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
 257     /* cj in shared memory, for both warps separately */
 258     shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);       /* cjs  */
 259 #ifdef IATYPE_SHMEM                                         // CUDA ARCH >= 300
 260     /* i-atom types in shared memory */
 261     #pragma error "Should not be defined"
 262     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);       /* atib */
 263 #endif
 264     /* force reduction buffers in shared memory */
 265     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
 266     /* Warp vote. In fact it must be * number of warps in block.. */
 267     shmem += sizeof(cl_uint) * 2;                   /* warp_any */
 268     return shmem;
 269 }
 270
 271 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 272  *
 273  *  The device can't use the same data structures as the host for two main reasons:
 274  *  - OpenCL restrictions (pointers are not accepted inside data structures)
 275  *  - some host side fields are not needed for the OpenCL kernels.
 276  */
 277 static void fillin_ocl_structures(cl_nbparam_t        *nbp,
 278                                   cl_nbparam_params_t *nbparams_params)
 279 {
 280     nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
 281     nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
 282     nbparams_params->c_rf              = nbp->c_rf;
 283     nbparams_params->dispersion_shift  = nbp->dispersion_shift;
 284     nbparams_params->eeltype           = nbp->eeltype;
 285     nbparams_params->epsfac            = nbp->epsfac;
 286     nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
 287     nbparams_params->ewald_beta        = nbp->ewald_beta;
 288     nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
 289     nbparams_params->repulsion_shift   = nbp->repulsion_shift;
 290     nbparams_params->rlist_sq          = nbp->rlist_sq;
 291     nbparams_params->rvdw_sq           = nbp->rvdw_sq;
 292     nbparams_params->rvdw_switch       = nbp->rvdw_switch;
 293     nbparams_params->sh_ewald          = nbp->sh_ewald;
 294     nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
 295     nbparams_params->two_k_rf          = nbp->two_k_rf;
 296     nbparams_params->vdwtype           = nbp->vdwtype;
 297     nbparams_params->vdw_switch        = nbp->vdw_switch;
 298 }
 299
 300 /*! \brief Waits for the commands associated with the input event to finish.
 301  * Then it releases the event and sets it to 0.
 302  * Don't use this function when more than one wait will be issued for the event.
 303  */
 304 void wait_ocl_event(cl_event *ocl_event)
 305 {
 306     cl_int gmx_unused cl_error;
 307
 308     /* Blocking wait for the event */
 309     cl_error = clWaitForEvents(1, ocl_event);
 310     assert(CL_SUCCESS == cl_error);
 311
 312     /* Release event and reset it to 0 */
 313     cl_error = clReleaseEvent(*ocl_event);
 314     assert(CL_SUCCESS == cl_error);
 315     *ocl_event = 0;
 316 }
 317
 318 /*! \brief Enqueues a wait for event completion.
 319  *
 320  * Then it releases the event and sets it to 0.
 321  * Don't use this function when more than one wait will be issued for the event.
 322  * Equivalent to Cuda Stream Sync. */
 323 void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 324 {
 325     cl_int gmx_unused cl_error;
 326
 327     /* Enqueue wait */
 328     cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
 329
 330     assert(CL_SUCCESS == cl_error);
 331
 332     /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
 333     cl_error = clReleaseEvent(*ocl_event);
 334     assert(CL_SUCCESS == cl_error);
 335     *ocl_event = 0;
 336 }
 337
 338 /*! \brief Returns the duration in miliseconds for the command associated with the event.
 339  *
 340  * It then releases the event and sets it to 0.
 341  * Before calling this function, make sure the command has finished either by
 342  * calling clFinish or clWaitForEvents.
 343  * The function returns 0.0 if the input event, *ocl_event, is 0.
 344  * Don't use this function when more than one wait will be issued for the event.
 345  */
 346 double ocl_event_elapsed_ms(cl_event *ocl_event)
 347 {
 348     cl_int gmx_unused cl_error;
 349     cl_ulong          start_ns, end_ns;
 350     double            elapsed_ms;
 351
 352     elapsed_ms = 0.0;
 353     assert(NULL != ocl_event);
 354
 355     if (*ocl_event)
 356     {
 357         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
 358                                            sizeof(cl_ulong), &start_ns, NULL);
 359         assert(CL_SUCCESS == cl_error);
 360
 361         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
 362                                            sizeof(cl_ulong), &end_ns, NULL);
 363         assert(CL_SUCCESS == cl_error);
 364
 365         clReleaseEvent(*ocl_event);
 366         *ocl_event = 0;
 367
 368         elapsed_ms = (end_ns - start_ns) / 1000000.0;
 369     }
 370
 371     return elapsed_ms;
 372 }
 373
 374 /*! \brief Launch GPU kernel
 375
 376    As we execute nonbonded workload in separate queues, before launching
 377    the kernel we need to make sure that he following operations have completed:
 378    - atomdata allocation and related H2D transfers (every nstlist step);
 379    - pair list H2D transfer (every nstlist step);
 380    - shift vector H2D transfer (every nstlist step);
 381    - force (+shift force and energy) output clearing (every step).
 382
 383    These operations are issued in the local queue at the beginning of the step
 384    and therefore always complete before the local kernel launch. The non-local
 385    kernel is launched after the local on the same device/context, so this is
 386    inherently scheduled after the operations in the local stream (including the
 387    above "misc_ops").
 388    However, for the sake of having a future-proof implementation, we use the
 389    misc_ops_done event to record the point in time when the above  operations
 390    are finished and synchronize with this event in the non-local stream.
 391  */
 392 void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
 393                              const struct nbnxn_atomdata_t *nbatom,
 394                              int                            flags,
 395                              int                            iloc)
 396 {
 397     cl_int               cl_error;
 398     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 399     /* OpenCL kernel launch-related stuff */
 400     int                  shmem;
 401     size_t               local_work_size[3], global_work_size[3];
 402     cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 403
 404     cl_atomdata_t       *adat    = nb->atdat;
 405     cl_nbparam_t        *nbp     = nb->nbparam;
 406     cl_plist_t          *plist   = nb->plist[iloc];
 407     cl_timers_t         *t       = nb->timers;
 408     cl_command_queue     stream  = nb->stream[iloc];
 409
 410     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 411     int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
 412     bool                 bDoTime     = nb->bDoTime;
 413     cl_uint              arg_no;
 414
 415     cl_nbparam_params_t  nbparams_params;
 416 #ifdef DEBUG_OCL
 417     float              * debug_buffer_h;
 418     size_t               debug_buffer_size;
 419 #endif
 420
 421     /* turn energy calculation always on/off (for debugging/testing only) */
 422     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 423
 424     /* Don't launch the non-local kernel if there is no work to do.
 425        Doing the same for the local kernel is more complicated, since the
 426        local part of the force array also depends on the non-local kernel.
 427        So to avoid complicating the code and to reduce the risk of bugs,
 428        we always call the local kernel, the local x+q copy and later (not in
 429        this function) the stream wait, local f copyback and the f buffer
 430        clearing. All these operations, except for the local interaction kernel,
 431        are needed for the non-local interactions. The skip of the local kernel
 432        call is taken care of later in this function. */
 433     if (iloc == eintNonlocal && plist->nsci == 0)
 434     {
 435         return;
 436     }
 437
 438     /* calculate the atom data index range based on locality */
 439     if (LOCAL_I(iloc))
 440     {
 441         adat_begin  = 0;
 442         adat_len    = adat->natoms_local;
 443     }
 444     else
 445     {
 446         adat_begin  = adat->natoms_local;
 447         adat_len    = adat->natoms - adat->natoms_local;
 448     }
 449
 450     /* When we get here all misc operations issues in the local stream are done,
 451        so we record that in the local stream and wait for it in the nonlocal one. */
 452     if (nb->bUseTwoStreams)
 453     {
 454         if (iloc == eintLocal)
 455         {
 456             cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done));
 457             assert(CL_SUCCESS == cl_error);
 458         }
 459         else
 460         {
 461             sync_ocl_event(stream, &(nb->misc_ops_done));
 462         }
 463     }
 464
 465     /* beginning of timed HtoD section */
 466
 467     /* HtoD x, q */
 468     ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
 469                        adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
 470
 471     if (plist->nsci == 0)
 472     {
 473         /* Don't launch an empty local kernel (is not allowed with OpenCL).
 474          * TODO: Separate H2D and kernel launch into separate functions.
 475          */
 476         return;
 477     }
 478
 479     /* beginning of timed nonbonded calculation section */
 480
 481     /* get the pointer to the kernel flavor we need to use */
 482     nb_kernel = select_nbnxn_kernel(nb,
 483                                     nbp->eeltype,
 484                                     nbp->vdwtype,
 485                                     bCalcEner,
 486                                     plist->bDoPrune || always_prune);
 487
 488     /* kernel launch config */
 489     local_work_size[0] = CL_SIZE;
 490     local_work_size[1] = CL_SIZE;
 491     local_work_size[2] = 1;
 492
 493     global_work_size[0] = plist->nsci * local_work_size[0];
 494     global_work_size[1] = 1 * local_work_size[1];
 495     global_work_size[2] = 1 * local_work_size[2];
 496
 497     validate_global_work_size(global_work_size, 3, nb->dev_info);
 498
 499     shmem     = calc_shmem_required();
 500
 501 #ifdef DEBUG_OCL
 502     {
 503         static int run_step = 1;
 504
 505         if (DEBUG_RUN_STEP == run_step)
 506         {
 507             debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
 508             debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
 509             assert(NULL != debug_buffer_h);
 510
 511             if (NULL == nb->debug_buffer)
 512             {
 513                 nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 514                                                   debug_buffer_size, debug_buffer_h, &cl_error);
 515
 516                 assert(CL_SUCCESS == cl_error);
 517             }
 518         }
 519
 520         run_step++;
 521     }
 522 #endif
 523     if (debug)
 524     {
 525         fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
 526                 "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 527                 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
 528                 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
 529                 NCL_PER_SUPERCL, plist->na_c);
 530     }
 531
 532     fillin_ocl_structures(nbp, &nbparams_params);
 533
 534     arg_no    = 0;
 535     cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
 536     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
 537     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
 538     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
 539     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 540     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
 541     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
 542     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
 543     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
 544     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
 545     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
 546     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
 547     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
 548     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
 549     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
 550     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
 551     cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
 552     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
 553
 554     assert(cl_error == CL_SUCCESS);
 555
 556     if (cl_error)
 557     {
 558         printf("ClERROR! %d\n", cl_error);
 559     }
 560     cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
 561     assert(cl_error == CL_SUCCESS);
 562
 563 #ifdef DEBUG_OCL
 564     {
 565         static int run_step = 1;
 566
 567         if (DEBUG_RUN_STEP == run_step)
 568         {
 569             FILE *pf;
 570             char  file_name[256] = {0};
 571
 572             ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
 573                                debug_buffer_size, stream, NULL);
 574
 575             // Make sure all data has been transfered back from device
 576             clFinish(stream);
 577
 578             printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
 579
 580             sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
 581             pf = fopen(file_name, "wt");
 582             assert(pf != NULL);
 583
 584             fprintf(pf, "%20s", "");
 585             for (int j = 0; j < global_work_size[0]; j++)
 586             {
 587                 char label[20];
 588                 sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
 589                 fprintf(pf, "%20s", label);
 590             }
 591
 592             for (int i = 0; i < global_work_size[1]; i++)
 593             {
 594                 char label[20];
 595                 sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
 596                 fprintf(pf, "\n%20s", label);
 597
 598                 for (int j = 0; j < global_work_size[0]; j++)
 599                 {
 600                     fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
 601                 }
 602
 603                 //fprintf(pf, "\n");
 604             }
 605
 606             fclose(pf);
 607
 608             printf(" done.\n");
 609
 610
 611             free(debug_buffer_h);
 612             debug_buffer_h = NULL;
 613         }
 614
 615         run_step++;
 616     }
 617 #endif
 618 }
 619
 620 /*! \brief Debugging helper function */
 621 void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
 622 {
 623     FILE *pf;
 624
 625     pf = fopen(out_file, "wt");
 626     assert(pf != NULL);
 627
 628     fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
 629             "cj[0]", "cj[1]", "cj[2]", "cj[3]",
 630             "imei[0].excl_ind", "imei[0].imask",
 631             "imei[1].excl_ind", "imei[1].imask");
 632
 633     for (int index = 0; index < cnt; index++)
 634     {
 635         fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
 636                 results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
 637                 results[index].imei[0].excl_ind, results[index].imei[0].imask,
 638                 results[index].imei[1].excl_ind, results[index].imei[1].imask);
 639     }
 640
 641     fclose(pf);
 642
 643     printf("\nWrote results to %s", out_file);
 644
 645     pf = fopen(ref_file, "rt");
 646     if (pf)
 647     {
 648         char c;
 649         int  diff = 0;
 650         printf("\n%s file found. Comparing results...", ref_file);
 651
 652         /* Skip the first line */
 653         c = 0;
 654         while (c != '\n')
 655         {
 656             if (1 != fscanf(pf, "%c", &c))
 657             {
 658                 break;
 659             }
 660         }
 661
 662         for (int index = 0; index < cnt; index++)
 663         {
 664             int          ref_val;
 665             unsigned int u_ref_val;
 666
 667             for (int j = 0; j < 4; j++)
 668             {
 669                 if (1 != fscanf(pf, "%20d", &ref_val))
 670                 {
 671                     break;
 672                 }
 673
 674                 if (ref_val != results[index].cj[j])
 675                 {
 676                     printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
 677                            j, index, results[index].cj[j], ref_val);
 678
 679                     diff++;
 680                 }
 681             }
 682
 683             for (int j = 0; j < 2; j++)
 684             {
 685                 if (1 != fscanf(pf, "%20d", &ref_val))
 686                 {
 687                     break;
 688                 }
 689
 690                 if (ref_val != results[index].imei[j].excl_ind)
 691                 {
 692                     printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
 693                            j, index, results[index].imei[j].excl_ind, ref_val);
 694
 695                     diff++;
 696                 }
 697
 698                 if (1 != fscanf(pf, "%20u", &u_ref_val))
 699                 {
 700                     break;
 701                 }
 702
 703                 if (u_ref_val != results[index].imei[j].imask)
 704                 {
 705                     printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
 706                            j, index, results[index].imei[j].imask, u_ref_val);
 707
 708                     diff++;
 709                 }
 710
 711             }
 712         }
 713
 714         printf("\nFinished comparing results. Total number of differences: %d", diff);
 715         fclose(pf);
 716     }
 717     else
 718     {
 719         printf("\n%s file not found. No comparison performed.", ref_file);
 720     }
 721 }
 722
 723 /*! \brief Debugging helper function */
 724 void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
 725 {
 726     FILE *pf;
 727     float cmp_eps = 0.001f;
 728
 729     pf = fopen(out_file, "wt");
 730     assert(pf != NULL);
 731
 732     for (int index = 0; index < cnt; index++)
 733     {
 734         fprintf(pf, "%15.5f\n", results[index]);
 735     }
 736
 737     fclose(pf);
 738
 739     printf("\nWrote results to %s", out_file);
 740
 741     pf = fopen(ref_file, "rt");
 742     if (pf)
 743     {
 744         int diff = 0;
 745         printf("\n%s file found. Comparing results...", ref_file);
 746         for (int index = 0; index < cnt; index++)
 747         {
 748             float ref_val;
 749             if (1 != fscanf(pf, "%20f", &ref_val))
 750             {
 751                 break;
 752             }
 753
 754             if (((ref_val - results[index]) > cmp_eps) ||
 755                 ((ref_val - results[index]) < -cmp_eps))
 756             {
 757                 printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
 758                        index, results[index], ref_val);
 759
 760                 diff++;
 761             }
 762         }
 763
 764         printf("\nFinished comparing results. Total number of differences: %d", diff);
 765         fclose(pf);
 766     }
 767     else
 768     {
 769         printf("\n%s file not found. No comparison performed.", ref_file);
 770     }
 771 }
 772
 773 /*! \brief
 774  * Debug function for dumping cj4, f and fshift buffers.
 775  * By default this function does nothing. To enable debugging for any of these
 776  * buffers, uncomment the corresponding definition inside the function:
 777  * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
 778  */
 779 static
 780 void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t               gmx_unused *nb,
 781                              const struct nbnxn_atomdata_t gmx_unused *nbatom,
 782                              cl_command_queue              gmx_unused  stream,
 783                              int                           gmx_unused  adat_begin,
 784                              int                           gmx_unused  adat_len)
 785 {
 786 /* Uncomment this define to enable cj4 debugging for the first kernel run */
 787 //#define DEBUG_DUMP_CJ4_OCL
 788 #ifdef DEBUG_DUMP_CJ4_OCL
 789     {
 790         static int run_step = 1;
 791
 792         if (DEBUG_RUN_STEP == run_step)
 793         {
 794             nbnxn_cj4_t *temp_cj4;
 795             int          cnt;
 796             size_t       size;
 797             char         ocl_file_name[256]  = {0};
 798             char         cuda_file_name[256] = {0};
 799
 800             cnt      = nb->plist[0]->ncj4;
 801             size     = cnt * sizeof(nbnxn_cj4_t);
 802             temp_cj4 = (nbnxn_cj4_t*)malloc(size);
 803
 804             ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
 805                                size, stream, NULL);
 806
 807             // Make sure all data has been transfered back from device
 808             clFinish(stream);
 809
 810             sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
 811             sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
 812             dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
 813
 814             free(temp_cj4);
 815         }
 816
 817         run_step++;
 818     }
 819 #endif
 820
 821 /* Uncomment this define to enable f debugging for the first kernel run */
 822 //#define DEBUG_DUMP_F_OCL
 823 #ifdef DEBUG_DUMP_F_OCL
 824     {
 825         static int run_step = 1;
 826
 827         if (DEBUG_RUN_STEP == run_step)
 828         {
 829             char ocl_file_name[256]  = {0};
 830             char cuda_file_name[256] = {0};
 831
 832             // Make sure all data has been transfered back from device
 833             clFinish(stream);
 834
 835             sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
 836             sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
 837
 838             dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
 839                                    ocl_file_name, cuda_file_name);
 840         }
 841
 842         run_step++;
 843     }
 844 #endif
 845
 846 /* Uncomment this define to enable fshift debugging for the first kernel run */
 847 //#define DEBUG_DUMP_FSHIFT_OCL
 848 #ifdef DEBUG_DUMP_FSHIFT_OCL
 849     {
 850         static int run_step = 1;
 851
 852         if (DEBUG_RUN_STEP == run_step)
 853         {
 854             char ocl_file_name[256]  = {0};
 855             char cuda_file_name[256] = {0};
 856
 857             // Make sure all data has been transfered back from device
 858             clFinish(stream);
 859
 860             sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
 861             sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
 862
 863             dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
 864                                    ocl_file_name, cuda_file_name);
 865         }
 866
 867         run_step++;
 868     }
 869 #endif
 870 }
 871
 872 /*! \brief
 873  * Launch asynchronously the download of nonbonded forces from the GPU
 874  * (and energies/shift forces if required).
 875  */
 876 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
 877                               const struct nbnxn_atomdata_t *nbatom,
 878                               int                            flags,
 879                               int                            aloc)
 880 {
 881     cl_int gmx_unused cl_error;
 882     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 883     int               iloc = -1;
 884
 885     /* determine interaction locality from atom locality */
 886     if (LOCAL_A(aloc))
 887     {
 888         iloc = eintLocal;
 889     }
 890     else if (NONLOCAL_A(aloc))
 891     {
 892         iloc = eintNonlocal;
 893     }
 894     else
 895     {
 896         char stmp[STRLEN];
 897         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 898                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 899
 900         gmx_incons(stmp);
 901     }
 902
 903     cl_atomdata_t   *adat    = nb->atdat;
 904     cl_timers_t     *t       = nb->timers;
 905     bool             bDoTime = nb->bDoTime;
 906     cl_command_queue stream  = nb->stream[iloc];
 907
 908     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 909     int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
 910
 911
 912     /* don't launch non-local copy-back if there was no non-local work to do */
 913     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 914     {
 915         return;
 916     }
 917
 918     /* calculate the atom data index range based on locality */
 919     if (LOCAL_A(aloc))
 920     {
 921         adat_begin  = 0;
 922         adat_len    = adat->natoms_local;
 923     }
 924     else
 925     {
 926         adat_begin  = adat->natoms_local;
 927         adat_len    = adat->natoms - adat->natoms_local;
 928     }
 929
 930     /* beginning of timed D2H section */
 931
 932     /* With DD the local D2H transfer can only start after the non-local
 933        has been launched. */
 934     if (iloc == eintLocal && nb->bUseTwoStreams)
 935     {
 936         sync_ocl_event(stream, &(nb->nonlocal_done));
 937     }
 938
 939     /* DtoH f */
 940     ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
 941                        (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
 942
 943     /* After the non-local D2H is launched the nonlocal_done event can be
 944        recorded which signals that the local D2H can proceed. This event is not
 945        placed after the non-local kernel because we first need the non-local
 946        data back first. */
 947     if (iloc == eintNonlocal)
 948     {
 949         cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
 950         assert(CL_SUCCESS == cl_error);
 951     }
 952
 953     /* only transfer energies in the local stream */
 954     if (LOCAL_I(iloc))
 955     {
 956         /* DtoH fshift */
 957         if (bCalcFshift)
 958         {
 959             ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
 960                                SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
 961         }
 962
 963         /* DtoH energies */
 964         if (bCalcEner)
 965         {
 966             ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
 967                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
 968
 969             ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
 970                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
 971         }
 972     }
 973
 974     debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
 975 }
 976
 977 /*! \brief
 978  * Wait for the asynchronously launched nonbonded calculations and data
 979  * transfers to finish.
 980  */
 981 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
 982                             const nbnxn_atomdata_t gmx_unused *nbatom,
 983                             int flags, int aloc,
 984                             real *e_lj, real *e_el, rvec *fshift)
 985 {
 986     /* NOTE:  only implemented for single-precision at this time */
 987     cl_int gmx_unused      cl_error;
 988     int                    i, iloc = -1;
 989
 990     /* determine interaction locality from atom locality */
 991     if (LOCAL_A(aloc))
 992     {
 993         iloc = eintLocal;
 994     }
 995     else if (NONLOCAL_A(aloc))
 996     {
 997         iloc = eintNonlocal;
 998     }
 999     else
1000     {
1001         char stmp[STRLEN];
1002         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
1003                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
1004         gmx_incons(stmp);
1005     }
1006
1007     cl_plist_t                 *plist    = nb->plist[iloc];
1008     cl_timers_t                *timers   = nb->timers;
1009     struct gmx_wallclock_gpu_t *timings  = nb->timings;
1010     cl_nb_staging               nbst     = nb->nbst;
1011
1012     bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
1013     int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;
1014
1015     /* turn energy calculation always on/off (for debugging/testing only) */
1016     bCalcEner = (bCalcEner || always_ener) && !never_ener;
1017
1018     /* Launch wait/update timers & counters, unless doing the non-local phase
1019        when there is not actually work to do. This is consistent with
1020        nbnxn_gpu_launch_kernel.
1021
1022        NOTE: if timing with multiple GPUs (streams) becomes possible, the
1023        counters could end up being inconsistent due to not being incremented
1024        on some of the nodes! */
1025     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
1026     {
1027         return;
1028     }
1029
1030     /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */
1031     cl_error = clFinish(nb->stream[iloc]);
1032     assert(CL_SUCCESS == cl_error);
1033
1034     /* timing data accumulation */
1035     if (nb->bDoTime)
1036     {
1037         /* only increase counter once (at local F wait) */
1038         if (LOCAL_I(iloc))
1039         {
1040             timings->nb_c++;
1041             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
1042         }
1043
1044         /* kernel timings */
1045
1046         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
1047             ocl_event_elapsed_ms(timers->nb_k + iloc);
1048
1049         /* X/q H2D and F D2H timings */
1050         timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
1051         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
1052         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
1053         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
1054         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
1055
1056         /* only count atdat and pair-list H2D at pair-search step */
1057         if (plist->bDoPrune)
1058         {
1059             /* atdat transfer timing (add only once, at local F wait) */
1060             if (LOCAL_A(aloc))
1061             {
1062                 timings->pl_h2d_c++;
1063                 timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
1064             }
1065
1066             timings->pl_h2d_t +=
1067                 ocl_event_elapsed_ms(timers->pl_h2d_sci     + iloc) +
1068                 ocl_event_elapsed_ms(timers->pl_h2d_cj4     + iloc) +
1069                 ocl_event_elapsed_ms(timers->pl_h2d_excl    + iloc);
1070
1071         }
1072     }
1073
1074     /* add up energies and shift forces (only once at local F wait) */
1075     if (LOCAL_I(iloc))
1076     {
1077         if (bCalcEner)
1078         {
1079             *e_lj += *nbst.e_lj;
1080             *e_el += *nbst.e_el;
1081         }
1082
1083         if (bCalcFshift)
1084         {
1085             for (i = 0; i < SHIFTS; i++)
1086             {
1087                 fshift[i][0] += (nbst.fshift)[i][0];
1088                 fshift[i][1] += (nbst.fshift)[i][1];
1089                 fshift[i][2] += (nbst.fshift)[i][2];
1090             }
1091         }
1092     }
1093
1094     /* turn off pruning (doesn't matter if this is pair-search step or not) */
1095     plist->bDoPrune = false;
1096
1097 }
1098
1099 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
1100 int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
1101 {
1102     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
1103     int  kernel_type;
1104
1105     /* Benchmarking/development environment variables to force the use of
1106        analytical or tabulated Ewald kernel. */
1107     bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
1108     bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
1109
1110     if (bForceAnalyticalEwald && bForceTabulatedEwald)
1111     {
1112         gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
1113                    "requested through environment variables.");
1114     }
1115
1116     /* CUDA: By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
1117     /* OpenCL: By default, use analytical Ewald, on earlier tabulated. */
1118     // TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
1119     //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1120     if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1121     {
1122         bUseAnalyticalEwald = true;
1123
1124         if (debug)
1125         {
1126             fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
1127         }
1128     }
1129     else
1130     {
1131         bUseAnalyticalEwald = false;
1132
1133         if (debug)
1134         {
1135             fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
1136         }
1137     }
1138
1139     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
1140        forces it (use it for debugging/benchmarking only). */
1141     if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
1142     {
1143         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
1144     }
1145     else
1146     {
1147         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
1148     }
1149
1150     return kernel_type;
1151 }