src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  Data management and kernel launch functions for nbnxm sycl.
  39  *
  40  *  \ingroup module_nbnxm
  41  */
  42 #include "gmxpre.h"
  43
  44 #include "gromacs/nbnxm/gpu_common.h"
  45 #include "gromacs/utility/exceptions.h"
  46
  47 #include "nbnxm_sycl_kernel.h"
  48 #include "nbnxm_sycl_kernel_pruneonly.h"
  49 #include "nbnxm_sycl_types.h"
  50
  51 namespace Nbnxm
  52 {
  53
  54 /*! \brief
  55  * Launch asynchronously the download of nonbonded forces from the GPU
  56  * (and energies/shift forces if required).
  57  */
  58 void gpu_launch_cpyback(NbnxmGpu*                nb,
  59                         struct nbnxn_atomdata_t* nbatom,
  60                         const gmx::StepWorkload& stepWork,
  61                         const AtomLocality       atomLocality)
  62 {
  63     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
  64
  65     const InteractionLocality iloc         = gpuAtomToInteractionLocality(atomLocality);
  66     const DeviceStream&       deviceStream = *nb->deviceStreams[iloc];
  67     sycl_atomdata_t*          adat         = nb->atdat;
  68
  69     /* don't launch non-local copy-back if there was no non-local work to do */
  70     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
  71     {
  72         nb->bNonLocalStreamActive = false;
  73         return;
  74     }
  75
  76     int adatBegin, adatLen;
  77     getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
  78
  79     // With DD the local D2H transfer can only start after the non-local kernel has finished.
  80     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
  81     {
  82         nb->nonlocal_done.waitForEvent();
  83     }
  84
  85     /* DtoH f
  86      * Skip if buffer ops / reduction is offloaded to the GPU.
  87      */
  88     if (!stepWork.useGpuFBufferOps)
  89     {
  90         GMX_ASSERT(adat->f.elementSize() == sizeof(float3),
  91                    "The size of the force buffer element should be equal to the size of float3.");
  92         copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adatBegin,
  93                              &adat->f,
  94                              adatBegin,
  95                              adatLen,
  96                              deviceStream,
  97                              GpuApiCallBehavior::Async,
  98                              nullptr);
  99     }
 100
 101     /* After the non-local D2H is launched the nonlocal_done event can be
 102        recorded which signals that the local D2H can proceed. This event is not
 103        placed after the non-local kernel because we want the non-local data
 104        back first. */
 105     if (iloc == InteractionLocality::NonLocal)
 106     {
 107         nb->nonlocal_done.markEvent(deviceStream);
 108         nb->bNonLocalStreamActive = true;
 109     }
 110
 111     /* only transfer energies in the local stream */
 112     if (iloc == InteractionLocality::Local)
 113     {
 114         /* DtoH fshift when virial is needed */
 115         if (stepWork.computeVirial)
 116         {
 117             GMX_ASSERT(sizeof(*nb->nbst.fshift) == adat->fShift.elementSize(),
 118                        "Sizes of host- and device-side shift vector elements should be the same.");
 119             copyFromDeviceBuffer(
 120                     nb->nbst.fshift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
 121         }
 122
 123         /* DtoH energies */
 124         if (stepWork.computeEnergy)
 125         {
 126             GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
 127                        "Sizes of host- and device-side LJ energy terms should be the same.");
 128             copyFromDeviceBuffer(
 129                     nb->nbst.e_lj, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
 130             GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
 131                        "Sizes of host- and device-side electrostatic energy terms should be the "
 132                        "same.");
 133             copyFromDeviceBuffer(
 134                     nb->nbst.e_el, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
 135         }
 136     }
 137 }
 138
 139 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 140 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 141 {
 142     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 143     validateGpuAtomLocality(atomLocality);
 144
 145     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 146
 147     sycl_atomdata_t*    adat         = nb->atdat;
 148     gpu_plist*          plist        = nb->plist[iloc];
 149     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 150
 151     /* Don't launch the non-local H2D copy if there is no dependent
 152        work to do: neither non-local nor other (e.g. bonded) work
 153        to do that has as input the nbnxn coordinates.
 154        Doing the same for the local kernel is more complicated, since the
 155        local part of the force array also depends on the non-local kernel.
 156        So to avoid complicating the code and to reduce the risk of bugs,
 157        we always call the local local x+q copy (and the rest of the local
 158        work in nbnxn_gpu_launch_kernel().
 159      */
 160     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
 161     {
 162         plist->haveFreshList = false;
 163         return;
 164     }
 165
 166     int adatBegin, adatLen;
 167     getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
 168
 169     /* HtoD x, q */
 170     GMX_ASSERT(adat->xq.elementSize() == sizeof(float4),
 171                "The size of the xyzq buffer element should be equal to the size of float4.");
 172     copyToDeviceBuffer(&adat->xq,
 173                        reinterpret_cast<const float4*>(nbatom->x().data()) + adatBegin,
 174                        adatBegin,
 175                        adatLen,
 176                        deviceStream,
 177                        GpuApiCallBehavior::Async,
 178                        nullptr);
 179
 180     /* No need to enforce stream synchronization with events like we do in CUDA/OpenCL.
 181      * Runtime should do the scheduling correctly based on data dependencies. */
 182 }
 183
 184 void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
 185 {
 186     gpu_plist* plist = nb->plist[iloc];
 187
 188     if (plist->haveFreshList)
 189     {
 190         GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
 191
 192         /* Set rollingPruningNumParts to signal that it is not set */
 193         plist->rollingPruningNumParts = 0;
 194         plist->rollingPruningPart     = 0;
 195     }
 196     else
 197     {
 198         if (plist->rollingPruningNumParts == 0)
 199         {
 200             plist->rollingPruningNumParts = numParts;
 201         }
 202         else
 203         {
 204             GMX_ASSERT(numParts == plist->rollingPruningNumParts,
 205                        "It is not allowed to change numParts in between list generation steps");
 206         }
 207     }
 208
 209     /* Use a local variable for part and update in plist, so we can return here
 210      * without duplicating the part increment code.
 211      */
 212     const int part = plist->rollingPruningPart;
 213
 214     plist->rollingPruningPart++;
 215     if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
 216     {
 217         plist->rollingPruningPart = 0;
 218     }
 219
 220     /* Compute the number of list entries to prune in this pass */
 221     const int numSciInPart = (plist->nsci - part) / numParts;
 222
 223     /* Don't launch the kernel if there is no work to do */
 224     if (numSciInPart <= 0)
 225     {
 226         plist->haveFreshList = false;
 227         return;
 228     }
 229
 230     launchNbnxmKernelPruneOnly(nb, iloc, numParts, part, numSciInPart);
 231
 232     if (plist->haveFreshList)
 233     {
 234         plist->haveFreshList = false;
 235         nb->didPrune[iloc]   = true; // Mark that pruning has been done
 236     }
 237     else
 238     {
 239         nb->didRollingPrune[iloc] = true; // Mark that rolling pruning has been done
 240     }
 241 }
 242
 243 void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
 244 {
 245     const NBParamGpu* nbp   = nb->nbparam;
 246     gpu_plist*        plist = nb->plist[iloc];
 247
 248     if (canSkipNonbondedWork(*nb, iloc))
 249     {
 250         plist->haveFreshList = false;
 251         return;
 252     }
 253
 254     if (nbp->useDynamicPruning && plist->haveFreshList)
 255     {
 256         // Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
 257         gpu_launch_kernel_pruneonly(nb, iloc, 1);
 258     }
 259
 260     if (plist->nsci == 0)
 261     {
 262         /* Don't launch an empty local kernel */
 263         return;
 264     }
 265
 266     launchNbnxmKernel(nb, stepWork, iloc);
 267 }
 268
 269 } // namespace Nbnxm