Fix random typos

[alexxy/gromacs.git] / src / gromacs / nbnxm / opencl / nbnxm_ocl.cpp
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index b8107fc857de911e79f6b48714301118a2a1aba5..b4b28c06526441dbc7f2b5e162a4a1d56bd4bb13 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -81,7 +81,6 @@
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/nbnxm/nbnxm_gpu.h"
  #include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
  #include "gromacs/timing/gpu_timing.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/fatalerror.h"
@@ -489,107 +488,6 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams
      nbparams_params->vdw_switch        = nbp->vdw_switch;
  }
  
-void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
-{
-    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
-
-    /* When we get here all misc operations issued in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one.
-       This wait needs to precede any PP tasks, bonded or nonbonded, that may
-       compute on interactions between local and nonlocal atoms.
-     */
-    if (nb->bUseTwoStreams)
-    {
-        if (interactionLocality == InteractionLocality::Local)
-        {
-            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream());
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
-        }
-    }
-}
-
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
-    NBAtomData*         adat         = nb->atdat;
-    gpu_plist*          plist        = nb->plist[iloc];
-    Nbnxm::GpuTimers*   timers       = nb->timers;
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
-    bool bDoTime = nb->bDoTime;
-
-    /* Don't launch the non-local H2D copy if there is no dependent
-       work to do: neither non-local nor other (e.g. bonded) work
-       to do that has as input the nbnxn coordinates.
-       Doing the same for the local kernel is more complicated, since the
-       local part of the force array also depends on the non-local kernel.
-       So to avoid complicating the code and to reduce the risk of bugs,
-       we always call the local x+q copy (and the rest of the local
-       work in nbnxn_gpu_launch_kernel()).
-     */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        plist->haveFreshList = false;
-
-        // The event is marked for Local interactions unconditionally,
-        // so it has to be released here because of the early return
-        // for NonLocal interactions.
-        nb->misc_ops_and_local_H2D_done.reset();
-
-        return;
-    }
-
-    /* local/nonlocal offset and length used for xq and f */
-    auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
-    /* beginning of timed HtoD section */
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
-    }
-
-    /* HtoD x, q */
-    static_assert(sizeof(float) == sizeof(*nbatom->x().data()),
-                  "The size of the xyzq buffer element should be equal to the size of float4.");
-    copyToDeviceBuffer(&adat->xq,
-                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
-                       atomsRange.begin(),
-                       atomsRange.size(),
-                       deviceStream,
-                       GpuApiCallBehavior::Async,
-                       bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
-
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
-    }
-
-    /* When we get here all misc operations issued in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one.
-       This wait needs to precede any PP tasks, bonded or nonbonded, that may
-       compute on interactions between local and nonlocal atoms.
-     */
-    nbnxnInsertNonlocalGpuDependency(nb, iloc);
-}
-
-
  /*! \brief Launch GPU kernel
  
     As we execute nonbonded workload in separate queues, before launching
@@ -610,7 +508,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
   */
  void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
  {
-    NBAtomData*         adat         = nb->atdat;
+    NBAtomDataGpu*      adat         = nb->atdat;
      NBParamGpu*         nbp          = nb->nbparam;
      gpu_plist*          plist        = nb->plist[iloc];
      Nbnxm::GpuTimers*   timers       = nb->timers;
@@ -756,7 +654,8 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
   *  for OpenCL local memory.
   *
   * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd
- * dimension. \returns   the amount of local memory in bytes required by the pruning kernel
+ * dimension.
+ * \returns   the amount of local memory in bytes required by the pruning kernel
   */
  static inline int calc_shmem_required_prune(const int num_threads_z)
  {
@@ -781,7 +680,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
   */
  void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
  {
-    NBAtomData*         adat         = nb->atdat;
+    NBAtomDataGpu*      adat         = nb->atdat;
      NBParamGpu*         nbp          = nb->nbparam;
      gpu_plist*          plist        = nb->plist[iloc];
      Nbnxm::GpuTimers*   timers       = nb->timers;
@@ -913,135 +812,4 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
      }
  }
  
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(NbnxmGpu*                nb,
-                        struct nbnxn_atomdata_t* nbatom,
-                        const gmx::StepWorkload& stepWork,
-                        const AtomLocality       atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    cl_int gmx_unused cl_error;
-
-    /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-    GMX_ASSERT(iloc == InteractionLocality::Local
-                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
-               "Non-local stream is indicating that the copy back event is enqueued at the "
-               "beginning of the copy back function.");
-
-    NBAtomData*         adat         = nb->atdat;
-    Nbnxm::GpuTimers*   timers       = nb->timers;
-    bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        /* TODO An alternative way to signal that non-local work is
-           complete is to use a clEnqueueMarker+clEnqueueBarrier
-           pair. However, the use of bNonLocalStreamDoneMarked has the
-           advantage of being local to the host, so probably minimizes
-           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
-           test case, overall simulation performance was higher with
-           the API calls, but this has not been tested on AMD OpenCL,
-           so could be worth considering in future. */
-        nb->bNonLocalStreamDoneMarked = false;
-        return;
-    }
-
-    /* local/nonlocal offset and length used for xq and f */
-    auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
-    /* beginning of timed D2H section */
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
-    }
-
-    /* With DD the local D2H transfer can only start after the non-local
-       has been launched. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
-    {
-        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = false;
-    }
-
-    /* DtoH f */
-    GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
-               "The host force buffer should be in single precision to match device data size.");
-    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
-                         &adat->f,
-                         atomsRange.begin(),
-                         atomsRange.size(),
-                         deviceStream,
-                         GpuApiCallBehavior::Async,
-                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-
-    /* kick off work */
-    cl_error = clFlush(deviceStream.stream());
-    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we need the non-local
-       data back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        nb->nonlocal_done.markEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = true;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            static_assert(
-                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
-                    "Sizes of host- and device-side shift vector elements should be the same.");
-            copyFromDeviceBuffer(nb->nbst.fShift,
-                                 &adat->fShift,
-                                 0,
-                                 SHIFTS,
-                                 deviceStream,
-                                 GpuApiCallBehavior::Async,
-                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
-                          "Sizes of host- and device-side LJ energy terms should be the same.");
-            copyFromDeviceBuffer(nb->nbst.eLJ,
-                                 &adat->eLJ,
-                                 0,
-                                 1,
-                                 deviceStream,
-                                 GpuApiCallBehavior::Async,
-                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
-                          "Sizes of host- and device-side electrostatic energy terms should be the "
-                          "same.");
-            copyFromDeviceBuffer(nb->nbst.eElec,
-                                 &adat->eElec,
-                                 0,
-                                 1,
-                                 deviceStream,
-                                 GpuApiCallBehavior::Async,
-                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
-    }
-}
-
  } // namespace Nbnxm