}
}
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
- int nalloc, natoms;
- bool realloced;
- bool bDoTime = nb->bDoTime;
- Nbnxm::GpuTimers* timers = nb->timers;
- NBAtomData* d_atdat = nb->atdat;
- const DeviceContext& deviceContext = *nb->deviceContext_;
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-
- natoms = nbat->numAtoms();
- realloced = false;
-
- if (bDoTime)
- {
- /* time async copy */
- timers->atdat.openTimingRegion(localStream);
- }
-
- /* need to reallocate if we have to copy more atoms than the amount of space
- available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
- if (natoms > d_atdat->numAtomsAlloc)
- {
- nalloc = over_alloc_small(natoms);
-
- /* free up first if the arrays have already been initialized */
- if (d_atdat->numAtomsAlloc != -1)
- {
- freeDeviceBuffer(&d_atdat->f);
- freeDeviceBuffer(&d_atdat->xq);
- freeDeviceBuffer(&d_atdat->atomTypes);
- freeDeviceBuffer(&d_atdat->ljComb);
- }
-
- allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
- allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
- if (useLjCombRule(nb->nbparam->vdwType))
- {
- allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
- }
- else
- {
- allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
- }
-
- d_atdat->numAtomsAlloc = nalloc;
- realloced = true;
- }
-
- d_atdat->numAtoms = natoms;
- d_atdat->numAtomsLocal = nbat->natoms_local;
-
- /* need to clear GPU f output if realloc happened */
- if (realloced)
- {
- nbnxn_cuda_clear_f(nb, nalloc);
- }
-
- if (useLjCombRule(nb->nbparam->vdwType))
- {
- static_assert(sizeof(d_atdat->ljComb[0]) == sizeof(Float2),
- "Size of the LJ parameters element should be equal to the size of float2.");
- copyToDeviceBuffer(&d_atdat->ljComb,
- reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
- 0,
- natoms,
- localStream,
- GpuApiCallBehavior::Async,
- nullptr);
- }
- else
- {
- static_assert(sizeof(d_atdat->atomTypes[0]) == sizeof(nbat->params().type[0]),
- "Sizes of host- and device-side atom types should be the same.");
- copyToDeviceBuffer(&d_atdat->atomTypes,
- nbat->params().type.data(),
- 0,
- natoms,
- localStream,
- GpuApiCallBehavior::Async,
- nullptr);
- }
-
- if (bDoTime)
- {
- timers->atdat.closeTimingRegion(localStream);
- }
-}
-
void gpu_free(NbnxmGpu* nb)
{
if (nb == nullptr)
namespace Nbnxm
{
+/*! \brief Issues a clFlush() on the queue of \p deviceStream in OpenCL builds.
+ *
+ * In builds where GMX_GPU_OPENCL is not set no flush is issued and the
+ * argument is only passed to GMX_UNUSED_VALUE.
+ *
+ * \param[in] deviceStream  Stream whose underlying queue is flushed.
+ *
+ * Raises gmx::InternalError through GMX_THROW when clFlush() does not
+ * return CL_SUCCESS.
+ */
+inline void issueClFlushInStream(const DeviceStream& deviceStream)
+{
+#if !GMX_GPU_OPENCL
+    GMX_UNUSED_VALUE(deviceStream);
+#else
+    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+     * in the stream after marking an event in it in order to be able to sync with
+     * the event from another stream.
+     */
+    const cl_int flushStatus = clFlush(deviceStream.stream());
+    if (flushStatus != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(flushStatus)));
+    }
+#endif
+}
+
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
NBParamGpu* nbp,
const DeviceContext& deviceContext)
d_plist->haveFreshList = true;
}
+/* Prepares the device-side atom data held in nb->atdat for the atoms in nbat.
+ *
+ * The f, xq and LJ-parameter buffers (ljComb or atomTypes, depending on
+ * useLjCombRule) are reallocated when nbat holds more atoms than currently
+ * allocated for, and a freshly allocated f buffer is cleared. The atom counts
+ * are then stored and the LJ combination parameters or the atom types are
+ * copied to the device asynchronously in the local stream. When nb->bDoTime
+ * is set, this stream work is enclosed in the atdat timing region.
+ */
+void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
+{
+    const bool           doTiming  = nb->bDoTime;
+    Nbnxm::GpuTimers*    gpuTimers = doTiming ? nb->timers : nullptr;
+    NBAtomData*          atomData  = nb->atdat;
+    const DeviceContext& context   = *nb->deviceContext_;
+    const DeviceStream&  stream    = *nb->deviceStreams[InteractionLocality::Local];
+
+    const int  atomCount   = nbat->numAtoms();
+    const bool useCombRule = useLjCombRule(nb->nbparam->vdwType);
+
+    if (doTiming)
+    {
+        /* time async copy */
+        gpuTimers->atdat.openTimingRegion(stream);
+    }
+
+    /* The buffers have to grow when more atoms must be stored than there is
+       room for; numAtomsAlloc == -1 marks buffers that were never allocated.
+       The condition is evaluated once, before numAtomsAlloc is updated. */
+    const bool mustReallocate = atomCount > atomData->numAtomsAlloc;
+    if (mustReallocate)
+    {
+        const int allocCount = over_alloc_small(atomCount);
+
+        /* release the previous buffers unless this is the first allocation */
+        if (atomData->numAtomsAlloc != -1)
+        {
+            freeDeviceBuffer(&atomData->f);
+            freeDeviceBuffer(&atomData->xq);
+            freeDeviceBuffer(&atomData->ljComb);
+            freeDeviceBuffer(&atomData->atomTypes);
+        }
+
+        allocateDeviceBuffer(&atomData->f, allocCount, context);
+        allocateDeviceBuffer(&atomData->xq, allocCount, context);
+
+        if (useCombRule)
+        {
+            // Two Lennard-Jones parameters per atom
+            allocateDeviceBuffer(&atomData->ljComb, allocCount, context);
+        }
+        else
+        {
+            allocateDeviceBuffer(&atomData->atomTypes, allocCount, context);
+        }
+
+        atomData->numAtomsAlloc = allocCount;
+    }
+
+    atomData->numAtoms      = atomCount;
+    atomData->numAtomsLocal = nbat->natoms_local;
+
+    /* a newly allocated force buffer has to be cleared on the device */
+    if (mustReallocate)
+    {
+        clearDeviceBufferAsync(&atomData->f, 0, atomData->numAtomsAlloc, stream);
+    }
+
+    if (useCombRule)
+    {
+        static_assert(
+                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
+                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
+        copyToDeviceBuffer(&atomData->ljComb,
+                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
+                           0,
+                           atomCount,
+                           stream,
+                           GpuApiCallBehavior::Async,
+                           doTiming ? gpuTimers->atdat.fetchNextEvent() : nullptr);
+    }
+    else
+    {
+        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
+                      "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&atomData->atomTypes,
+                           nbat->params().type.data(),
+                           0,
+                           atomCount,
+                           stream,
+                           GpuApiCallBehavior::Async,
+                           doTiming ? gpuTimers->atdat.fetchNextEvent() : nullptr);
+    }
+
+    if (doTiming)
+    {
+        gpuTimers->atdat.closeTimingRegion(stream);
+    }
+
+    /* kick off the tasks enqueued above to ensure concurrency with the search */
+    issueClFlushInStream(stream);
+}
+
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
}
-inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream)
-{
-#if GMX_GPU_OPENCL
- /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
- * in the stream after marking an event in it in order to be able to sync with
- * the event from another stream.
- */
- cl_int cl_error = clFlush(deviceStream.stream());
- if (cl_error != CL_SUCCESS)
- {
- GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
- }
-#endif
-}
-
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
}
}
-//! This function is documented in the header file
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
- cl_int cl_error;
- int nalloc, natoms;
- bool realloced;
- bool bDoTime = nb->bDoTime;
- Nbnxm::GpuTimers* timers = nb->timers;
- NBAtomData* d_atdat = nb->atdat;
- const DeviceContext& deviceContext = *nb->deviceContext_;
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-
- natoms = nbat->numAtoms();
- realloced = false;
-
- if (bDoTime)
- {
- /* time async copy */
- timers->atdat.openTimingRegion(localStream);
- }
-
- /* need to reallocate if we have to copy more atoms than the amount of space
- available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
- if (natoms > d_atdat->numAtomsAlloc)
- {
- nalloc = over_alloc_small(natoms);
-
- /* free up first if the arrays have already been initialized */
- if (d_atdat->numAtomsAlloc != -1)
- {
- freeDeviceBuffer(&d_atdat->f);
- freeDeviceBuffer(&d_atdat->xq);
- freeDeviceBuffer(&d_atdat->ljComb);
- freeDeviceBuffer(&d_atdat->atomTypes);
- }
-
-
- allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
- allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
-
- if (useLjCombRule(nb->nbparam->vdwType))
- {
- // Two Lennard-Jones parameters per atom
- allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
- }
- else
- {
- allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
- }
-
- d_atdat->numAtomsAlloc = nalloc;
- realloced = true;
- }
-
- d_atdat->numAtoms = natoms;
- d_atdat->numAtomsLocal = nbat->natoms_local;
-
- /* need to clear GPU f output if realloc happened */
- if (realloced)
- {
- nbnxn_ocl_clear_f(nb, nalloc);
- }
-
- if (useLjCombRule(nb->nbparam->vdwType))
- {
- static_assert(
- sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
- "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
- copyToDeviceBuffer(&d_atdat->ljComb,
- reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
- 0,
- natoms,
- localStream,
- GpuApiCallBehavior::Async,
- bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
- }
- else
- {
- static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
- "Sizes of host- and device-side atom types should be the same.");
- copyToDeviceBuffer(&d_atdat->atomTypes,
- nbat->params().type.data(),
- 0,
- natoms,
- localStream,
- GpuApiCallBehavior::Async,
- bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
- }
-
- if (bDoTime)
- {
- timers->atdat.closeTimingRegion(localStream);
- }
-
- /* kick off the tasks enqueued above to ensure concurrency with the search */
- cl_error = clFlush(localStream.stream());
- GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
- ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-}
-
/*! \brief Releases an OpenCL kernel pointer */
static void free_kernel(cl_kernel* kernel_ptr)
{
}
}
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
- GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
- NBAtomData* atdat = nb->atdat;
- const DeviceContext& deviceContext = *nb->deviceContext_;
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-
- int numAtoms = nbat->numAtoms();
- bool reallocated = false;
- if (numAtoms > atdat->numAtomsAlloc)
- {
- int numAlloc = over_alloc_small(numAtoms);
-
- /* free up first if the arrays have already been initialized */
- if (atdat->numAtomsAlloc != -1)
- {
- freeDeviceBuffer(&atdat->f);
- freeDeviceBuffer(&atdat->xq);
- freeDeviceBuffer(&atdat->atomTypes);
- freeDeviceBuffer(&atdat->ljComb);
- }
-
- allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
- allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
- if (useLjCombRule(nb->nbparam->vdwType))
- {
- allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
- }
- else
- {
- allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
- }
-
- atdat->numAtomsAlloc = numAlloc;
- reallocated = true;
- }
-
- atdat->numAtoms = numAtoms;
- atdat->numAtomsLocal = nbat->natoms_local;
-
- /* need to clear GPU f output if realloc happened */
- if (reallocated)
- {
- clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
- }
-
- if (useLjCombRule(nb->nbparam->vdwType))
- {
- GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(Float2),
- "Size of the LJ parameters element should be equal to the size of float2.");
- copyToDeviceBuffer(&atdat->ljComb,
- reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
- 0,
- numAtoms,
- localStream,
- GpuApiCallBehavior::Async,
- nullptr);
- }
- else
- {
- GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]),
- "Sizes of host- and device-side atom types should be the same.");
- copyToDeviceBuffer(&atdat->atomTypes,
- nbat->params().type.data(),
- 0,
- numAtoms,
- localStream,
- GpuApiCallBehavior::Async,
- nullptr);
- }
-}
-
void gpu_free(NbnxmGpu* nb)
{
if (nb == nullptr)