#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
-#include "gromacs/gpu_utils/hostallocator.h"
-#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/topology/forcefieldparameters.h"
-#include "gromacs/topology/idef.h"
struct t_forcerec;
/*! \brief Construct the GPU bonded-forces implementation.
 *
 * Uploads the force-field parameters to the device, allocates and
 * zeroes the per-interaction-type energy accumulation buffer, and
 * initializes the interaction-list device buffers and kernel
 * parameter struct to an empty state.
 *
 * \param[in] ffparams   Force-field parameters to copy to the device.
 * \param[in] streamPtr  Pointer to the CUDA stream (as CommandStream*)
 *                       that all transfers and kernels will use.
 */
GpuBonded::Impl::Impl(const gmx_ffparams_t &ffparams,
                      void                 *streamPtr)
{
    stream_ = *static_cast<CommandStream*>(streamPtr);

    allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr);
    // This could be an async transfer (if the source is pinned), so
    // long as it uses the same stream as the kernels and we are happy
    // to consume additional pinned pages.
    copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(),
                       0, ffparams.numTypes(),
                       stream_, GpuApiCallBehavior::Sync, nullptr);
    // Host and device buffers for the total energy of each function type.
    vTot_.resize(F_NRE);
    allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr);
    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);

    // No interaction lists are assigned to the GPU yet.
    for (int fType = 0; fType < F_NRE; fType++)
    {
        d_iLists_[fType].nr     = 0;
        d_iLists_[fType].iatoms = nullptr;
        d_iLists_[fType].nalloc = 0;
    }

    kernelParams_.d_forceParams = d_forceParams_;
    kernelParams_.d_xq          = d_xq_;
    kernelParams_.d_f           = d_f_;
    kernelParams_.d_fShift      = d_fShift_;
    kernelParams_.d_vTot        = d_vTot_;
    // Empty ranges (start 0, end -1) mark function types with no work.
    for (int i = 0; i < numFTypesOnGpu; i++)
    {
        kernelParams_.d_iatoms[i]        = nullptr;
        kernelParams_.fTypeRangeStart[i] = 0;
        kernelParams_.fTypeRangeEnd[i]   = -1;
    }
}
/*! \brief Destructor: releases all device buffers owned by this object. */
GpuBonded::Impl::~Impl()
{
    // Free the per-interaction-type iatoms buffers that were allocated
    // lazily in updateInteractionListsAndDeviceBuffers().
    for (int fType : fTypesOnGpu)
    {
        if (d_iLists_[fType].iatoms)
        {
            freeDeviceBuffer(&d_iLists_[fType].iatoms);
            d_iLists_[fType].iatoms = nullptr;
        }
    }

    freeDeviceBuffer(&d_forceParams_);
    freeDeviceBuffer(&d_vTot_);
}
//! Return whether function type \p fType in \p idef has perturbed interactions
static bool fTypeHasPerturbedEntries(const t_idef &idef,
                                     int           fType)
{
    GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
               "Perturbed interactions should be sorted here");

    const t_ilist &ilist = idef.il[fType];

    // With ilsortFE_SORTED the non-perturbed interactions come first, so
    // any difference between nr_nonperturbed and nr means perturbed entries.
    return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
}
void
GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int> nbnxnAtomOrder,
const t_idef &idef,
- void *xqDevicePtr,
- void *forceDevicePtr,
- void *fshiftDevicePtr)
+ void *d_xqPtr,
+ void *d_fPtr,
+ void *d_fShiftPtr)
{
// TODO wallcycle sub start
haveInteractions_ = false;
- int ftypesCounter = 0;
+ int fTypesCounter = 0;
- for (int ftype : ftypesOnGpu)
+ for (int fType : fTypesOnGpu)
{
- auto &iList = iLists[ftype];
+ auto &iList = iLists_[fType];
/* Perturbation is not implemented in the GPU bonded kernels.
* But instead of doing all interactions on the CPU, we can
* still easily handle the types that have no perturbed
* interactions on the GPU. */
- if (idef.il[ftype].nr > 0 && !ftypeHasPerturbedEntries(idef, ftype))
+ if (idef.il[fType].nr > 0 && !fTypeHasPerturbedEntries(idef, fType))
{
haveInteractions_ = true;
- convertIlistToNbnxnOrder(idef.il[ftype],
+ convertIlistToNbnxnOrder(idef.il[fType],
&iList,
- NRAL(ftype), nbnxnAtomOrder);
+ NRAL(fType), nbnxnAtomOrder);
}
else
{
// end.
if (iList.size() > 0)
{
- t_ilist &iListDevice = iListsDevice[ftype];
+ t_ilist &d_iList = d_iLists_[fType];
- reallocateDeviceBuffer(&iListDevice.iatoms, iList.size(), &iListDevice.nr, &iListDevice.nalloc, nullptr);
+ reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr);
- copyToDeviceBuffer(&iListDevice.iatoms, iList.iatoms.data(),
+ copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(),
0, iList.size(),
- stream, GpuApiCallBehavior::Async, nullptr);
+ stream_, GpuApiCallBehavior::Async, nullptr);
}
- kernelParams_.ftypesOnGpu[ftypesCounter] = ftype;
- kernelParams_.nrFTypeIAtoms[ftypesCounter] = iList.size();
- int nBonds = iList.size() / (interaction_function[ftype].nratoms + 1);
- kernelParams_.nrFTypeBonds[ftypesCounter] = nBonds;
- kernelParams_.iatoms[ftypesCounter] = iListsDevice[ftype].iatoms;
- if (ftypesCounter == 0)
+ kernelParams_.fTypesOnGpu[fTypesCounter] = fType;
+ kernelParams_.numFTypeIAtoms[fTypesCounter] = iList.size();
+ int numBonds = iList.size() / (interaction_function[fType].nratoms + 1);
+ kernelParams_.numFTypeBonds[fTypesCounter] = numBonds;
+ kernelParams_.d_iatoms[fTypesCounter] = d_iLists_[fType].iatoms;
+ if (fTypesCounter == 0)
{
- kernelParams_.ftypeRangeStart[ftypesCounter] = 0;
+ kernelParams_.fTypeRangeStart[fTypesCounter] = 0;
}
else
{
- kernelParams_.ftypeRangeStart[ftypesCounter] = kernelParams_.ftypeRangeEnd[ftypesCounter - 1] + 1;
+ kernelParams_.fTypeRangeStart[fTypesCounter] = kernelParams_.fTypeRangeEnd[fTypesCounter - 1] + 1;
}
- kernelParams_.ftypeRangeEnd[ftypesCounter] = kernelParams_.ftypeRangeStart[ftypesCounter] + roundUpToFactor(nBonds, warp_size) - 1;
+ kernelParams_.fTypeRangeEnd[fTypesCounter] = kernelParams_.fTypeRangeStart[fTypesCounter] + roundUpToFactor(numBonds, warp_size) - 1;
- GMX_ASSERT(nBonds > 0 || kernelParams_.ftypeRangeEnd[ftypesCounter] <= kernelParams_.ftypeRangeStart[ftypesCounter],
- "Invalid GPU listed forces setup. nBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
- GMX_ASSERT(kernelParams_.ftypeRangeStart[ftypesCounter] % warp_size == 0 && (kernelParams_.ftypeRangeEnd[ftypesCounter] + 1) % warp_size == 0,
+ GMX_ASSERT(numBonds > 0 || kernelParams_.fTypeRangeEnd[fTypesCounter] <= kernelParams_.fTypeRangeStart[fTypesCounter],
+ "Invalid GPU listed forces setup. numBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
+ GMX_ASSERT(kernelParams_.fTypeRangeStart[fTypesCounter] % warp_size == 0 && (kernelParams_.fTypeRangeEnd[fTypesCounter] + 1) % warp_size == 0,
"The bonded interactions must be assigned to the GPU in blocks of warp size.");
- ftypesCounter++;
+ fTypesCounter++;
}
- xqDevice = static_cast<float4 *>(xqDevicePtr);
- forceDevice = static_cast<fvec *>(forceDevicePtr);
- fshiftDevice = static_cast<fvec *>(fshiftDevicePtr);
+ d_xq_ = static_cast<float4 *>(d_xqPtr);
+ d_f_ = static_cast<fvec *>(d_fPtr);
+ d_fShift_ = static_cast<fvec *>(d_fShiftPtr);
- kernelParams_.xqDevice = xqDevice;
- kernelParams_.forceDevice = forceDevice;
- kernelParams_.fshiftDevice = fshiftDevice;
- kernelParams_.forceparamsDevice = forceparamsDevice;
- kernelParams_.vtotDevice = vtotDevice;
+ kernelParams_.d_xq = d_xq_;
+ kernelParams_.d_f = d_f_;
+ kernelParams_.d_fShift = d_fShift_;
+ kernelParams_.d_forceParams = d_forceParams_;
+ kernelParams_.d_vTot = d_vTot_;
// TODO wallcycle sub stop
}
// TODO should wrap with ewcLAUNCH_GPU
GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
- float *vtot_h = vtot.data();
- copyFromDeviceBuffer(vtot_h, &vtotDevice,
+ float *h_vTot = vTot_.data();
+ copyFromDeviceBuffer(h_vTot, &d_vTot_,
0, F_NRE,
- stream, GpuApiCallBehavior::Async, nullptr);
+ stream_, GpuApiCallBehavior::Async, nullptr);
}
void
// wait goes in to the "Rest" counter
GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed or transferred, so accumulation should not occur");
- cudaError_t stat = cudaStreamSynchronize(stream);
+ cudaError_t stat = cudaStreamSynchronize(stream_);
CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
- for (int ftype : ftypesOnGpu)
+ for (int fType : fTypesOnGpu)
{
- if (ftype != F_LJ14 && ftype != F_COUL14)
+ if (fType != F_LJ14 && fType != F_COUL14)
{
- enerd->term[ftype] += vtot[ftype];
+ enerd->term[fType] += vTot_[fType];
}
}
// Note: We do not support energy groups here
gmx_grppairener_t *grppener = &enerd->grpp;
GMX_RELEASE_ASSERT(grppener->nener == 1, "No energy group support for bondeds on the GPU");
- grppener->ener[egLJ14][0] += vtot[F_LJ14];
- grppener->ener[egCOUL14][0] += vtot[F_COUL14];
+ grppener->ener[egLJ14][0] += vTot_[F_LJ14];
+ grppener->ener[egCOUL14][0] += vTot_[F_COUL14];
}
/*! \brief Asynchronously zero the device-side energy accumulation buffer.
 *
 * Enqueued on the bonded stream; must run before the next kernel launch
 * that accumulates energies into \c d_vTot_.
 */
void
GpuBonded::Impl::clearEnergies()
{
    // TODO should wrap with ewcLAUNCH_GPU
    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
}
// ---- GpuBonded
/*! \brief Public pimpl forwarder: update GPU interaction lists and buffers.
 *
 * \param[in] nbnxnAtomOrder  Mapping from topology to nbnxn atom order.
 * \param[in] idef            Interaction definitions from the topology.
 * \param[in] d_xq            Device buffer of coordinates+charges (float4).
 * \param[in] d_f             Device force buffer.
 * \param[in] d_fShift        Device shift-forces buffer.
 */
void
GpuBonded::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                  const t_idef        &idef,
                                                  void                *d_xq,
                                                  void                *d_f,
                                                  void                *d_fShift)
{
    impl_->updateInteractionListsAndDeviceBuffers
        (nbnxnAtomOrder, idef, d_xq, d_f, d_fShift);
}
bool
#include <math_constants.h>
#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
#include "gromacs/math/units.h"
#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/ifunc.h"
#include "gromacs/utility/gmxassert.h"
#include "gpubonded_impl.h"
template <bool calcVir, bool calcEner>
__device__
-void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
- const t_iatom forceatoms[], const t_iparams forceparams[],
- const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void bonds_gpu(const int i, float *vtot_loc, const int numBonds,
+ const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+ const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
- if (i < nBonds)
+ if (i < numBonds)
{
- int type = forceatoms[3*i];
- int ai = forceatoms[3*i + 1];
- int aj = forceatoms[3*i + 2];
+ int type = d_forceatoms[3*i];
+ int ai = d_forceatoms[3*i + 1];
+ int aj = d_forceatoms[3*i + 2];
/* dx = xi - xj, corrected for periodic boundary conditions. */
fvec dx;
- int ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);
+ int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
float dr2 = iprod_gpu(dx, dx);
float dr = sqrt(dr2);
float vbond;
float fbond;
- harmonic_gpu(forceparams[type].harmonic.krA,
- forceparams[type].harmonic.rA,
+ harmonic_gpu(d_forceparams[type].harmonic.krA,
+ d_forceparams[type].harmonic.rA,
dr, &vbond, &fbond);
if (calcEner)
for (int m = 0; m < DIM; m++)
{
float fij = fbond*dx[m];
- atomicAdd(&force[ai][m], fij);
- atomicAdd(&force[aj][m], -fij);
+ atomicAdd(&gm_f[ai][m], fij);
+ atomicAdd(&gm_f[aj][m], -fij);
if (calcVir && ki != CENTRAL)
{
atomicAdd(&sm_fShiftLoc[ki][m], fij);
template <bool calcVir, bool calcEner>
__device__
-void angles_gpu(const int i, float *vtot_loc, const int nBonds,
- const t_iatom forceatoms[], const t_iparams forceparams[],
- const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void angles_gpu(const int i, float *vtot_loc, const int numBonds,
+ const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+ const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
- if (i < nBonds)
+ if (i < numBonds)
{
- int type = forceatoms[4*i];
- int ai = forceatoms[4*i + 1];
- int aj = forceatoms[4*i + 2];
- int ak = forceatoms[4*i + 3];
+ int type = d_forceatoms[4*i];
+ int ai = d_forceatoms[4*i + 1];
+ int aj = d_forceatoms[4*i + 2];
+ int ak = d_forceatoms[4*i + 3];
fvec r_ij;
fvec r_kj;
int t1;
int t2;
float theta =
- bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
+ bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
r_ij, r_kj, &cos_theta, &t1, &t2);
float va;
float dVdt;
- harmonic_gpu(forceparams[type].harmonic.krA,
- forceparams[type].harmonic.rA*DEG2RAD,
+ harmonic_gpu(d_forceparams[type].harmonic.krA,
+ d_forceparams[type].harmonic.rA*DEG2RAD,
theta, &va, &dVdt);
if (calcEner)
f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]);
f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]);
f_j[m] = -f_i[m] - f_k[m];
- atomicAdd(&force[ai][m], f_i[m]);
- atomicAdd(&force[aj][m], f_j[m]);
- atomicAdd(&force[ak][m], f_k[m]);
+ atomicAdd(&gm_f[ai][m], f_i[m]);
+ atomicAdd(&gm_f[aj][m], f_j[m]);
+ atomicAdd(&gm_f[ak][m], f_k[m]);
if (calcVir)
{
atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
template <bool calcVir, bool calcEner>
__device__
-void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
- const t_iatom forceatoms[], const t_iparams forceparams[],
- const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void urey_bradley_gpu(const int i, float *vtot_loc, const int numBonds,
+ const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+ const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
- if (i < nBonds)
+ if (i < numBonds)
{
- int type = forceatoms[4*i];
- int ai = forceatoms[4*i+1];
- int aj = forceatoms[4*i+2];
- int ak = forceatoms[4*i+3];
+ int type = d_forceatoms[4*i];
+ int ai = d_forceatoms[4*i+1];
+ int aj = d_forceatoms[4*i+2];
+ int ak = d_forceatoms[4*i+3];
- float th0A = forceparams[type].u_b.thetaA*DEG2RAD;
- float kthA = forceparams[type].u_b.kthetaA;
- float r13A = forceparams[type].u_b.r13A;
- float kUBA = forceparams[type].u_b.kUBA;
+ float th0A = d_forceparams[type].u_b.thetaA*DEG2RAD;
+ float kthA = d_forceparams[type].u_b.kthetaA;
+ float r13A = d_forceparams[type].u_b.r13A;
+ float kUBA = d_forceparams[type].u_b.kUBA;
fvec r_ij;
fvec r_kj;
float cos_theta;
int t1;
int t2;
- float theta = bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
+ float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
r_ij, r_kj, &cos_theta, &t1, &t2);
float va;
}
fvec r_ik;
- int ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[ak], r_ik);
+ int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
float dr2 = iprod_gpu(r_ik, r_ik);
float dr = dr2*rsqrtf(dr2);
f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
f_j[m] = -f_i[m]-f_k[m];
- atomicAdd(&force[ai][m], f_i[m]);
- atomicAdd(&force[aj][m], f_j[m]);
- atomicAdd(&force[ak][m], f_k[m]);
+ atomicAdd(&gm_f[ai][m], f_i[m]);
+ atomicAdd(&gm_f[aj][m], f_j[m]);
+ atomicAdd(&gm_f[ak][m], f_k[m]);
if (calcVir)
{
atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
for (int m = 0; m < DIM; m++)
{
float fik = fbond*r_ik[m];
- atomicAdd(&force[ai][m], fik);
- atomicAdd(&force[ak][m], -fik);
+ atomicAdd(&gm_f[ai][m], fik);
+ atomicAdd(&gm_f[ak][m], -fik);
if (calcVir && ki != CENTRAL)
{
/*! \brief Evaluate the proper-dihedral potential and its derivative.
 *
 * Computes V = cpA * (1 + cos(mult*phi - phiA)) and F = -dV/dphi
 * for a periodic (proper) dihedral, with \p phiA given in degrees.
 *
 * \param[in]  cpA   Force constant.
 * \param[in]  phiA  Equilibrium angle in degrees.
 * \param[in]  mult  Multiplicity of the dihedral.
 * \param[in]  phi   Current dihedral angle in radians.
 * \param[out] v     Potential energy.
 * \param[out] f     Negative derivative of the potential w.r.t. phi.
 */
__device__ __forceinline__
static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
                        const float phi, float *v, float *f)
{
    float mdphi, sdphi;

    mdphi = mult*phi - phiA*DEG2RAD;
    sdphi = sinf(mdphi);
    *v    = cpA * (1.0f + cosf(mdphi));
    *f    = -cpA*mult*sdphi;
}
template <bool calcVir>
__device__
static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
- const fvec m, const fvec n, fvec force[], fvec fshift[],
+ const fvec m, const fvec n, fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc &pbcAiuc,
- const float4 xq[], const int t1, const int t2, const int gmx_unused t3)
+ const float4 gm_xq[], const int t1, const int t2, const int gmx_unused t3)
{
float iprm = iprod_gpu(m, m);
float iprn = iprod_gpu(n, n);
#pragma unroll
for (int m = 0; (m < DIM); m++)
{
- atomicAdd(&force[i][m], f_i[m]);
- atomicAdd(&force[j][m], -f_j[m]);
- atomicAdd(&force[k][m], -f_k[m]);
- atomicAdd(&force[l][m], f_l[m]);
+ atomicAdd(&gm_f[i][m], f_i[m]);
+ atomicAdd(&gm_f[j][m], -f_j[m]);
+ atomicAdd(&gm_f[k][m], -f_k[m]);
+ atomicAdd(&gm_f[l][m], f_l[m]);
}
if (calcVir)
{
fvec dx_jl;
- int t3 = pbcDxAiuc<calcVir>(pbcAiuc, xq[l], xq[j], dx_jl);
+ int t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
#pragma unroll
for (int m = 0; (m < DIM); m++)
{
- atomicAdd(&fshift[t1][m], f_i[m]);
- atomicAdd(&fshift[CENTRAL][m], -f_j[m]);
- atomicAdd(&fshift[t2][m], -f_k[m]);
- atomicAdd(&fshift[t3][m], f_l[m]);
+ atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
+ atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f_j[m]);
+ atomicAdd(&sm_fShiftLoc[t2][m], -f_k[m]);
+ atomicAdd(&sm_fShiftLoc[t3][m], f_l[m]);
}
}
}
template <bool calcVir, bool calcEner>
__device__
-void pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
- const t_iatom forceatoms[], const t_iparams forceparams[],
- const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void pdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+ const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+ const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
- if (i < nBonds)
+ if (i < numBonds)
{
- int type = forceatoms[5*i];
- int ai = forceatoms[5*i + 1];
- int aj = forceatoms[5*i + 2];
- int ak = forceatoms[5*i + 3];
- int al = forceatoms[5*i + 4];
+ int type = d_forceatoms[5*i];
+ int ai = d_forceatoms[5*i + 1];
+ int aj = d_forceatoms[5*i + 2];
+ int ak = d_forceatoms[5*i + 3];
+ int al = d_forceatoms[5*i + 4];
fvec r_ij;
fvec r_kj;
int t2;
int t3;
float phi =
- dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+ dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
float vpd;
float ddphi;
- dopdihs_gpu(forceparams[type].pdihs.cpA,
- forceparams[type].pdihs.phiA,
- forceparams[type].pdihs.mult,
+ dopdihs_gpu(d_forceparams[type].pdihs.cpA,
+ d_forceparams[type].pdihs.phiA,
+ d_forceparams[type].pdihs.mult,
phi, &vpd, &ddphi);
if (calcEner)
do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
ddphi, r_ij, r_kj, r_kl, m, n,
- f, sm_fShiftLoc, pbcAiuc,
- xq, t1, t2, t3);
+ gm_f, sm_fShiftLoc, pbcAiuc,
+ gm_xq, t1, t2, t3);
}
}
template <bool calcVir, bool calcEner>
__device__
-void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
- const t_iatom forceatoms[], const t_iparams forceparams[],
- const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void rbdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+ const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+ const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
- if (i < nBonds)
+ if (i < numBonds)
{
- int type = forceatoms[5*i];
- int ai = forceatoms[5*i+1];
- int aj = forceatoms[5*i+2];
- int ak = forceatoms[5*i+3];
- int al = forceatoms[5*i+4];
+ int type = d_forceatoms[5*i];
+ int ai = d_forceatoms[5*i+1];
+ int aj = d_forceatoms[5*i+2];
+ int ak = d_forceatoms[5*i+3];
+ int al = d_forceatoms[5*i+4];
fvec r_ij;
fvec r_kj;
int t2;
int t3;
float phi =
- dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+ dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
/* Change to polymer convention */
float parm[NR_RBDIHS];
for (int j = 0; j < NR_RBDIHS; j++)
{
- parm[j] = forceparams[type].rbdihs.rbcA[j];
+ parm[j] = d_forceparams[type].rbdihs.rbcA[j];
}
/* Calculate cosine powers */
/* Calculate the energy */
do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
ddphi, r_ij, r_kj, r_kl, m, n,
- f, sm_fShiftLoc, pbcAiuc,
- xq, t1, t2, t3);
+ gm_f, sm_fShiftLoc, pbcAiuc,
+ gm_xq, t1, t2, t3);
if (calcEner)
{
*vtot_loc += v;
template <bool calcVir, bool calcEner>
__device__
-void idihs_gpu(const int i, float *vtot_loc, const int nBonds,
- const t_iatom forceatoms[], const t_iparams forceparams[],
- const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void idihs_gpu(const int i, float *vtot_loc, const int numBonds,
+ const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+ const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc pbcAiuc)
{
- if (i < nBonds)
+ if (i < numBonds)
{
- int type = forceatoms[5*i];
- int ai = forceatoms[5*i + 1];
- int aj = forceatoms[5*i + 2];
- int ak = forceatoms[5*i + 3];
- int al = forceatoms[5*i + 4];
+ int type = d_forceatoms[5*i];
+ int ai = d_forceatoms[5*i + 1];
+ int aj = d_forceatoms[5*i + 2];
+ int ak = d_forceatoms[5*i + 3];
+ int al = d_forceatoms[5*i + 4];
fvec r_ij;
fvec r_kj;
int t2;
int t3;
float phi =
- dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+ dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
/* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
* the dihedral is Pi away from phiO, which is very unlikely due to
* the potential.
*/
- float kA = forceparams[type].harmonic.krA;
- float pA = forceparams[type].harmonic.rA;
+ float kA = d_forceparams[type].harmonic.krA;
+ float pA = d_forceparams[type].harmonic.rA;
float phi0 = pA*DEG2RAD;
do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
-ddphi, r_ij, r_kj, r_kl, m, n,
- f, sm_fShiftLoc, pbcAiuc,
- xq, t1, t2, t3);
+ gm_f, sm_fShiftLoc, pbcAiuc,
+ gm_xq, t1, t2, t3);
if (calcEner)
{
template <bool calcVir, bool calcEner>
__device__
-void pairs_gpu(const int i, const int nBonds,
+void pairs_gpu(const int i, const int numBonds,
const t_iatom iatoms[], const t_iparams iparams[],
- const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+ const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
const PbcAiuc pbcAiuc,
const float scale_factor,
float *vtotVdw_loc, float *vtotElec_loc)
{
- if (i < nBonds)
+ if (i < numBonds)
{
int itype = iatoms[3*i];
int ai = iatoms[3*i + 1];
int aj = iatoms[3*i + 2];
- float qq = xq[ai].w*xq[aj].w;
+ float qq = gm_xq[ai].w*gm_xq[aj].w;
float c6 = iparams[itype].lj14.c6A;
float c12 = iparams[itype].lj14.c12A;
/* Do we need to apply full periodic boundary conditions? */
fvec dr;
- int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dr);
+ int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
float r2 = norm2_gpu(dr);
float rinv = rsqrtf(r2);
#pragma unroll
for (int m = 0; m < DIM; m++)
{
- atomicAdd(&force[ai][m], f[m]);
- atomicAdd(&force[aj][m], -f[m]);
+ atomicAdd(&gm_f[ai][m], f[m]);
+ atomicAdd(&gm_f[aj][m], -f[m]);
if (calcVir && fshift_index != CENTRAL)
{
atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
{
assert(blockDim.y == 1 && blockDim.z == 1);
- const int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+ const int tid = blockIdx.x*blockDim.x+threadIdx.x;
float vtot_loc = 0;
float vtotVdw_loc = 0;
float vtotElec_loc = 0;
__syncthreads();
}
- int ftype;
+ int fType;
bool threadComputedPotential = false;
#pragma unroll
- for (int j = 0; j < nFtypesOnGpu; j++)
+ for (int j = 0; j < numFTypesOnGpu; j++)
{
- if (threadIndex >= kernelParams.ftypeRangeStart[j] && threadIndex <= kernelParams.ftypeRangeEnd[j])
+ if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
{
- const int nBonds = kernelParams.nrFTypeBonds[j];
-
- int localThreadIndex = threadIndex - kernelParams.ftypeRangeStart[j];
- const t_iatom *iatoms = kernelParams.iatoms[j];
- ftype = kernelParams.ftypesOnGpu[j];
+ const int numBonds = kernelParams.numFTypeBonds[j];
+ int fTypeTid = tid - kernelParams.fTypeRangeStart[j];
+ const t_iatom *iatoms = kernelParams.d_iatoms[j];
+ fType = kernelParams.fTypesOnGpu[j];
if (calcEner)
{
threadComputedPotential = true;
}
- switch (ftype)
+ switch (fType)
{
case F_BONDS:
- bonds_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
- kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+ bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+ kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
break;
case F_ANGLES:
- angles_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
- kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+ angles_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+ kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
break;
case F_UREY_BRADLEY:
- urey_bradley_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
- kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+ urey_bradley_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+ kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
break;
case F_PDIHS:
case F_PIDIHS:
- pdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
- kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+ pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+ kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
break;
case F_RBDIHS:
- rbdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
- kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+ rbdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+ kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
break;
case F_IDIHS:
- idihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
- kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+ idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+ kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
break;
case F_LJ14:
- pairs_gpu<calcVir, calcEner>(localThreadIndex, nBonds, iatoms, kernelParams.forceparamsDevice,
- kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc,
+ pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
+ kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
kernelParams.scaleFactor, &vtotVdw_loc, &vtotElec_loc);
break;
}
if (threadComputedPotential)
{
- float *vtotVdw = kernelParams.vtotDevice + F_LJ14;
- float *vtotElec = kernelParams.vtotDevice + F_COUL14;
- atomicAdd(kernelParams.vtotDevice + ftype, vtot_loc);
+ float *vtotVdw = kernelParams.d_vTot + F_LJ14;
+ float *vtotElec = kernelParams.d_vTot + F_COUL14;
+ atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
atomicAdd(vtotVdw, vtotVdw_loc);
atomicAdd(vtotElec, vtotElec_loc);
}
__syncthreads();
if (threadIdx.x < SHIFTS)
{
- fvec_inc_atomic(kernelParams.fshiftDevice[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
+ fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
}
}
}
PbcAiuc pbcAiuc;
setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
- int ftypeRangeEnd = kernelParams_.ftypeRangeEnd[nFtypesOnGpu - 1];
+ int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
- if (ftypeRangeEnd < 0)
+ if (fTypeRangeEnd < 0)
{
return;
}
config.blockSize[0] = TPB_BONDED;
config.blockSize[1] = 1;
config.blockSize[2] = 1;
- config.gridSize[0] = (ftypeRangeEnd + TPB_BONDED)/TPB_BONDED;
+ config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED;
config.gridSize[1] = 1;
config.gridSize[2] = 1;
- config.stream = stream;
+ config.stream = stream_;
auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
kernelParams_.scaleFactor = fr->ic->epsfac*fr->fudgeQQ;