From: Magnus Lundborg
Date: Tue, 18 Jun 2019 12:22:10 +0000 (+0200)
Subject: Cleaning up the GPU bonded kernel
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ec176323c3d79a68b77aabb83032429721efef24;p=alexxy%2Fgromacs.git

Cleaning up the GPU bonded kernel

Improved variable names.
Removed unnecessary includes.

Refs #2988

Change-Id: Ife5f50e68f27994203305e376f2781a076052b64
---

diff --git a/src/gromacs/listed_forces/gpubonded.h b/src/gromacs/listed_forces/gpubonded.h
index 3f0f5eba3f..df63d675d0 100644
--- a/src/gromacs/listed_forces/gpubonded.h
+++ b/src/gromacs/listed_forces/gpubonded.h
@@ -61,7 +61,7 @@ struct t_idef;
 struct t_inputrec;
 
 /*! \brief The number of bonded function types supported on GPUs */
-static constexpr int nFtypesOnGpu = 8;
+static constexpr int numFTypesOnGpu = 8;
 
 namespace gmx
 {
@@ -73,7 +73,7 @@ namespace gmx
  * \note The function types in the list are ordered on increasing value.
 * \note Currently bonded interactions are only supported with CUDA, not with OpenCL.
 */
-constexpr std::array<int, nFtypesOnGpu> ftypesOnGpu =
+constexpr std::array<int, numFTypesOnGpu> fTypesOnGpu =
 {
     F_BONDS,
     F_ANGLES,

diff --git a/src/gromacs/listed_forces/gpubonded_impl.cpp b/src/gromacs/listed_forces/gpubonded_impl.cpp
index dc816facc3..b7b20fdd52 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cpp
+++ b/src/gromacs/listed_forces/gpubonded_impl.cpp
@@ -58,9 +58,9 @@ namespace gmx
 //! Returns whether there are any interactions in ilists suitable for a GPU.
 static bool someInteractionsCanRunOnGpu(const InteractionLists &ilists)
 {
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (!ilists[ftype].iatoms.empty())
+        if (!ilists[fType].iatoms.empty())
         {
             // Perturbation is not implemented in the GPU bonded
             // kernels. If all the interactions were actually
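The constant and the array renamed above are coupled: numFTypesOnGpu sizes the per-type arrays in the kernel parameters, while fTypesOnGpu lists the function types themselves. A one-line guard makes that coupling explicit (illustrative only, not part of this patch):

    // Hypothetical consistency check; fTypesOnGpu is a constexpr
    // std::array, so size() is usable at compile time.
    static_assert(gmx::fTypesOnGpu.size() == numFTypesOnGpu,
                  "fTypesOnGpu must list exactly numFTypesOnGpu function types");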
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu
index 050efc0f2d..065e88e253 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cu
+++ b/src/gromacs/listed_forces/gpubonded_impl.cu
@@ -51,13 +51,8 @@
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
-#include "gromacs/gpu_utils/hostallocator.h"
-#include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/topology/forcefieldparameters.h"
-#include "gromacs/topology/idef.h"
 
 struct t_forcerec;
 
@@ -69,62 +64,62 @@ namespace gmx
 
 GpuBonded::Impl::Impl(const gmx_ffparams_t &ffparams,
                       void                 *streamPtr)
 {
-    stream = *static_cast<cudaStream_t*>(streamPtr);
+    stream_ = *static_cast<cudaStream_t*>(streamPtr);
 
-    allocateDeviceBuffer(&forceparamsDevice, ffparams.numTypes(), nullptr);
+    allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr);
     // This could be an async transfer (if the source is pinned), so
     // long as it uses the same stream as the kernels and we are happy
     // to consume additional pinned pages.
-    copyToDeviceBuffer(&forceparamsDevice, ffparams.iparams.data(),
+    copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(),
                        0, ffparams.numTypes(),
-                       stream, GpuApiCallBehavior::Sync, nullptr);
-    vtot.resize(F_NRE);
-    allocateDeviceBuffer(&vtotDevice, F_NRE, nullptr);
-    clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
+                       stream_, GpuApiCallBehavior::Sync, nullptr);
+    vTot_.resize(F_NRE);
+    allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
 
-    for (int ftype = 0; ftype < F_NRE; ftype++)
+    for (int fType = 0; fType < F_NRE; fType++)
     {
-        iListsDevice[ftype].nr     = 0;
-        iListsDevice[ftype].iatoms = nullptr;
-        iListsDevice[ftype].nalloc = 0;
+        d_iLists_[fType].nr     = 0;
+        d_iLists_[fType].iatoms = nullptr;
+        d_iLists_[fType].nalloc = 0;
     }
 
-    kernelParams_.forceparamsDevice = forceparamsDevice;
-    kernelParams_.xqDevice          = xqDevice;
-    kernelParams_.forceDevice       = forceDevice;
-    kernelParams_.fshiftDevice      = fshiftDevice;
-    kernelParams_.vtotDevice        = vtotDevice;
-    for (int i = 0; i < nFtypesOnGpu; i++)
+    kernelParams_.d_forceParams = d_forceParams_;
+    kernelParams_.d_xq          = d_xq_;
+    kernelParams_.d_f           = d_f_;
+    kernelParams_.d_fShift      = d_fShift_;
+    kernelParams_.d_vTot        = d_vTot_;
+    for (int i = 0; i < numFTypesOnGpu; i++)
     {
-        kernelParams_.iatoms[i]          = nullptr;
-        kernelParams_.ftypeRangeStart[i] = 0;
-        kernelParams_.ftypeRangeEnd[i]   = -1;
+        kernelParams_.d_iatoms[i]        = nullptr;
+        kernelParams_.fTypeRangeStart[i] = 0;
+        kernelParams_.fTypeRangeEnd[i]   = -1;
     }
 }
 
 GpuBonded::Impl::~Impl()
 {
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (iListsDevice[ftype].iatoms)
+        if (d_iLists_[fType].iatoms)
         {
-            freeDeviceBuffer(&iListsDevice[ftype].iatoms);
-            iListsDevice[ftype].iatoms = nullptr;
+            freeDeviceBuffer(&d_iLists_[fType].iatoms);
+            d_iLists_[fType].iatoms = nullptr;
         }
     }
-    freeDeviceBuffer(&forceparamsDevice);
-    freeDeviceBuffer(&vtotDevice);
+    freeDeviceBuffer(&d_forceParams_);
+    freeDeviceBuffer(&d_vTot_);
 }
 
-//! Return whether function type \p ftype in \p idef has perturbed interactions
-static bool ftypeHasPerturbedEntries(const t_idef &idef,
-                                     int           ftype)
+//! Return whether function type \p fType in \p idef has perturbed interactions
+static bool fTypeHasPerturbedEntries(const t_idef &idef,
+                                     int           fType)
 {
     GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
                "Perturbed interactions should be sorted here");
 
-    const t_ilist &ilist = idef.il[ftype];
+    const t_ilist &ilist = idef.il[fType];
 
     return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
 }
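The next hunk relies on roundUpToFactor, of which only the signature survives as hunk context. A plausible implementation, matching how the call sites below use it (rounding a bond count up to a whole number of warps), would be:

    // Sketch consistent with the call sites; the actual GROMACS body may differ.
    static inline int roundUpToFactor(const int input, const int factor)
    {
        const int remainder = input % factor;
        return (remainder == 0) ? input : input + (factor - remainder);
    }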
@@ -180,29 +175,29 @@ static inline int roundUpToFactor(const int input, const int factor)
 
 void
 GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                         const t_idef        &idef,
-                                                        void                *xqDevicePtr,
-                                                        void                *forceDevicePtr,
-                                                        void                *fshiftDevicePtr)
+                                                        void                *d_xqPtr,
+                                                        void                *d_fPtr,
+                                                        void                *d_fShiftPtr)
 {
     // TODO wallcycle sub start
     haveInteractions_ = false;
-    int ftypesCounter = 0;
+    int fTypesCounter = 0;
 
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        auto &iList = iLists[ftype];
+        auto &iList = iLists_[fType];
 
         /* Perturbation is not implemented in the GPU bonded kernels.
          * But instead of doing all interactions on the CPU, we can
         * still easily handle the types that have no perturbed
         * interactions on the GPU.
         */
-        if (idef.il[ftype].nr > 0 && !ftypeHasPerturbedEntries(idef, ftype))
+        if (idef.il[fType].nr > 0 && !fTypeHasPerturbedEntries(idef, fType))
        {
             haveInteractions_ = true;
 
-            convertIlistToNbnxnOrder(idef.il[ftype],
+            convertIlistToNbnxnOrder(idef.il[fType],
                                      &iList,
-                                     NRAL(ftype), nbnxnAtomOrder);
+                                     NRAL(fType), nbnxnAtomOrder);
         }
         else
         {
@@ -215,46 +210,46 @@ GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef nbn
         // end.
         if (iList.size() > 0)
         {
-            t_ilist &iListDevice = iListsDevice[ftype];
+            t_ilist &d_iList = d_iLists_[fType];
 
-            reallocateDeviceBuffer(&iListDevice.iatoms, iList.size(), &iListDevice.nr, &iListDevice.nalloc, nullptr);
+            reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr);
 
-            copyToDeviceBuffer(&iListDevice.iatoms, iList.iatoms.data(),
+            copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(),
                                0, iList.size(),
-                               stream, GpuApiCallBehavior::Async, nullptr);
+                               stream_, GpuApiCallBehavior::Async, nullptr);
         }
-        kernelParams_.ftypesOnGpu[ftypesCounter]   = ftype;
-        kernelParams_.nrFTypeIAtoms[ftypesCounter] = iList.size();
-        int nBonds = iList.size() / (interaction_function[ftype].nratoms + 1);
-        kernelParams_.nrFTypeBonds[ftypesCounter]  = nBonds;
-        kernelParams_.iatoms[ftypesCounter]        = iListsDevice[ftype].iatoms;
-        if (ftypesCounter == 0)
+        kernelParams_.fTypesOnGpu[fTypesCounter]    = fType;
+        kernelParams_.numFTypeIAtoms[fTypesCounter] = iList.size();
+        int numBonds = iList.size() / (interaction_function[fType].nratoms + 1);
+        kernelParams_.numFTypeBonds[fTypesCounter]  = numBonds;
+        kernelParams_.d_iatoms[fTypesCounter]       = d_iLists_[fType].iatoms;
+        if (fTypesCounter == 0)
         {
-            kernelParams_.ftypeRangeStart[ftypesCounter] = 0;
+            kernelParams_.fTypeRangeStart[fTypesCounter] = 0;
         }
         else
         {
-            kernelParams_.ftypeRangeStart[ftypesCounter] = kernelParams_.ftypeRangeEnd[ftypesCounter - 1] + 1;
+            kernelParams_.fTypeRangeStart[fTypesCounter] = kernelParams_.fTypeRangeEnd[fTypesCounter - 1] + 1;
         }
-        kernelParams_.ftypeRangeEnd[ftypesCounter] = kernelParams_.ftypeRangeStart[ftypesCounter] + roundUpToFactor(nBonds, warp_size) - 1;
+        kernelParams_.fTypeRangeEnd[fTypesCounter] = kernelParams_.fTypeRangeStart[fTypesCounter] + roundUpToFactor(numBonds, warp_size) - 1;
 
-        GMX_ASSERT(nBonds > 0 || kernelParams_.ftypeRangeEnd[ftypesCounter] <= kernelParams_.ftypeRangeStart[ftypesCounter],
-                   "Invalid GPU listed forces setup. nBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
-        GMX_ASSERT(kernelParams_.ftypeRangeStart[ftypesCounter] % warp_size == 0 && (kernelParams_.ftypeRangeEnd[ftypesCounter] + 1) % warp_size == 0,
+        GMX_ASSERT(numBonds > 0 || kernelParams_.fTypeRangeEnd[fTypesCounter] <= kernelParams_.fTypeRangeStart[fTypesCounter],
+                   "Invalid GPU listed forces setup. numBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
+        GMX_ASSERT(kernelParams_.fTypeRangeStart[fTypesCounter] % warp_size == 0 && (kernelParams_.fTypeRangeEnd[fTypesCounter] + 1) % warp_size == 0,
                    "The bonded interactions must be assigned to the GPU in blocks of warp size.");
 
-        ftypesCounter++;
+        fTypesCounter++;
     }
 
-    xqDevice     = static_cast<float4 *>(xqDevicePtr);
-    forceDevice  = static_cast<fvec *>(forceDevicePtr);
-    fshiftDevice = static_cast<fvec *>(fshiftDevicePtr);
+    d_xq_     = static_cast<float4 *>(d_xqPtr);
+    d_f_      = static_cast<fvec *>(d_fPtr);
+    d_fShift_ = static_cast<fvec *>(d_fShiftPtr);
 
-    kernelParams_.xqDevice          = xqDevice;
-    kernelParams_.forceDevice       = forceDevice;
-    kernelParams_.fshiftDevice      = fshiftDevice;
-    kernelParams_.forceparamsDevice = forceparamsDevice;
-    kernelParams_.vtotDevice        = vtotDevice;
+    kernelParams_.d_xq          = d_xq_;
+    kernelParams_.d_f           = d_f_;
+    kernelParams_.d_fShift      = d_fShift_;
+    kernelParams_.d_forceParams = d_forceParams_;
+    kernelParams_.d_vTot        = d_vTot_;
 
     // TODO wallcycle sub stop
 }
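Each type's thread range is padded to a whole number of warps, so a warp never straddles two interaction types. With warp_size 32 and illustrative counts of 100 bonds and 50 angles:

    // bonds:  numBonds = 100 -> rounded to 128 -> fTypeRange [0, 127]
    // angles: numBonds =  50 -> rounded to  64 -> fTypeRange [128, 191]
    // Threads 100-127 and 178-191 fall inside a range but beyond numBonds;
    // the i < numBonds guard in each *_gpu function turns them into no-ops.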
@@ -271,10 +266,10 @@ GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef nbn
 GpuBonded::Impl::launchEnergyTransfer()
 {
     // TODO should wrap with ewcLAUNCH_GPU
     GMX_ASSERT(haveInteractions_,
               "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
 
-    float *vtot_h = vtot.data();
-    copyFromDeviceBuffer(vtot_h, &vtotDevice,
+    float *h_vTot = vTot_.data();
+    copyFromDeviceBuffer(h_vTot, &d_vTot_,
                          0, F_NRE,
-                         stream, GpuApiCallBehavior::Async, nullptr);
+                         stream_, GpuApiCallBehavior::Async, nullptr);
 }
 
 void
@@ -284,29 +279,29 @@ GpuBonded::Impl::accumulateEnergyTerms(gmx_enerdata_t *enerd)
 {
     // wait goes in to the "Rest" counter
     GMX_ASSERT(haveInteractions_,
               "No GPU bonded interactions, so no energies will be computed or transferred, so accumulation should not occur");
 
-    cudaError_t stat = cudaStreamSynchronize(stream);
+    cudaError_t stat = cudaStreamSynchronize(stream_);
     CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
 
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (ftype != F_LJ14 && ftype != F_COUL14)
+        if (fType != F_LJ14 && fType != F_COUL14)
         {
-            enerd->term[ftype] += vtot[ftype];
+            enerd->term[fType] += vTot_[fType];
         }
     }
 
     // Note: We do not support energy groups here
     gmx_grppairener_t *grppener = &enerd->grpp;
     GMX_RELEASE_ASSERT(grppener->nener == 1, "No energy group support for bondeds on the GPU");
-    grppener->ener[egLJ14][0]   += vtot[F_LJ14];
-    grppener->ener[egCOUL14][0] += vtot[F_COUL14];
+    grppener->ener[egLJ14][0]   += vTot_[F_LJ14];
+    grppener->ener[egCOUL14][0] += vTot_[F_COUL14];
 }
 
 void
 GpuBonded::Impl::clearEnergies()
 {
     // TODO should wrap with ewcLAUNCH_GPU
-    clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
 }
 
 // ---- GpuBonded
 
@@ -322,12 +317,12 @@ GpuBonded::~GpuBonded() = default;
 
 void
 GpuBonded::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                   const t_idef        &idef,
-                                                  void                *xqDevice,
-                                                  void                *forceDevice,
-                                                  void                *fshiftDevice)
+                                                  void                *d_xq,
+                                                  void                *d_f,
+                                                  void                *d_fShift)
 {
     impl_->updateInteractionListsAndDeviceBuffers
-        (nbnxnAtomOrder, idef, xqDevice, forceDevice, fshiftDevice);
+        (nbnxnAtomOrder, idef, d_xq, d_f, d_fShift);
 }
 
 bool
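A sketch of the ordering these members assume (the surrounding driver calls are assumptions; the real sequencing lives in the force schedule):

    // 1. launchKernel(...)          computes forces/energies on stream_
    // 2. launchEnergyTransfer()     enqueues the async D2H copy of d_vTot_
    // 3. (other CPU work can overlap here)
    // 4. accumulateEnergyTerms(...) synchronizes stream_, then adds
    //    vTot_[fType] into enerd->term[] and the LJ-14/Coulomb-14 groups.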
"gromacs/pbcutil/pbc_aiuc.h" -#include "gromacs/topology/idef.h" struct gmx_ffparams_t; struct t_forcerec; @@ -87,28 +86,28 @@ struct BondedCudaKernelParameters //! Scale factor float scaleFactor; //! The bonded types on GPU - int ftypesOnGpu[nFtypesOnGpu]; + int fTypesOnGpu[numFTypesOnGpu]; //! The number of interaction atom (iatom) elements for every function type - int nrFTypeIAtoms[nFtypesOnGpu]; + int numFTypeIAtoms[numFTypesOnGpu]; //! The number of bonds for every function type - int nrFTypeBonds[nFtypesOnGpu]; + int numFTypeBonds[numFTypesOnGpu]; //! The start index in the range of each interaction type - int ftypeRangeStart[nFtypesOnGpu]; + int fTypeRangeStart[numFTypesOnGpu]; //! The end index in the range of each interaction type - int ftypeRangeEnd[nFtypesOnGpu]; + int fTypeRangeEnd[numFTypesOnGpu]; //! Force parameters (on GPU) - t_iparams *forceparamsDevice; + t_iparams *d_forceParams; //! Coordinates before the timestep (on GPU) - const float4 *xqDevice; + const float4 *d_xq; //! Forces on atoms (on GPU) - fvec *forceDevice; + fvec *d_f; //! Force shifts on atoms (on GPU) - fvec *fshiftDevice; + fvec *d_fShift; //! Total Energy (on GPU) - float *vtotDevice; + float *d_vTot; //! Interaction list atoms (on GPU) - t_iatom *iatoms[nFtypesOnGpu]; + t_iatom *d_iatoms[numFTypesOnGpu]; BondedCudaKernelParameters() { @@ -116,12 +115,12 @@ struct BondedCudaKernelParameters setPbcAiuc(0, boxDummy, &pbcAiuc); - scaleFactor = 1.0; - forceparamsDevice = nullptr; - xqDevice = nullptr; - forceDevice = nullptr; - fshiftDevice = nullptr; - vtotDevice = nullptr; + scaleFactor = 1.0; + d_forceParams = nullptr; + d_xq = nullptr; + d_f = nullptr; + d_fShift = nullptr; + d_vTot = nullptr; } }; @@ -167,27 +166,27 @@ class GpuBonded::Impl * * \todo This is potentially several pinned allocations, which * could contribute to exhausting such pages. */ - std::array iLists; + std::array iLists_; + //! Tells whether there are any interaction in iLists. - bool haveInteractions_; + bool haveInteractions_; //! Interaction lists on the device. - t_ilist iListsDevice[F_NRE]; - + t_ilist d_iLists_[F_NRE]; //! Bonded parameters for device-side use. - t_iparams *forceparamsDevice = nullptr; + t_iparams *d_forceParams_ = nullptr; //! Position-charge vector on the device. - const float4 *xqDevice = nullptr; + const float4 *d_xq_ = nullptr; //! Force vector on the device. - fvec *forceDevice = nullptr; + fvec *d_f_ = nullptr; //! Shift force vector on the device. - fvec *fshiftDevice = nullptr; + fvec *d_fShift_ = nullptr; //! \brief Host-side virial buffer - HostVector vtot = {{}, gmx::HostAllocationPolicy(gmx::PinningPolicy::PinnedIfSupported)}; + HostVector vTot_ = {{}, gmx::HostAllocationPolicy(gmx::PinningPolicy::PinnedIfSupported)}; //! \brief Device-side total virial - float *vtotDevice = nullptr; + float *d_vTot_ = nullptr; //! \brief Bonded GPU stream, not owned by this module - CommandStream stream; + CommandStream stream_; //! 
diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu
index b0f19a8668..52a0091435 100644
--- a/src/gromacs/listed_forces/gpubondedkernels.cu
+++ b/src/gromacs/listed_forces/gpubondedkernels.cu
@@ -53,22 +53,13 @@
 #include <math_constants.h>
 
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
 #include "gromacs/math/units.h"
 #include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/ifunc.h"
 #include "gromacs/utility/gmxassert.h"
 
 #include "gpubonded_impl.h"
 
@@ -102,28 +93,28 @@ static void harmonic_gpu(const float kA, const float xA, const float x, float *V, float *F)
 
 template <bool calcVir, bool calcEner>
 __device__
-void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
-               const t_iatom forceatoms[], const t_iparams forceparams[],
-               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void bonds_gpu(const int i, float *vtot_loc, const int numBonds,
+               const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[3*i];
-        int ai   = forceatoms[3*i + 1];
-        int aj   = forceatoms[3*i + 2];
+        int type = d_forceatoms[3*i];
+        int ai   = d_forceatoms[3*i + 1];
+        int aj   = d_forceatoms[3*i + 2];
 
         /* dx = xi - xj, corrected for periodic boundary conditions. */
         fvec dx;
-        int  ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);
+        int  ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
 
         float dr2 = iprod_gpu(dx, dx);
         float dr  = sqrt(dr2);
 
         float vbond;
         float fbond;
-        harmonic_gpu(forceparams[type].harmonic.krA,
-                     forceparams[type].harmonic.rA,
+        harmonic_gpu(d_forceparams[type].harmonic.krA,
+                     d_forceparams[type].harmonic.rA,
                      dr, &vbond, &fbond);
 
         if (calcEner)
@@ -139,8 +130,8 @@ void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
             for (int m = 0; m < DIM; m++)
             {
                 float fij = fbond*dx[m];
-                atomicAdd(&force[ai][m], fij);
-                atomicAdd(&force[aj][m], -fij);
+                atomicAdd(&gm_f[ai][m], fij);
+                atomicAdd(&gm_f[aj][m], -fij);
                 if (calcVir && ki != CENTRAL)
                 {
                     atomicAdd(&sm_fShiftLoc[ki][m], fij);
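harmonic_gpu evaluates the harmonic bond potential V(x) = 1/2 kA (x - xA)^2, returning the energy in *V and the scalar factor F = -kA (x - xA) that bonds_gpu then projects along dx. A minimal sketch consistent with the call sites:

    __device__ __forceinline__
    static void harmonic_gpu(const float kA, const float xA, const float x,
                             float *V, float *F)
    {
        // V = 1/2 kA (x - xA)^2,  F = -dV/dx = -kA (x - xA)
        const float dx = x - xA;
        *F = -kA*dx;
        *V = 0.5f*kA*dx*dx;
    }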
@@ -170,17 +161,17 @@ static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
 
 template <bool calcVir, bool calcEner>
 __device__
-void angles_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void angles_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[4*i];
-        int ai   = forceatoms[4*i + 1];
-        int aj   = forceatoms[4*i + 2];
-        int ak   = forceatoms[4*i + 3];
+        int type = d_forceatoms[4*i];
+        int ai   = d_forceatoms[4*i + 1];
+        int aj   = d_forceatoms[4*i + 2];
+        int ak   = d_forceatoms[4*i + 3];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -188,13 +179,13 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t1;
         int   t2;
         float theta =
-            bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
-                                    r_ij, r_kj, &cos_theta, &t1, &t2);
+            bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
+                                    r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
         float dVdt;
-        harmonic_gpu(forceparams[type].harmonic.krA,
-                     forceparams[type].harmonic.rA*DEG2RAD,
+        harmonic_gpu(d_forceparams[type].harmonic.krA,
+                     d_forceparams[type].harmonic.rA*DEG2RAD,
                      theta, &va, &dVdt);
 
         if (calcEner)
@@ -226,9 +217,9 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
                 f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]);
                 f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]);
                 f_j[m] = -f_i[m] - f_k[m];
-                atomicAdd(&force[ai][m], f_i[m]);
-                atomicAdd(&force[aj][m], f_j[m]);
-                atomicAdd(&force[ak][m], f_k[m]);
+                atomicAdd(&gm_f[ai][m], f_i[m]);
+                atomicAdd(&gm_f[aj][m], f_j[m]);
+                atomicAdd(&gm_f[ak][m], f_k[m]);
                 if (calcVir)
                 {
                     atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
@@ -243,29 +234,29 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
 
 template <bool calcVir, bool calcEner>
 __device__
-void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
-                      const t_iatom forceatoms[], const t_iparams forceparams[],
-                      const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void urey_bradley_gpu(const int i, float *vtot_loc, const int numBonds,
+                      const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                      const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                       const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[4*i];
-        int ai   = forceatoms[4*i+1];
-        int aj   = forceatoms[4*i+2];
-        int ak   = forceatoms[4*i+3];
+        int type = d_forceatoms[4*i];
+        int ai   = d_forceatoms[4*i+1];
+        int aj   = d_forceatoms[4*i+2];
+        int ak   = d_forceatoms[4*i+3];
 
-        float th0A = forceparams[type].u_b.thetaA*DEG2RAD;
-        float kthA = forceparams[type].u_b.kthetaA;
-        float r13A = forceparams[type].u_b.r13A;
-        float kUBA = forceparams[type].u_b.kUBA;
+        float th0A = d_forceparams[type].u_b.thetaA*DEG2RAD;
+        float kthA = d_forceparams[type].u_b.kthetaA;
+        float r13A = d_forceparams[type].u_b.r13A;
+        float kUBA = d_forceparams[type].u_b.kUBA;
 
         fvec  r_ij;
         fvec  r_kj;
         float cos_theta;
         int   t1;
         int   t2;
-        float theta = bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
-                                              r_ij, r_kj, &cos_theta, &t1, &t2);
+        float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
+                                              r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
@@ -278,7 +269,7 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
         }
 
         fvec  r_ik;
-        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[ak], r_ik);
+        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
 
         float dr2 = iprod_gpu(r_ik, r_ik);
         float dr  = dr2*rsqrtf(dr2);
@@ -309,9 +300,9 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
                 f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
                 f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
                 f_j[m] = -f_i[m]-f_k[m];
-                atomicAdd(&force[ai][m], f_i[m]);
-                atomicAdd(&force[aj][m], f_j[m]);
-                atomicAdd(&force[ak][m], f_k[m]);
+                atomicAdd(&gm_f[ai][m], f_i[m]);
+                atomicAdd(&gm_f[aj][m], f_j[m]);
+                atomicAdd(&gm_f[ak][m], f_k[m]);
                 if (calcVir)
                 {
                     atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
@@ -335,8 +326,8 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
             for (int m = 0; m < DIM; m++)
             {
                 float fik = fbond*r_ik[m];
-                atomicAdd(&force[ai][m], fik);
-                atomicAdd(&force[ak][m], -fik);
+                atomicAdd(&gm_f[ai][m], fik);
+                atomicAdd(&gm_f[ak][m], -fik);
 
                 if (calcVir && ki != CENTRAL)
                 {
@@ -372,23 +363,23 @@ static float dih_angle_gpu(const T xi, const T xj, const T xk, const T xl,
 
 __device__ __forceinline__
 static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
-                        const float phi, float *V, float *F)
+                        const float phi, float *v, float *f)
 {
     float mdphi, sdphi;
 
     mdphi = mult*phi - phiA*DEG2RAD;
     sdphi = sinf(mdphi);
-    *V = cpA * (1.0f + cosf(mdphi));
-    *F = -cpA*mult*sdphi;
+    *v = cpA * (1.0f + cosf(mdphi));
+    *f = -cpA*mult*sdphi;
 }
 
 template <bool calcVir>
 __device__
 static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
                            const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
-                           const fvec m, const fvec n, fvec force[], fvec fshift[],
+                           const fvec m, const fvec n, fvec gm_f[], fvec sm_fShiftLoc[],
                            const PbcAiuc &pbcAiuc,
-                           const float4 xq[], const int t1, const int t2, const int gmx_unused t3)
+                           const float4 gm_xq[], const int t1, const int t2, const int gmx_unused t3)
 {
     float iprm = iprod_gpu(m, m);
     float iprn = iprod_gpu(n, n);
@@ -422,24 +413,24 @@ static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
 #pragma unroll
         for (int m = 0; (m < DIM); m++)
         {
-            atomicAdd(&force[i][m], f_i[m]);
-            atomicAdd(&force[j][m], -f_j[m]);
-            atomicAdd(&force[k][m], -f_k[m]);
-            atomicAdd(&force[l][m], f_l[m]);
+            atomicAdd(&gm_f[i][m], f_i[m]);
+            atomicAdd(&gm_f[j][m], -f_j[m]);
+            atomicAdd(&gm_f[k][m], -f_k[m]);
+            atomicAdd(&gm_f[l][m], f_l[m]);
         }
         if (calcVir)
         {
             fvec dx_jl;
-            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, xq[l], xq[j], dx_jl);
+            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
 
 #pragma unroll
             for (int m = 0; (m < DIM); m++)
             {
-                atomicAdd(&fshift[t1][m], f_i[m]);
-                atomicAdd(&fshift[CENTRAL][m], -f_j[m]);
-                atomicAdd(&fshift[t2][m], -f_k[m]);
-                atomicAdd(&fshift[t3][m], f_l[m]);
+                atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
+                atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f_j[m]);
+                atomicAdd(&sm_fShiftLoc[t2][m], -f_k[m]);
+                atomicAdd(&sm_fShiftLoc[t3][m], f_l[m]);
             }
         }
 }
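dopdihs_gpu implements the proper-dihedral potential V(phi) = cpA (1 + cos(m*phi - phi0)); v is the energy and f = -cpA*m*sin(m*phi - phi0) is the factor that do_dih_fup_gpu spreads over the four atoms. A worked check with illustrative numbers:

    // cpA = 10, mult = 3, phiA = 0, phi = 60 deg  ->  mdphi = 180 deg
    // v = 10*(1 + cos(180 deg)) = 10*(1 - 1) = 0   (potential minimum)
    // f = -10*3*sin(180 deg)    = 0                (zero generalized force)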
@@ -447,18 +438,18 @@
 
 template <bool calcVir, bool calcEner>
 __device__
-void pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
-               const t_iatom forceatoms[], const t_iparams forceparams[],
-               const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void pdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+               const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[5*i];
-        int ai   = forceatoms[5*i + 1];
-        int aj   = forceatoms[5*i + 2];
-        int ak   = forceatoms[5*i + 3];
-        int al   = forceatoms[5*i + 4];
+        int type = d_forceatoms[5*i];
+        int ai   = d_forceatoms[5*i + 1];
+        int aj   = d_forceatoms[5*i + 2];
+        int ak   = d_forceatoms[5*i + 3];
+        int al   = d_forceatoms[5*i + 4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -469,14 +460,14 @@ void pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         float vpd;
         float ddphi;
-        dopdihs_gpu(forceparams[type].pdihs.cpA,
-                    forceparams[type].pdihs.phiA,
-                    forceparams[type].pdihs.mult,
+        dopdihs_gpu(d_forceparams[type].pdihs.cpA,
+                    d_forceparams[type].pdihs.phiA,
+                    d_forceparams[type].pdihs.mult,
                     phi, &vpd, &ddphi);
 
         if (calcEner)
@@ -486,28 +477,28 @@ void pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
     }
 }
 
 template <bool calcVir, bool calcEner>
 __device__
-void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void rbdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
     constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[5*i];
-        int ai   = forceatoms[5*i+1];
-        int aj   = forceatoms[5*i+2];
-        int ak   = forceatoms[5*i+3];
-        int al   = forceatoms[5*i+4];
+        int type = d_forceatoms[5*i];
+        int ai   = d_forceatoms[5*i+1];
+        int aj   = d_forceatoms[5*i+2];
+        int ak   = d_forceatoms[5*i+3];
+        int al   = d_forceatoms[5*i+4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -518,7 +509,7 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* Change to polymer convention */
@@ -538,7 +529,7 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         float parm[NR_RBDIHS];
         for (int j = 0; j < NR_RBDIHS; j++)
         {
-            parm[j] = forceparams[type].rbdihs.rbcA[j];
+            parm[j] = d_forceparams[type].rbdihs.rbcA[j];
         }
         /* Calculate cosine powers */
         /* Calculate the energy */
@@ -587,8 +578,8 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
 
         if (calcEner)
         {
             *vtot_loc += v;
@@ -612,18 +603,18 @@ static void make_dp_periodic_gpu(float *dp)
 
 template <bool calcVir, bool calcEner>
 __device__
-void idihs_gpu(const int i, float *vtot_loc, const int nBonds,
-               const t_iatom forceatoms[], const t_iparams forceparams[],
-               const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void idihs_gpu(const int i, float *vtot_loc, const int numBonds,
+               const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[5*i];
-        int ai   = forceatoms[5*i + 1];
-        int aj   = forceatoms[5*i + 2];
-        int ak   = forceatoms[5*i + 3];
-        int al   = forceatoms[5*i + 4];
+        int type = d_forceatoms[5*i];
+        int ai   = d_forceatoms[5*i + 1];
+        int aj   = d_forceatoms[5*i + 2];
+        int ak   = d_forceatoms[5*i + 3];
+        int al   = d_forceatoms[5*i + 4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -634,7 +625,7 @@ void idihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
         * force changes if we just apply a normal harmonic.
         * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
         * This means we will never have the periodicity problem, unless
         * the dihedral is Pi away from phi0, which is very unlikely due to
         * the potential.
         */
-        float kA = forceparams[type].harmonic.krA;
-        float pA = forceparams[type].harmonic.rA;
+        float kA = d_forceparams[type].harmonic.krA;
+        float pA = d_forceparams[type].harmonic.rA;
 
         float phi0 = pA*DEG2RAD;
@@ -657,8 +648,8 @@ void idihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 -ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
 
         if (calcEner)
         {
@@ -669,26 +660,26 @@
 
 template <bool calcVir, bool calcEner>
 __device__
-void pairs_gpu(const int i, const int nBonds,
+void pairs_gpu(const int i, const int numBonds,
                const t_iatom iatoms[], const t_iparams iparams[],
-               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc,
                const float scale_factor,
                float *vtotVdw_loc, float *vtotElec_loc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
         int itype = iatoms[3*i];
         int ai    = iatoms[3*i + 1];
         int aj    = iatoms[3*i + 2];
 
-        float qq  = xq[ai].w*xq[aj].w;
+        float qq  = gm_xq[ai].w*gm_xq[aj].w;
         float c6  = iparams[itype].lj14.c6A;
         float c12 = iparams[itype].lj14.c12A;
 
         /* Do we need to apply full periodic boundary conditions? */
         fvec  dr;
-        int   fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dr);
+        int   fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
 
         float r2   = norm2_gpu(dr);
         float rinv = rsqrtf(r2);
@@ -709,8 +700,8 @@ void pairs_gpu(const int i, const int nBonds,
 #pragma unroll
         for (int m = 0; m < DIM; m++)
         {
-            atomicAdd(&force[ai][m], f[m]);
-            atomicAdd(&force[aj][m], -f[m]);
+            atomicAdd(&gm_f[ai][m], f[m]);
+            atomicAdd(&gm_f[aj][m], -f[m]);
             if (calcVir && fshift_index != CENTRAL)
             {
                 atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
@@ -734,7 +725,7 @@
 __global__
 void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 {
     assert(blockDim.y == 1 && blockDim.z == 1);
-    const int threadIndex   = blockIdx.x*blockDim.x+threadIdx.x;
+    const int tid           = blockIdx.x*blockDim.x+threadIdx.x;
     float     vtot_loc      = 0;
     float     vtotVdw_loc   = 0;
     float     vtotElec_loc  = 0;
@@ -751,53 +742,52 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
         __syncthreads();
     }
 
-    int  ftype;
+    int  fType;
     bool threadComputedPotential = false;
 #pragma unroll
-    for (int j = 0; j < nFtypesOnGpu; j++)
+    for (int j = 0; j < numFTypesOnGpu; j++)
     {
-        if (threadIndex >= kernelParams.ftypeRangeStart[j] && threadIndex <= kernelParams.ftypeRangeEnd[j])
+        if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
         {
-            const int      nBonds = kernelParams.nrFTypeBonds[j];
-
-            int            localThreadIndex = threadIndex - kernelParams.ftypeRangeStart[j];
-            const t_iatom *iatoms           = kernelParams.iatoms[j];
-            ftype                           = kernelParams.ftypesOnGpu[j];
+            const int      numBonds = kernelParams.numFTypeBonds[j];
+            int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
+            const t_iatom *iatoms   = kernelParams.d_iatoms[j];
+            fType                   = kernelParams.fTypesOnGpu[j];
             if (calcEner)
             {
                 threadComputedPotential = true;
             }
 
-            switch (ftype)
+            switch (fType)
             {
                 case F_BONDS:
-                    bonds_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_ANGLES:
-                    angles_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    angles_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                 case F_UREY_BRADLEY:
-                    urey_bradley_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                        kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    urey_bradley_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                        kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_PDIHS:
                 case F_PIDIHS:
-                    pdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                case F_RBDIHS:
-                    rbdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    rbdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                 case F_IDIHS:
-                    idihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_LJ14:
-                    pairs_gpu<calcVir, calcEner>(localThreadIndex, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc,
+                    pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
                                                  kernelParams.scaleFactor, &vtotVdw_loc, &vtotElec_loc);
                     break;
             }
        }
    }
 
@@ -807,9 +797,9 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
     if (threadComputedPotential)
     {
-        float *vtotVdw  = kernelParams.vtotDevice + F_LJ14;
-        float *vtotElec = kernelParams.vtotDevice + F_COUL14;
-        atomicAdd(kernelParams.vtotDevice + ftype, vtot_loc);
+        float *vtotVdw  = kernelParams.d_vTot + F_LJ14;
+        float *vtotElec = kernelParams.d_vTot + F_COUL14;
+        atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
         atomicAdd(vtotVdw, vtotVdw_loc);
         atomicAdd(vtotElec, vtotElec_loc);
     }
@@ -819,7 +809,7 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
         __syncthreads();
         if (threadIdx.x < SHIFTS)
         {
-            fvec_inc_atomic(kernelParams.fshiftDevice[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
+            fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
         }
     }
 }
 
@@ -840,9 +830,9 @@ GpuBonded::Impl::launchKernel(const t_forcerec *fr,
     PbcAiuc pbcAiuc;
     setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
 
-    int ftypeRangeEnd = kernelParams_.ftypeRangeEnd[nFtypesOnGpu - 1];
+    int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 
-    if (ftypeRangeEnd < 0)
+    if (fTypeRangeEnd < 0)
     {
         return;
     }
@@ -851,10 +841,10 @@ GpuBonded::Impl::launchKernel(const t_forcerec *fr,
     config.blockSize[0] = TPB_BONDED;
     config.blockSize[1] = 1;
     config.blockSize[2] = 1;
-    config.gridSize[0]  = (ftypeRangeEnd + TPB_BONDED)/TPB_BONDED;
+    config.gridSize[0]  = (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED;
     config.gridSize[1]  = 1;
     config.gridSize[2]  = 1;
-    config.stream       = stream;
+    config.stream       = stream_;
 
     auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
     kernelParams_.scaleFactor = fr->ic->epsfac*fr->fudgeQQ;
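fTypeRangeEnd is the last thread index, inclusive and warp-aligned, so (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED is integer shorthand for ceil((fTypeRangeEnd + 1)/TPB_BONDED). With a block size of 256 (illustrative):

    // fTypeRangeEnd = 511 -> gridSize[0] = (511 + 256)/256 = 2 blocks (512 threads)
    // fTypeRangeEnd = 512 -> gridSize[0] = (512 + 256)/256 = 3 blocks (768 threads)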
diff --git a/src/gromacs/listed_forces/manage_threading.cpp b/src/gromacs/listed_forces/manage_threading.cpp
index ee89e9b69a..3ad73d5dc1 100644
--- a/src/gromacs/listed_forces/manage_threading.cpp
+++ b/src/gromacs/listed_forces/manage_threading.cpp
@@ -76,13 +76,13 @@ typedef struct {
 
 /*! \brief Divides listed interactions over threads
 *
- * This routine attempts to divide all interactions of the ntype bonded
+ * This routine attempts to divide all interactions of the numType bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
 static void divide_bondeds_by_locality(bonded_threading_t *bt,
-                                       int                 ntype,
+                                       int                 numType,
                                        const ilist_data_t *ild)
 {
     int nat_tot, nat_sum;
     int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
     int f, t;
 
-    assert(ntype <= F_NRE);
+    assert(numType <= F_NRE);
 
     nat_tot = 0;
-    for (f = 0; f < ntype; f++)
+    for (f = 0; f < numType; f++)
     {
         /* Sum #bondeds*#atoms_per_bond over all bonded types */
         nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
@@ -108,7 +108,7 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
     /* Loop over the end bounds of the nthreads threads to determine
     * which interactions threads 0 to nthreads shall calculate.
     *
-     * NOTE: The cost of these combined loops is #interactions*ntype.
+     * NOTE: The cost of these combined loops is #interactions*numType.
     * This code is running single threaded (difficult to parallelize
     * over threads). So the relative cost of this function increases
     * linearly with the number of threads. Since the inner-most loop
@@ -146,14 +146,14 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
 
             /* Find out which of the types has the lowest atom index */
             f_min = 0;
-            for (f = 1; f < ntype; f++)
+            for (f = 1; f < numType; f++)
             {
                 if (at_ind[f] < at_ind[f_min])
                 {
                     f_min = f;
                 }
             }
-            assert(f_min >= 0 && f_min < ntype);
+            assert(f_min >= 0 && f_min < numType);
 
             /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
@@ -177,13 +177,13 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
         }
 
         /* Store the bonded end boundaries (at index t) for thread t-1 */
-        for (f = 0; f < ntype; f++)
+        for (f = 0; f < numType; f++)
         {
             bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
         }
     }
 
-    for (f = 0; f < ntype; f++)
+    for (f = 0; f < numType; f++)
     {
         assert(ind[f] == ild[f].il->nr);
     }
@@ -211,29 +211,29 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
     assert(bt->nthreads > 0);
 
     bt->haveBondeds = false;
-    int    ntype         = 0;
-    size_t ftypeGpuIndex = 0;
-    for (int ftype = 0; ftype < F_NRE; ftype++)
+    int    numType       = 0;
+    size_t fTypeGpuIndex = 0;
+    for (int fType = 0; fType < F_NRE; fType++)
     {
-        if (!ftype_is_bonded_potential(ftype))
+        if (!ftype_is_bonded_potential(fType))
         {
             continue;
         }
 
-        const t_ilist &il                     = idef.il[ftype];
+        const t_ilist &il                     = idef.il[fType];
         int            nrToAssignToCpuThreads = il.nr;
 
         if (useGpuForBondeds &&
-            ftypeGpuIndex < gmx::ftypesOnGpu.size() &&
-            gmx::ftypesOnGpu[ftypeGpuIndex] == ftype)
+            fTypeGpuIndex < gmx::fTypesOnGpu.size() &&
+            gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
         {
-            ftypeGpuIndex++;
+            fTypeGpuIndex++;
 
             /* Perturbation is not implemented in the GPU bonded kernels.
             * But instead of doing all on the CPU, we could do only
             * the actually perturbed interactions on the CPU.
             */
-            if (!ftypeHasPerturbedEntries(idef, ftype))
+            if (!ftypeHasPerturbedEntries(idef, fType))
             {
                 /* We will assign this interaction type to the GPU */
                 nrToAssignToCpuThreads = 0;
@@ -250,23 +250,23 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
             /* No interactions, avoid all the integer math below */
             for (int t = 0; t <= bt->nthreads; t++)
             {
-                bt->workDivision.setBound(ftype, t, 0);
+                bt->workDivision.setBound(fType, t, 0);
             }
         }
-        else if (bt->nthreads <= bt->max_nthread_uniform || ftype == F_DISRES)
+        else if (bt->nthreads <= bt->max_nthread_uniform || fType == F_DISRES)
         {
             /* On up to 4 threads, load balancing the bonded work
             * is more important than minimizing the reduction cost.
             */
-            const int stride = 1 + NRAL(ftype);
+            const int stride = 1 + NRAL(fType);
 
             for (int t = 0; t <= bt->nthreads; t++)
             {
                 /* Divide equally over the threads */
                 int nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride;
 
-                if (ftype == F_DISRES)
+                if (fType == F_DISRES)
                 {
                     /* Ensure that distance restraint pairs with the same label
                     * end up on the same thread.
@@ -279,27 +279,27 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
                     }
                 }
 
-                bt->workDivision.setBound(ftype, t, nr_t);
+                bt->workDivision.setBound(fType, t, nr_t);
             }
         }
         else
         {
-            /* Add this ftype to the list to be distributed */
-            int nat          = NRAL(ftype);
-            ild[ntype].ftype = ftype;
-            ild[ntype].il    = &il;
-            ild[ntype].nat   = nat;
+            /* Add this fType to the list to be distributed */
+            int nat            = NRAL(fType);
+            ild[numType].ftype = fType;
+            ild[numType].il    = &il;
+            ild[numType].nat   = nat;
 
             /* The first index for the thread division is always 0 */
-            bt->workDivision.setBound(ftype, 0, 0);
+            bt->workDivision.setBound(fType, 0, 0);
 
-            ntype++;
+            numType++;
         }
     }
 
-    if (ntype > 0)
+    if (numType > 0)
     {
-        divide_bondeds_by_locality(bt, ntype, ild);
+        divide_bondeds_by_locality(bt, numType, ild);
     }
 
     if (debug)
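The uniform branch above divides work with nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride, which keeps every thread boundary a multiple of stride (one record = 1 type index + NRAL(fType) atom indices) while staying as even as integer division allows. A worked example with illustrative numbers:

    // 100 F_BONDS records: stride = 1 + NRAL(F_BONDS) = 3, il.nr = 300,
    // nthreads = 4 -> bounds at t = 0..4: 0, 75, 150, 225, 300
    // i.e. 25 bonds per thread, each boundary aligned to a whole record.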