dev-manual/build-system.rst
dev-manual/commitstyle.rst
dev-manual/documentation-generation.rst
+ dev-manual/contribute.rst
dev-manual/doxygen.rst
dev-manual/error-handling.rst
dev-manual/formatting.rst
fragments/doxygen-links.rst
install-guide/index.rst
release-notes/index.rst
- release-notes/features.rst
- release-notes/performance.rst
- release-notes/tools.rst
- release-notes/bugs-fixed.rst
- release-notes/removed-features.rst
- release-notes/portability.rst
- release-notes/miscellaneous.rst
+ release-notes/2018/2018.1.rst
+ release-notes/2018/major/highlights.rst
+ release-notes/2018/major/features.rst
+ release-notes/2018/major/performance.rst
+ release-notes/2018/major/tools.rst
+ release-notes/2018/major/bugs-fixed.rst
+ release-notes/2018/major/removed-features.rst
+ release-notes/2018/major/portability.rst
+ release-notes/2018/major/miscellaneous.rst
user-guide/index.rst
user-guide/cutoff-schemes.rst
user-guide/getting-started.rst
struct PmeShared
{
/*! \brief Grid count - currently always 1 on GPU */
- int ngrids;
+ int ngrids;
/*! \brief Grid dimensions - nkx, nky, nkz */
- int nk[DIM];
- /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz
- * TODO: find out if these are really needed for the CPU FFT compatibility.
- */
- int pmegrid_n[DIM];
+ int nk[DIM];
/*! \brief PME interpolation order */
int pme_order;
/*! \brief Ewald splitting coefficient for Coulomb */
/*! \brief A pointer to the device used during the execution. */
gmx_device_info_t *deviceInfo;
+ /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
+ * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
+ */
+ std::intmax_t maxGridWidthX;
+
/*! \brief A single structure encompassing all the PME data used on GPU.
* Its value is the only argument to all the PME GPU kernels.
* \todo Test whether this should be copied to the constant GPU memory once for each computation
//! Spreading max block size in threads
constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
-//! Texture references for CC 2.x
-texture<int, 1, cudaReadModeElementType> gridlineIndicesTableTextureRef;
-texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
-
-/*! Returns the reference to the gridlineIndices texture. */
-const struct texture<int, 1, cudaReadModeElementType> &pme_gpu_get_gridline_texref()
-{
- return gridlineIndicesTableTextureRef;
-}
-
-/*! Returns the reference to the fractShifts texture. */
-const struct texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref()
-{
- return fractShiftsTableTextureRef;
-}
/*! \brief
* General purpose function for loading atom-related data from global to shared memory.
const T * __restrict__ gm_source)
{
static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
const int localIndex = threadLocalIndex;
- const int globalIndexBase = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+ const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
sm_fractCoords[sharedMemoryIndex] +=
fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
kernelParams.fractShiftsTableTexture,
-#if DISABLE_CUDA_TEXTURES == 0
- fractShiftsTableTextureRef,
-#endif
tableIndex);
sm_gridlineIndices[sharedMemoryIndex] =
fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
kernelParams.gridlineIndicesTableTexture,
-#if DISABLE_CUDA_TEXTURES == 0
- gridlineIndicesTableTextureRef,
-#endif
tableIndex);
gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
}
// Spline values
__shared__ float sm_theta[atomsPerBlock * DIM * order];
- const int atomIndexOffset = blockIdx.x * atomsPerBlock;
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+ const int atomIndexOffset = blockIndex * atomsPerBlock;
+
+ /* Early return for fully empty blocks at the end
+ * (should only happen on Fermi or billions of input atoms)
+ */
+ if (atomIndexOffset >= kernelParams.atoms.nAtoms)
+ {
+ return;
+ }
/* Staging coefficients/charges for both spline and spread */
pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
//(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
- dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+ const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
dim3 dimBlock(order, order, atomsPerBlock);
// These should later check for PME decomposition
if (spreadCharges)
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
- pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
}
else
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
- pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
}
else
{
pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
- pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
}
initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
kernelParamsPtr->fractShiftsTableTexture,
- &pme_gpu_get_fract_shifts_texref(),
pmeGpu->common->fsh.data(),
newFractShiftsSize,
pmeGpu->deviceInfo);
initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
kernelParamsPtr->gridlineIndicesTableTexture,
- &pme_gpu_get_gridline_texref(),
pmeGpu->common->nn.data(),
newFractShiftsSize,
pmeGpu->deviceInfo);
auto *kernelParamsPtr = pmeGpu->kernelParams.get();
destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
kernelParamsPtr->fractShiftsTableTexture,
- &pme_gpu_get_fract_shifts_texref(),
pmeGpu->deviceInfo);
destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
kernelParamsPtr->gridlineIndicesTableTexture,
- &pme_gpu_get_gridline_texref(),
pmeGpu->deviceInfo);
}
// TODO: Consider turning on by default when we can detect nr of streams.
pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+
/* Creating a PME CUDA stream */
cudaError_t stat;
int highest_priority, lowest_priority;
return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
}
+ /*! \brief \internal
+ * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
+ * to minimize number of unused blocks.
+ */
+ template <typename PmeGpu>
+ dim3 __host__ inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount)
+ {
+ // How many maximum widths in X do we need (hopefully just one)
+ const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
+ // Trying to make things even
+ const int colCount = (blockCount + minRowCount - 1) / minRowCount;
+ GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
+ GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount, "pmeGpuCreateGrid: excessive blocks");
+ return dim3(colCount, minRowCount);
+ }
+
/*! \brief \internal
* The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
*/
cudaTextureObject_t gridlineIndicesTableTexture;
};
-/* CUDA texture reference functions which reside in respective kernel files
- * (due to texture references having scope of a translation unit).
- */
-/*! Returns the reference to the gridlineIndices texture. */
-const struct texture<int, 1, cudaReadModeElementType> &pme_gpu_get_gridline_texref();
-/*! Returns the reference to the fractShifts texture. */
-const struct texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref();
-
#endif
#include "gromacs/gmxpreprocess/vsite_parm.h"
#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
-#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/pbcutil/pbc.h"
/* Check mass and charge */
q = 0.0;
- for (mb = 0; mb < mtop->nmoltype; mb++)
+ for (mb = 0; mb < mtop->nmolblock; mb++)
{
atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
nmol = mtop->molblock[mb].nmol;
}
-static int
-find_gb_bondlength(t_params *plist, int ai, int aj, real *length)
-{
- int i, j, a1, a2;
-
- int found = 0;
- int status;
-
- for (i = 0; i < F_NRE && !found; i++)
- {
- if (IS_CHEMBOND(i))
- {
- for (j = 0; j < plist[i].nr; j++)
- {
- a1 = plist[i].param[j].a[0];
- a2 = plist[i].param[j].a[1];
-
- if ( (a1 == ai && a2 == aj) || (a1 == aj && a2 == ai))
- {
- /* Equilibrium bond distance */
- *length = plist[i].param[j].c[0];
- found = 1;
- }
- }
- }
- }
- status = !found;
-
- return status;
-}
-
-
-static int
-find_gb_anglelength(t_params *plist, int ai, int ak, real *length)
-{
- int i, j, a1, a2, a3;
- real r12, r23, a123;
- int found = 0;
- int status, status1, status2;
-
- r12 = r23 = 0;
-
- for (i = 0; i < F_NRE && !found; i++)
- {
- if (IS_ANGLE(i))
- {
- for (j = 0; j < plist[i].nr; j++)
- {
- a1 = plist[i].param[j].a[0];
- a2 = plist[i].param[j].a[1];
- a3 = plist[i].param[j].a[2];
-
- /* We dont care what the middle atom is, but use it below */
- if ( (a1 == ai && a3 == ak) || (a1 == ak && a3 == ai) )
- {
- /* Equilibrium bond distance */
- a123 = plist[i].param[j].c[0];
- /* Use middle atom to find reference distances r12 and r23 */
- status1 = find_gb_bondlength(plist, a1, a2, &r12);
- status2 = find_gb_bondlength(plist, a2, a3, &r23);
-
- if (status1 == 0 && status2 == 0)
- {
- /* cosine theorem to get r13 */
- *length = std::sqrt(r12*r12+r23*r23-(2*r12*r23*cos(a123/RAD2DEG)));
- found = 1;
- }
- }
- }
- }
- }
- status = !found;
-
- return status;
-}
-
-static int
-generate_gb_exclusion_interactions(t_molinfo *mi, gpp_atomtype_t atype, t_nextnb *nnb)
-{
- int j, n, ai, aj, ti, tj;
- int ftype;
- t_param param;
- t_params * plist;
- t_atoms * at;
- real radiusi, radiusj;
- real gb_radiusi, gb_radiusj;
- real param_c2, param_c4;
- real distance;
-
- plist = mi->plist;
- at = &mi->atoms;
-
- for (n = 1; n <= nnb->nrex; n++)
- {
- switch (n)
- {
- case 1:
- ftype = F_GB12;
- param_c2 = STILL_P2;
- param_c4 = 0.8875;
- break;
- case 2:
- ftype = F_GB13;
- param_c2 = STILL_P3;
- param_c4 = 0.3516;
- break;
- default:
- /* Put all higher-order exclusions into 1,4 list so we dont miss them */
- ftype = F_GB14;
- param_c2 = STILL_P3;
- param_c4 = 0.3516;
- break;
- }
-
- for (ai = 0; ai < nnb->nr; ai++)
- {
- ti = at->atom[ai].type;
- radiusi = get_atomtype_radius(ti, atype);
- gb_radiusi = get_atomtype_gb_radius(ti, atype);
-
- for (j = 0; j < nnb->nrexcl[ai][n]; j++)
- {
- aj = nnb->a[ai][n][j];
-
- /* Only add the interactions once */
- if (aj > ai)
- {
- tj = at->atom[aj].type;
- radiusj = get_atomtype_radius(tj, atype);
- gb_radiusj = get_atomtype_gb_radius(tj, atype);
-
- /* There is an exclusion of type "ftype" between atoms ai and aj */
- param.a[0] = ai;
- param.a[1] = aj;
-
- /* Reference distance, not used for 1-4 interactions */
- switch (ftype)
- {
- case F_GB12:
- if (find_gb_bondlength(plist, ai, aj, &distance) != 0)
- {
- gmx_fatal(FARGS, "Cannot find bond length for atoms %d-%d", ai, aj);
- }
- break;
- case F_GB13:
- if (find_gb_anglelength(plist, ai, aj, &distance) != 0)
- {
- gmx_fatal(FARGS, "Cannot find length for atoms %d-%d involved in angle", ai, aj);
- }
- break;
- default:
- distance = -1;
- break;
- }
- /* Assign GB parameters */
- /* Sum of radii */
- param.c[0] = radiusi+radiusj;
- /* Reference distance distance */
- param.c[1] = distance;
- /* Still parameter */
- param.c[2] = param_c2;
- /* GB radius */
- param.c[3] = gb_radiusi+gb_radiusj;
- /* Parameter */
- param.c[4] = param_c4;
-
- /* Add it to the parameter list */
- add_param_to_list(&plist[ftype], ¶m);
- }
- }
- }
- }
- return 0;
-}
-
-
static void make_atoms_sys(int nmolb, const gmx_molblock_t *molb,
const t_molinfo *molinfo,
t_atoms *atoms)
int *nmolblock,
gmx_molblock_t **molblock,
gmx_bool bFEP,
- gmx_bool bGenborn,
gmx_bool bZero,
gmx_bool usingFullRangeElectrostatics,
warninp_t wi)
*/
case d_implicit_genborn_params:
- push_gb_params(atype, pline, wi);
+ // Skip this line, so old topologies with
+ // GB parameters can be read.
break;
case d_implicit_surface_params:
- gmx_fatal(FARGS, "Implicit surface directive not supported yet.");
+ // Skip this line, so that any topologies
+ // with surface parameters can be read
+ // (even though these were never formally
+ // supported).
break;
case d_cmaptypes:
- /* nnb contains information about first,2nd,3rd bonded neighbors.
- * Use this to generate GB 1-2,1-3,1-4 interactions when necessary.
- */
- if (bGenborn == TRUE)
- {
- generate_gb_exclusion_interactions(mi0, atype, &nnb);
- }
-
done_nnb(&nnb);
if (bCouple)
const t_inputrec *ir,
int *nmolblock,
gmx_molblock_t **molblock,
- gmx_bool bGenborn,
warninp_t wi)
{
/* Tmpfile might contain a long path */
nrmols, molinfo, intermolecular_interactions,
plist, combination_rule, repulsion_power,
opts, fudgeQQ, nmolblock, molblock,
- ir->efep != efepNO, bGenborn, bZero,
+ ir->efep != efepNO, bZero,
EEL_FULL(ir->coulombtype), wi);
if ((*combination_rule != eCOMB_GEOMETRIC) &&
{ "They don't have half hours in the north", "Carl Caleman" },
{ "Safety lights are for dudes", "Ghostbusters 2016" },
{ "It's 2040 now. Our President is a plant.", "Ghostbusters 2016" },
- { "It's just B I O L O G Y, can't you see?" "Joe Jackson" },
+ { "It's just B I O L O G Y, can't you see?", "Joe Jackson" },
{ "Input, output, electricity", "Joni Mitchell" },
{ "Your daddy ain't your daddy but your daddy don't know", "Dalahan" },
{ "Why is the Earth moving 'round the sun? Floating in the vacuum with no purpose, not a one", "Fleet Foxes" },
{ "A computer once beat me at chess, but it was no match for me at kick boxing.", "Emo Philips" },
{ "Home computers are being called upon to perform many new functions, including the consumption of homework formerly eaten by the dog.", "Doug Larson" },
{ "Forcefields are like dating; things go fine for a while and then sometimes it goes really bad.", "Alex MacKerell" },
- { "This type of advanced sampling techniques... which are not so advanced, really.", "Viveca Lindahl, on AWH, at her thesis defense." }
+ { "This type of advanced sampling techniques... which are not so advanced, really.", "Viveca Lindahl, on AWH, at her thesis defense." },
+ { "C++ is tricky. You can do everything. You can even make every mistake.", "Nicolai Josuttis, CppCon2017" },
};
if (beCool())