nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
}
- const int nth = useGpu ? 1 : gmx_omp_nthreads_get(emntPairsearch);
-
-#pragma omp parallel for num_threads(nth) schedule(static)
- for (int th = 0; th < nth; th++)
+ if (useGpu)
{
- try
+ for (int g = gridBegin; g < gridEnd; g++)
{
- for (int g = gridBegin; g < gridEnd; g++)
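+                // All columns of a grid are converted in a single GPU kernel launch.
+                // Filler coordinates only need to be set for the local grid (g == 0);
+                // for the other grids they have been set during pair-list generation.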
+ nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
+ FillLocal && g == 0,
+ gpu_nbv,
+ xPmeDevicePtr,
+ locality,
+ x);
+ }
+ }
+ else
+ {
+ const int nth = gmx_omp_nthreads_get(emntPairsearch);
+#pragma omp parallel for num_threads(nth) schedule(static)
+ for (int th = 0; th < nth; th++)
+ {
+ try
{
- const Nbnxm::Grid &grid = gridSet.grids()[g];
-
- int maxAtomsInColumn = 0;
-
- const int numCellsXY = grid.numColumns();
-
- const int cxy0 = (numCellsXY* th + nth - 1)/nth;
- const int cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth;
-
- for (int cxy = cxy0; cxy < cxy1; cxy++)
+ for (int g = gridBegin; g < gridEnd; g++)
{
- const int na = grid.numAtomsInColumn(cxy);
- const int ash = grid.firstAtomInColumn(cxy);
+ const Nbnxm::Grid &grid = gridSet.grids()[g];
+ const int numCellsXY = grid.numColumns();
- int na_fill;
- if (g == 0 && FillLocal)
- {
- na_fill = grid.paddedNumAtomsInColumn(cxy);
- }
- else
- {
- /* We fill only the real particle locations.
- * We assume the filling entries at the end have been
- * properly set before during pair-list generation.
- */
- na_fill = na;
- }
- if (useGpu)
- {
- // All columns will be processed in a single GPU kernel (below).
- // We need to determine the maximum number of atoms in a column
- maxAtomsInColumn = std::max(maxAtomsInColumn, na);
- }
- else
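+                    // Divide the grid columns over the OpenMP threads; the ceiling
+                    // division gives each thread a contiguous, near-equal range of columns.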
+ const int cxy0 = (numCellsXY* th + nth - 1)/nth;
+ const int cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth;
+
+ for (int cxy = cxy0; cxy < cxy1; cxy++)
{
+ const int na = grid.numAtomsInColumn(cxy);
+ const int ash = grid.firstAtomInColumn(cxy);
+
+ int na_fill;
+ if (g == 0 && FillLocal)
+ {
+ na_fill = grid.paddedNumAtomsInColumn(cxy);
+ }
+ else
+ {
+                        /* We fill only the real particle locations.
+                         * We assume the filler entries at the end have
+                         * already been set properly during pair-list
+                         * generation.
+                         */
+ na_fill = na;
+ }
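+                        // Copy the na real atoms into the nbnxm layout at this
+                        // column's offset, filling up to na_fill entries.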
copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash,
na, na_fill, x,
nbat->XFormat, nbat->x().data(), ash);
}
}
- if (useGpu)
- {
- nbnxn_gpu_x_to_nbat_x(gridSet,
- g,
- FillLocal,
- gpu_nbv,
- xPmeDevicePtr,
- maxAtomsInColumn,
- locality,
- x);
- }
}
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
- GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
}
 * \param[in] numColumns number of grid columns (extent of column-level parallelism)
* \param[out] xnb position buffer in nbnxm layout
- * \param[in] gridIndex grid index
- * \param[in] FillLocal boolean to specify if Fill Local is true
+ * \param[in] setFillerCoords tells whether to set the coordinates of the filler particles
* \param[in] x position buffer
 * \param[in] a mapping from the nbnxm atom order to source atom indices
 * \param[in] cxy_na number of atoms in each column
*/
__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,
float * __restrict__ xnb,
- int gridIndex,
- bool FillLocal,
+ bool setFillerCoords,
const rvec * __restrict__ x,
const int * __restrict__ a,
const int * __restrict__ cxy_na,
__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,
float * __restrict__ xnb,
- int gridIndex,
- bool FillLocal,
+ bool setFillerCoords,
const rvec * __restrict__ x,
const int * __restrict__ a,
const int * __restrict__ cxy_na,
int na = cxy_na[cxy];
int a0 = (cellOffset + cxy_ind[cxy])*numAtomsPerCell;
int na_round;
- if (gridIndex == 0 && FillLocal)
+ if (setFillerCoords)
{
// TODO: This can be done more efficiently
na_round =
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/gridset.h"
+#include "gromacs/nbnxm/grid.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh"
}
/* X buffer operations on GPU: performs conversion from rvec to nbnxm layout. */
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
- int gridIndex,
- bool FillLocal,
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
+ bool setFillerCoords,
gmx_nbnxn_gpu_t *nb,
void *xPmeDevicePtr,
- int maxAtomsInColumn,
const Nbnxm::AtomLocality locality,
const rvec *x)
{
cu_atomdata_t *adat = nb->atdat;
bool bDoTime = nb->bDoTime;
- const Nbnxm::Grid &grid = gridSet.grids()[gridIndex];
-
const int numColumns = grid.numColumns();
const int cellOffset = grid.cellOffset();
const int numAtomsPerCell = grid.numAtomsPerCell();
+    // TODO: Document this; one cannot infer the interaction locality from the atom locality.
Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
- int nCopyAtoms = gridSet.numRealAtomsLocal();
- int copyAtomStart = 0;
+ int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
+ int copyAtomStart = grid.srcAtomBegin();
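+    // With per-grid conversion, the grid's source atom range determines
+    // the extent of the coordinate copy for both localities.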
if (locality == Nbnxm::AtomLocality::NonLocal)
{
interactionLoc = Nbnxm::InteractionLocality::NonLocal;
- nCopyAtoms = gridSet.numRealAtomsTotal()-gridSet.numRealAtomsLocal();
- copyAtomStart = gridSet.numRealAtomsLocal();
}
cudaStream_t stream = nb->stream[interactionLoc];
config.blockSize[0] = threadsPerBlock;
config.blockSize[1] = 1;
config.blockSize[2] = 1;
- config.gridSize[0] = ((maxAtomsInColumn+1)+threadsPerBlock-1)/threadsPerBlock;
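+    // One block row (y) per grid column; enough blocks in x to cover the atom
+    // slots of the tallest column (max cells per column times atoms per cell).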
+ config.gridSize[0] = (grid.numCellsColumnMax()*numAtomsPerCell + threadsPerBlock - 1)/threadsPerBlock;
config.gridSize[1] = numColumns;
config.gridSize[2] = 1;
+    GMX_ASSERT(config.gridSize[0] > 0, "Cannot have an empty grid; the early return above avoids this");
config.sharedMemorySize = 0;
config.stream = stream;
const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
&numColumns,
&xqPtr,
- &gridIndex,
- &FillLocal,
+ &setFillerCoords,
&d_x,
&d_atomIndices,
&d_cxy_na,
const int numAtomsMoved,
nbnxn_atomdata_t *nbat)
{
- cellOffset_ = cellOffset;
+ cellOffset_ = cellOffset;
+
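+    // Remember the source atom range that maps onto this grid; the GPU
+    // x buffer ops use it to bound the coordinate copy.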
+ srcAtomBegin_ = atomStart;
+ srcAtomEnd_ = atomEnd;
const int nthread = gmx_omp_nthreads_get(emntPairsearch);
/* Clear cxy_na_, so we can reuse the array below */
cxy_na_[i] = 0;
}
- numCellsTotal_ = cxy_ind_[numColumns()] - cxy_ind_[0];
+ numCellsTotal_ = cxy_ind_[numColumns()] - cxy_ind_[0];
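+    // ncz_max is the cell count of the tallest column; the GPU x buffer
+    // ops use it to size their kernel launch.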
+ numCellsColumnMax_ = ncz_max;
/* Resize grid and atom data which depend on the number of cells */
resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
return cellOffset_;
}
+ //! Returns the maximum number of grid cells in a column
+ int numCellsColumnMax() const
+ {
+ return numCellsColumnMax_;
+ }
+
+ //! Returns the start of the source atom range mapped to this grid
+ int srcAtomBegin() const
+ {
+ return srcAtomBegin_;
+ }
+
+ //! Returns the end of the source atom range mapped to this grid
+ int srcAtomEnd() const
+ {
+ return srcAtomEnd_;
+ }
+
    //! Returns the index of the first cell in a column, counting from 0 within this grid
int firstCellInColumn(int columnIndex) const
{
//! The total number of cells in this grid
int numCellsTotal_;
- //! Index in nbs->cell corresponding to cell 0 */
+ //! Index in nbs->cell corresponding to cell 0
int cellOffset_;
+ //! The maximum number of cells in a column
+ int numCellsColumnMax_;
+
+ //! The start of the source atom range mapped to this grid
+ int srcAtomBegin_;
+ //! The end of the source atom range mapped to this grid
+ int srcAtomEnd_;
/* Grid data */
    /*! \brief The number of non-filler atoms for each grid column.
namespace Nbnxm
{
-class GridSet;
+class Grid;
/*! \brief
* Launch asynchronously the xq buffer host to device copy.
/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
*/
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
- int gmx_unused g,
- bool gmx_unused FillLocal,
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
+ bool gmx_unused setFillerCoords,
gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
void gmx_unused *xPmeDevicePtr,
- int gmx_unused na_round_max,
Nbnxm::AtomLocality gmx_unused locality,
const rvec gmx_unused *x) CUDA_FUNC_TERM