src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \file
  37  *  \brief Define CUDA kernel (and its wrapper) for transforming position coordinates from rvec to nbnxm layout.
  38  *
  39  *  \author Alan Gray <alang@nvidia.com>
  40  *  \author Jon Vincent <jvincent@nvidia.com>
  41  *  \author Szilard Pall <pall.szilard@gmail.com>
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "gromacs/gpu_utils/typecasts.cuh"
  47 #include "gromacs/gpu_utils/vectype_ops.cuh"
  48 #include "gromacs/nbnxm/grid.h"
  49 #include "gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h"
  50 #include "gromacs/nbnxm/cuda/nbnxm_cuda_types.h"
  51
  52 /*! \brief CUDA kernel for transforming position coordinates from rvec to nbnxm layout.
  53  *
  54  * TODO:
  55  *  - rename kernel so naming matches with the other NBNXM kernels;
  56  *  - enable separate compilation unit
  57
  58  * \param[in]     numColumns          Extent of cell-level parallelism.
  59  * \param[out]    gm_xq               Coordinates buffer in nbnxm layout.
  60  * \param[in]     gm_x                Coordinates buffer.
  61  * \param[in]     gm_atomIndex        Atom index mapping.
  62  * \param[in]     gm_numAtoms         Array of number of atoms.
  63  * \param[in]     gm_cellIndex        Array of cell indices.
  64  * \param[in]     cellOffset          First cell.
  65  * \param[in]     numAtomsPerCell     Number of atoms per cell.
  66  */
  67 static __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,
  68                                                     float4* __restrict__ gm_xq,
  69                                                     const float3* __restrict__ gm_x,
  70                                                     const int* __restrict__ gm_atomIndex,
  71                                                     const int* __restrict__ gm_numAtoms,
  72                                                     const int* __restrict__ gm_cellIndex,
  73                                                     int cellOffset,
  74                                                     int numAtomsPerCell)
  75 {
  76
  77     const float farAway = -1000000.0F;
  78
  79     // Map cell-level parallelism to y component of CUDA block index.
  80     int cxy = blockIdx.y;
  81
  82     if (cxy < numColumns)
  83     {
  84
  85         const int numAtoms = gm_numAtoms[cxy];
  86         const int offset   = (cellOffset + gm_cellIndex[cxy]) * numAtomsPerCell;
  87
  88         const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
  89
  90         // Destination address where x should be stored in nbnxm layout. We use this cast here to
  91         // save only x, y and z components, not touching the w (q) component, which is pre-defined.
  92         float3* gm_xqDest = reinterpret_cast<float3*>(&gm_xq[threadIndex + offset]);
  93
  94         // Perform layout conversion of each element.
  95         if (threadIndex < numAtoms)
  96         {
  97             if (threadIndex < numAtoms)
  98             {
  99                 *gm_xqDest = gm_x[gm_atomIndex[threadIndex + offset]];
 100             }
 101             else
 102             {
 103                 *gm_xqDest = make_float3(farAway);
 104             }
 105         }
 106     }
 107 }
 108
 109
 110 namespace Nbnxm
 111 {
 112
 113 //! Number of CUDA threads in a block
 114 // TODO Optimize this through experimentation
 115 constexpr static int c_bufOpsThreadsPerBlock = 128;
 116
 117 void launchNbnxmKernelTransformXToXq(const Grid&          grid,
 118                                      NbnxmGpu*            nb,
 119                                      DeviceBuffer<Float3> d_x,
 120                                      const DeviceStream&  deviceStream,
 121                                      const unsigned int   numColumnsMax,
 122                                      const int            gridId)
 123 {
 124     const int numColumns      = grid.numColumns();
 125     const int cellOffset      = grid.cellOffset();
 126     const int numAtomsPerCell = grid.numAtomsPerCell();
 127
 128     KernelLaunchConfig config;
 129     config.blockSize[0] = c_bufOpsThreadsPerBlock;
 130     config.blockSize[1] = 1;
 131     config.blockSize[2] = 1;
 132     config.gridSize[0]  = (grid.numCellsColumnMax() * numAtomsPerCell + c_bufOpsThreadsPerBlock - 1)
 133                          / c_bufOpsThreadsPerBlock;
 134     config.gridSize[1] = numColumns;
 135     config.gridSize[2] = 1;
 136     GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
 137     config.sharedMemorySize = 0;
 138
 139     auto       kernelFn      = nbnxn_gpu_x_to_nbat_x_kernel;
 140     float3*    d_xFloat3     = asFloat3(d_x);
 141     float4*    d_xq          = nb->atdat->xq;
 142     const int* d_atomIndices = nb->atomIndices;
 143     const int* d_cxy_na      = &nb->cxy_na[numColumnsMax * gridId];
 144     const int* d_cxy_ind     = &nb->cxy_ind[numColumnsMax * gridId];
 145     const auto kernelArgs    = prepareGpuKernelArguments(
 146             kernelFn, config, &numColumns, &d_xq, &d_xFloat3, &d_atomIndices, &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell);
 147     launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
 148 }
 149
 150 } // namespace Nbnxm