From: Berk Hess
Date: Fri, 21 Dec 2018 20:49:29 +0000 (+0100)
Subject: Convert nbnxn_atomdata_t to C++
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=f43e9454a4ed72f383ef71548499e00935744f48;p=alexxy%2Fgromacs.git

Convert nbnxn_atomdata_t to C++

Changed all manually managed pointers to std::vector.
Split off a Params and a SimdMasks struct.
Changed some data members to be private; more remains to be done.
This change is only refactoring, with no functional changes.

Note: minor, negligible performance impact on the nbnxn gridding due to
(unnecessary) initialization of std::vector elements during resize().

Change-Id: I9c70a1f8f272c80a7cf335fcbd867bd79c4102a2
---

diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu
index fda0e9bb90..bda125511b 100644
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -91,7 +91,7 @@ int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s
 }
 
 // TODO: template on transferKind to avoid runtime conditionals
-int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
+int cu_copy_H2D(void *d_dest, const void *h_src, size_t bytes,
                 GpuApiCallBehavior transferKind, cudaStream_t s = nullptr)
 {
     cudaError_t stat;
@@ -121,7 +121,7 @@ int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
     return 0;
 }
 
-int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
+int cu_copy_H2D_sync(void * d_dest, const void * h_src, size_t bytes)
 {
     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
 }
@@ -129,7 +129,7 @@ int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
 /*!
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = nullptr)
+int cu_copy_H2D_async(void * d_dest, const void * h_src, size_t bytes, cudaStream_t s = nullptr)
 {
     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 }
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 1116d66674..35ab3d8753 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -157,13 +157,13 @@ int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cud
  *
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
+int cu_copy_H2D(void *d_dest, const void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
 
 /*!
Launches synchronous host to device memory copy. */ -int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/); +int cu_copy_H2D_sync(void * /*d_dest*/, const void * /*h_src*/, size_t /*bytes*/); /*! Launches asynchronous host to device memory copy in stream s. */ -int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/); +int cu_copy_H2D_async(void * /*d_dest*/, const void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/); // TODO: the 2 functions below are pretty much a constructor/destructor of a simple // GPU table object. There is also almost self-contained fetchFromParamLookupTable() diff --git a/src/gromacs/gpu_utils/oclutils.cpp b/src/gromacs/gpu_utils/oclutils.cpp index 26e750a2e4..b5efc4397e 100644 --- a/src/gromacs/gpu_utils/oclutils.cpp +++ b/src/gromacs/gpu_utils/oclutils.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -52,7 +52,7 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/smalloc.h" -int ocl_copy_H2D(cl_mem d_dest, void* h_src, +int ocl_copy_H2D(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, GpuApiCallBehavior transferKind, cl_command_queue command_queue, @@ -92,7 +92,7 @@ int ocl_copy_H2D(cl_mem d_dest, void* h_src, * identifying this particular host to device operation. The event can further * be used to queue a wait for this operation or to query profiling information. */ -int ocl_copy_H2D_async(cl_mem d_dest, void * h_src, +int ocl_copy_H2D_async(cl_mem d_dest, const void * h_src, size_t offset, size_t bytes, cl_command_queue command_queue, cl_event *copy_event) @@ -102,7 +102,7 @@ int ocl_copy_H2D_async(cl_mem d_dest, void * h_src, /*! \brief Launches synchronous host to device memory copy. */ -int ocl_copy_H2D_sync(cl_mem d_dest, void * h_src, +int ocl_copy_H2D_sync(cl_mem d_dest, const void * h_src, size_t offset, size_t bytes, cl_command_queue command_queue) { diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index 78cd747f7b..0cbb7a46aa 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -132,20 +132,20 @@ int ocl_copy_D2H_async(void * h_dest, cl_mem d_src, * identifying this particular host to device operation. The event can further * be used to queue a wait for this operation or to query profiling information. */ -int ocl_copy_H2D(cl_mem d_dest, void* h_src, +int ocl_copy_H2D(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, GpuApiCallBehavior transferKind, cl_command_queue command_queue, cl_event *copy_event); /*! \brief Launches asynchronous host to device memory copy. 
*/ -int ocl_copy_H2D_async(cl_mem d_dest, void * h_src, +int ocl_copy_H2D_async(cl_mem d_dest, const void * h_src, size_t offset, size_t bytes, cl_command_queue command_queue, cl_event *copy_event); /*! \brief Launches synchronous host to device memory copy. */ -int ocl_copy_H2D_sync(cl_mem d_dest, void * h_src, +int ocl_copy_H2D_sync(cl_mem d_dest, const void * h_src, size_t offset, size_t bytes, cl_command_queue command_queue); diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 763aaaa6c1..8255b7b0f8 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -2196,7 +2196,7 @@ static void init_nb_verlet(const gmx::MDLogger &mdlog, enbnxninitcombrule = enbnxninitcombruleNONE; } - snew(nbv->nbat, 1); + nbv->nbat = new nbnxn_atomdata_t(nbv->bUseGPU ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned); int mimimumNumEnergyGroupNonbonded = ir->opts.ngener; if (ir->opts.ngener - ir->nwall == 1) { @@ -2213,8 +2213,7 @@ static void init_nb_verlet(const gmx::MDLogger &mdlog, enbnxninitcombrule, fr->ntype, fr->nbfp, mimimumNumEnergyGroupNonbonded, - bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1, - nb_alloc, nb_free); + bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1); if (nbv->bUseGPU) { diff --git a/src/gromacs/mdlib/nbnxn_atomdata.cpp b/src/gromacs/mdlib/nbnxn_atomdata.cpp index 9c70f5ab60..34edde97c8 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.cpp +++ b/src/gromacs/mdlib/nbnxn_atomdata.cpp @@ -112,73 +112,46 @@ void nbnxn_realloc_void(void **ptr, *ptr = ptr_new; } -/* Reallocate the nbnxn_atomdata_t for a size of n atoms */ -void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n) +void nbnxn_atomdata_t::resizeCoordinateBuffer(int numAtoms) { - GMX_ASSERT(nbat->nalloc >= nbat->natoms, "We should have at least as many elelements allocated as there are set"); - - int t; - - nbnxn_realloc_void(reinterpret_cast(&nbat->type), - nbat->natoms*sizeof(*nbat->type), - n*sizeof(*nbat->type), - nbat->alloc, nbat->free); - nbnxn_realloc_void(reinterpret_cast(&nbat->lj_comb), - nbat->natoms*2*sizeof(*nbat->lj_comb), - n*2*sizeof(*nbat->lj_comb), - nbat->alloc, nbat->free); - if (nbat->XFormat != nbatXYZQ) - { - nbnxn_realloc_void(reinterpret_cast(&nbat->q), - nbat->natoms*sizeof(*nbat->q), - n*sizeof(*nbat->q), - nbat->alloc, nbat->free); - } - if (nbat->nenergrp > 1) - { - nbnxn_realloc_void(reinterpret_cast(&nbat->energrp), - nbat->natoms/nbat->na_c*sizeof(*nbat->energrp), - n/nbat->na_c*sizeof(*nbat->energrp), - nbat->alloc, nbat->free); - } - nbnxn_realloc_void(reinterpret_cast(&nbat->x), - nbat->natoms*nbat->xstride*sizeof(*nbat->x), - n*nbat->xstride*sizeof(*nbat->x), - nbat->alloc, nbat->free); - for (t = 0; t < nbat->nout; t++) + numAtoms_ = numAtoms; + + x_.resize(numAtoms*xstride); +} + +void nbnxn_atomdata_t::resizeForceBuffers() +{ + /* Force buffers need padding up to a multiple of the buffer flag size */ + const int paddedSize = (numAtoms() + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE*NBNXN_BUFFERFLAG_SIZE; + + /* Should we let each thread allocate it's own data instead? 
*/ + for (nbnxn_atomdata_output_t &outBuffer : out) { - /* Allocate one element extra for possible signaling with GPUs */ - nbnxn_realloc_void(reinterpret_cast(&nbat->out[t].f), - nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f), - n*nbat->fstride*sizeof(*nbat->out[t].f), - nbat->alloc, nbat->free); + outBuffer.f.resize(paddedSize*fstride); } - nbat->nalloc = n; } /* Initializes an nbnxn_atomdata_output_t data structure */ -static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out, - int nb_kernel_type, - int nenergrp, int stride, - nbnxn_alloc_t *ma) +nbnxn_atomdata_output_t::nbnxn_atomdata_output_t(int nb_kernel_type, + int numEnergyGroups, + int simdEnergyBufferStride, + gmx::PinningPolicy pinningPolicy) : + f({}, {pinningPolicy}), + fshift({}, {pinningPolicy}), + Vvdw({}, {pinningPolicy}), + Vc({}, {pinningPolicy}) { - out->f = nullptr; - ma(reinterpret_cast(&out->fshift), SHIFTS*DIM*sizeof(*out->fshift)); - out->nV = nenergrp*nenergrp; - ma(reinterpret_cast(&out->Vvdw), out->nV*sizeof(*out->Vvdw)); - ma(reinterpret_cast(&out->Vc), out->nV*sizeof(*out->Vc )); + fshift.resize(SHIFTS*DIM); + Vvdw.resize(numEnergyGroups*numEnergyGroups); + Vc.resize(numEnergyGroups*numEnergyGroups); if (nb_kernel_type == nbnxnk4xN_SIMD_4xN || nb_kernel_type == nbnxnk4xN_SIMD_2xNN) { - int cj_size = nbnxn_kernel_to_cluster_j_size(nb_kernel_type); - out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size; - ma(reinterpret_cast(&out->VSvdw), out->nVS*sizeof(*out->VSvdw)); - ma(reinterpret_cast(&out->VSc), out->nVS*sizeof(*out->VSc )); - } - else - { - out->nVS = 0; + int cj_size = nbnxn_kernel_to_cluster_j_size(nb_kernel_type); + int numElements = numEnergyGroups*numEnergyGroups*simdEnergyBufferStride*(cj_size/2)*cj_size; + VSvdw.resize(numElements); + VSc.resize(numElements); } } @@ -321,11 +294,9 @@ void copy_rvec_to_nbat_real(const int *a, int na, int na_round, } /* Stores the LJ parameter data in a format convenient for different kernels */ -static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD) +static void set_lj_parameter_data(nbnxn_atomdata_t::Params *params, gmx_bool bSIMD) { - real c6, c12; - - int nt = nbat->ntype; + int nt = params->numTypes; if (bSIMD) { @@ -337,16 +308,16 @@ static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD) * when it might not be used, but introducing the conditional code is not * really worth it. */ - nbat->alloc(reinterpret_cast(&nbat->nbfp_aligned), - nt*nt*c_simdBestPairAlignment*sizeof(*nbat->nbfp_aligned)); + params->nbfp_aligned.resize(nt*nt*c_simdBestPairAlignment); + for (int i = 0; i < nt; i++) { for (int j = 0; j < nt; j++) { - nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+0] = nbat->nbfp[(i*nt+j)*2+0]; - nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+1] = nbat->nbfp[(i*nt+j)*2+1]; - nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+2] = 0; - nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+3] = 0; + params->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+0] = params->nbfp[(i*nt+j)*2+0]; + params->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+1] = params->nbfp[(i*nt+j)*2+1]; + params->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+2] = 0; + params->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+3] = 0; } } #endif @@ -356,36 +327,37 @@ static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD) * and with LJ-PME kernels. We then only need parameters per atom type, * not per pair of atom types. 
*/ - switch (nbat->comb_rule) + params->nbfp_comb.resize(nt*2); + switch (params->comb_rule) { case ljcrGEOM: - nbat->comb_rule = ljcrGEOM; + params->comb_rule = ljcrGEOM; for (int i = 0; i < nt; i++) { /* Store the sqrt of the diagonal from the nbfp matrix */ - nbat->nbfp_comb[i*2 ] = std::sqrt(nbat->nbfp[(i*nt+i)*2 ]); - nbat->nbfp_comb[i*2+1] = std::sqrt(nbat->nbfp[(i*nt+i)*2+1]); + params->nbfp_comb[i*2 ] = std::sqrt(params->nbfp[(i*nt+i)*2 ]); + params->nbfp_comb[i*2+1] = std::sqrt(params->nbfp[(i*nt+i)*2+1]); } break; case ljcrLB: for (int i = 0; i < nt; i++) { /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */ - c6 = nbat->nbfp[(i*nt+i)*2 ]; - c12 = nbat->nbfp[(i*nt+i)*2+1]; + const real c6 = params->nbfp[(i*nt+i)*2 ]; + const real c12 = params->nbfp[(i*nt+i)*2+1]; if (c6 > 0 && c12 > 0) { /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps), * so we get 6*C6 and 12*C12 after combining. */ - nbat->nbfp_comb[i*2 ] = 0.5*gmx::sixthroot(c12/c6); - nbat->nbfp_comb[i*2+1] = std::sqrt(c6*c6/c12); + params->nbfp_comb[i*2 ] = 0.5*gmx::sixthroot(c12/c6); + params->nbfp_comb[i*2+1] = std::sqrt(c6*c6/c12); } else { - nbat->nbfp_comb[i*2 ] = 0; - nbat->nbfp_comb[i*2+1] = 0; + params->nbfp_comb[i*2 ] = 0; + params->nbfp_comb[i*2+1] = 0; } } break; @@ -397,34 +369,30 @@ static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD) } } -#if GMX_SIMD -static void -nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat) +nbnxn_atomdata_t::SimdMasks::SimdMasks() { - const int simd_width = GMX_SIMD_REAL_WIDTH; - int simd_excl_size; +#if GMX_SIMD + constexpr int simd_width = GMX_SIMD_REAL_WIDTH; /* Set the diagonal cluster pair exclusion mask setup data. * In the kernel we check 0 < j - i to generate the masks. * Here we store j - i for generating the mask for the first i, * we substract 0.5 to avoid rounding issues. * In the kernel we can subtract 1 to generate the subsequent mask. */ - int simd_4xn_diag_size; - - simd_4xn_diag_size = std::max(c_nbnxnCpuIClusterSize, simd_width); - snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN); + const int simd_4xn_diag_size = std::max(c_nbnxnCpuIClusterSize, simd_width); + diagonal_4xn_j_minus_i.resize(simd_4xn_diag_size); for (int j = 0; j < simd_4xn_diag_size; j++) { - nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5; + diagonal_4xn_j_minus_i[j] = j - 0.5; } - snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN); + diagonal_2xnn_j_minus_i.resize(simd_width); for (int j = 0; j < simd_width/2; j++) { /* The j-cluster size is half the SIMD width */ - nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5; + diagonal_2xnn_j_minus_i[j] = j - 0.5; /* The next half of the SIMD width is for i + 1 */ - nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5; + diagonal_2xnn_j_minus_i[simd_width/2 + j] = j - 1 - 0.5; } /* We use up to 32 bits for exclusion masking. @@ -434,95 +402,99 @@ nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat) * In single precision this means the real and integer SIMD registers * are of equal size. 
*/ - simd_excl_size = c_nbnxnCpuIClusterSize*simd_width; + const int simd_excl_size = c_nbnxnCpuIClusterSize*simd_width; #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL - snew_aligned(nbat->simd_exclusion_filter64, simd_excl_size, NBNXN_MEM_ALIGN); + exclusion_filter64.resize(simd_excl_size); #else - snew_aligned(nbat->simd_exclusion_filter, simd_excl_size, NBNXN_MEM_ALIGN); + exclusion_filter.resize(simd_excl_size); #endif for (int j = 0; j < simd_excl_size; j++) { /* Set the consecutive bits for masking pair exclusions */ #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL - nbat->simd_exclusion_filter64[j] = (1U << j); + exclusion_filter64[j] = (1U << j); #else - nbat->simd_exclusion_filter[j] = (1U << j); + exclusion_filter[j] = (1U << j); #endif } -#if !GMX_SIMD_HAVE_LOGICAL && !GMX_SIMD_HAVE_INT32_LOGICAL - // If the SIMD implementation has no bitwise logical operation support - // whatsoever we cannot use the normal masking. Instead, - // we generate a vector of all 2^4 possible ways an i atom - // interacts with its 4 j atoms. Each array entry contains - // GMX_SIMD_REAL_WIDTH values that are read with a single aligned SIMD load. - // Since there is no logical value representation in this case, we use - // any nonzero value to indicate 'true', while zero mean 'false'. - // This can then be converted to a SIMD boolean internally in the SIMD - // module by comparing to zero. - // Each array entry encodes how this i atom will interact with the 4 j atoms. - // Matching code exists in set_ci_top_excls() to generate indices into this array. - // Those indices are used in the kernels. - - simd_excl_size = c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize; - const real simdFalse = 0.0; - const real simdTrue = 1.0; - real *simd_interaction_array; - - snew_aligned(simd_interaction_array, simd_excl_size * GMX_SIMD_REAL_WIDTH, NBNXN_MEM_ALIGN); - for (int j = 0; j < simd_excl_size; j++) + if (!GMX_SIMD_HAVE_LOGICAL && !GMX_SIMD_HAVE_INT32_LOGICAL) { - int index = j * GMX_SIMD_REAL_WIDTH; - for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++) + // If the SIMD implementation has no bitwise logical operation support + // whatsoever we cannot use the normal masking. Instead, + // we generate a vector of all 2^4 possible ways an i atom + // interacts with its 4 j atoms. Each array entry contains + // GMX_SIMD_REAL_WIDTH values that are read with a single aligned SIMD load. + // Since there is no logical value representation in this case, we use + // any nonzero value to indicate 'true', while zero mean 'false'. + // This can then be converted to a SIMD boolean internally in the SIMD + // module by comparing to zero. + // Each array entry encodes how this i atom will interact with the 4 j atoms. + // Matching code exists in set_ci_top_excls() to generate indices into this array. + // Those indices are used in the kernels. + + const int simd_excl_size = c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize; + const real simdFalse = 0.0; + const real simdTrue = 1.0; + + interaction_array.resize(simd_excl_size * GMX_SIMD_REAL_WIDTH); + for (int j = 0; j < simd_excl_size; j++) { - simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse; + const int index = j * GMX_SIMD_REAL_WIDTH; + for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++) + { + interaction_array[index + i] = (j & (1 << i)) ? 
simdTrue : simdFalse; + } } } - nbat->simd_interaction_array = simd_interaction_array; #endif } -#endif -/* Initializes the nbnxn_atomdata_t parameters data structure */ +nbnxn_atomdata_t::Params::Params(gmx::PinningPolicy pinningPolicy) : + numTypes(0), + nbfp({}, {pinningPolicy}), + nbfp_comb({}, {pinningPolicy}), + type({}, {pinningPolicy}), + lj_comb({}, {pinningPolicy}), + q({}, {pinningPolicy}), + nenergrp(0), + neg_2log(0), + energrp({}, {pinningPolicy}) +{ +} + +nbnxn_atomdata_t::nbnxn_atomdata_t(gmx::PinningPolicy pinningPolicy) : + params_(pinningPolicy), + numAtoms_(0), + natoms_local(0), + shift_vec({}, {pinningPolicy}), + x_({}, {pinningPolicy}), + simdMasks(), + bUseBufferFlags(FALSE), + bUseTreeReduce(FALSE) +{ +} + +/* Initializes an nbnxn_atomdata_t::Params data structure */ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, - nbnxn_atomdata_t *nbat, + nbnxn_atomdata_t::Params *params, int nb_kernel_type, int enbnxninitcombrule, int ntype, const real *nbfp, - int n_energygroups, - nbnxn_alloc_t *alloc, - nbnxn_free_t *free) + int n_energygroups) { real c6, c12, tol; char *ptr; gmx_bool simple, bCombGeom, bCombLB, bSIMD; - if (alloc == nullptr) - { - nbat->alloc = nbnxn_alloc_aligned; - } - else - { - nbat->alloc = alloc; - } - if (free == nullptr) - { - nbat->free = nbnxn_free_aligned; - } - else - { - nbat->free = free; - } - if (debug) { fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype); } - nbat->ntype = ntype + 1; - nbat->alloc(reinterpret_cast(&nbat->nbfp), - nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp)); - nbat->alloc(reinterpret_cast(&nbat->nbfp_comb), nbat->ntype*2*sizeof(*nbat->nbfp_comb)); + params->numTypes = ntype + 1; + params->nbfp.resize(params->numTypes*params->numTypes*2); + params->nbfp_comb.resize(params->numTypes*2); /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed) * force-field floating point parameters. @@ -539,22 +511,22 @@ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, bCombGeom = TRUE; bCombLB = TRUE; - /* Temporarily fill nbat->nbfp_comb with sigma and epsilon + /* Temporarily fill params->nbfp_comb with sigma and epsilon * to check for the LB rule. */ for (int i = 0; i < ntype; i++) { - c6 = nbfp[(i*ntype+i)*2 ]/6.0; - c12 = nbfp[(i*ntype+i)*2+1]/12.0; + c6 = nbfp[(i*ntype+i)*2 ]/6.0; + c12 = nbfp[(i*ntype+i)*2 + 1]/12.0; if (c6 > 0 && c12 > 0) { - nbat->nbfp_comb[i*2 ] = gmx::sixthroot(c12/c6); - nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12; + params->nbfp_comb[i*2 ] = gmx::sixthroot(c12/c6); + params->nbfp_comb[i*2 + 1] = 0.25*c6*c6/c12; } else if (c6 == 0 && c12 == 0) { - nbat->nbfp_comb[i*2 ] = 0; - nbat->nbfp_comb[i*2+1] = 0; + params->nbfp_comb[i*2 ] = 0; + params->nbfp_comb[i*2 + 1] = 0; } else { @@ -563,41 +535,41 @@ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, } } - for (int i = 0; i < nbat->ntype; i++) + for (int i = 0; i < params->numTypes; i++) { - for (int j = 0; j < nbat->ntype; j++) + for (int j = 0; j < params->numTypes; j++) { if (i < ntype && j < ntype) { /* fr->nbfp has been updated, so that array too now stores c6/c12 including * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only). 
*/ - c6 = nbfp[(i*ntype+j)*2 ]; - c12 = nbfp[(i*ntype+j)*2+1]; - nbat->nbfp[(i*nbat->ntype+j)*2 ] = c6; - nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12; + c6 = nbfp[(i*ntype+j)*2 ]; + c12 = nbfp[(i*ntype+j)*2 + 1]; + params->nbfp[(i*params->numTypes+j)*2 ] = c6; + params->nbfp[(i*params->numTypes+j)*2 + 1] = c12; /* Compare 6*C6 and 12*C12 for geometric cobination rule */ bCombGeom = bCombGeom && gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2 ]*nbfp[(j*ntype+j)*2 ], tol) && - gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol); + gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2 + 1]*nbfp[(j*ntype+j)*2 + 1], tol); /* Compare C6 and C12 for Lorentz-Berthelot combination rule */ c6 /= 6.0; c12 /= 12.0; bCombLB = bCombLB && ((c6 == 0 && c12 == 0 && - (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) || + (params->nbfp_comb[i*2 + 1] == 0 || params->nbfp_comb[j*2 + 1] == 0)) || (c6 > 0 && c12 > 0 && gmx_within_tol(gmx::sixthroot(c12/c6), - 0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) && - gmx_within_tol(0.25*c6*c6/c12, std::sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol))); + 0.5*(params->nbfp_comb[i*2]+params->nbfp_comb[j*2]), tol) && + gmx_within_tol(0.25*c6*c6/c12, std::sqrt(params->nbfp_comb[i*2 + 1]*params->nbfp_comb[j*2 + 1]), tol))); } else { /* Add zero parameters for the additional dummy atom type */ - nbat->nbfp[(i*nbat->ntype+j)*2 ] = 0; - nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0; + params->nbfp[(i*params->numTypes + j)*2 ] = 0; + params->nbfp[(i*params->numTypes + j)*2+1] = 0; } } } @@ -617,43 +589,43 @@ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, */ if (bCombGeom) { - nbat->comb_rule = ljcrGEOM; + params->comb_rule = ljcrGEOM; } else if (bCombLB) { - nbat->comb_rule = ljcrLB; + params->comb_rule = ljcrLB; } else { - nbat->comb_rule = ljcrNONE; + params->comb_rule = ljcrNONE; - nbat->free(nbat->nbfp_comb); + params->nbfp_comb.clear(); } { std::string mesg; - if (nbat->comb_rule == ljcrNONE) + if (params->comb_rule == ljcrNONE) { mesg = "Using full Lennard-Jones parameter combination matrix"; } else { mesg = gmx::formatString("Using %s Lennard-Jones combination rule", - nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot"); + params->comb_rule == ljcrGEOM ? 
"geometric" : "Lorentz-Berthelot"); } GMX_LOG(mdlog.info).asParagraph().appendText(mesg); } break; case enbnxninitcombruleGEOM: - nbat->comb_rule = ljcrGEOM; + params->comb_rule = ljcrGEOM; break; case enbnxninitcombruleLB: - nbat->comb_rule = ljcrLB; + params->comb_rule = ljcrLB; break; case enbnxninitcombruleNONE: - nbat->comb_rule = ljcrNONE; + params->comb_rule = ljcrNONE; - nbat->free(nbat->nbfp_comb); + params->nbfp_comb.clear(); break; default: gmx_incons("Unknown enbnxninitcombrule"); @@ -662,23 +634,23 @@ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN || nb_kernel_type == nbnxnk4xN_SIMD_2xNN); - set_lj_parameter_data(nbat, bSIMD); + set_lj_parameter_data(params, bSIMD); - nbat->nenergrp = n_energygroups; + params->nenergrp = n_energygroups; if (!simple) { // We now check for energy groups already when starting mdrun GMX_RELEASE_ASSERT(n_energygroups == 1, "GPU kernels do not support energy groups"); } /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */ - if (nbat->nenergrp > 64) + if (params->nenergrp > 64) { gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n"); } - nbat->neg_2log = 1; - while (nbat->nenergrp > (1<neg_2log)) + params->neg_2log = 1; + while (params->nenergrp > (1<neg_2log)) { - nbat->neg_2log++; + params->neg_2log++; } } @@ -689,20 +661,15 @@ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, int enbnxninitcombrule, int ntype, const real *nbfp, int n_energygroups, - int nout, - nbnxn_alloc_t *alloc, - nbnxn_free_t *free) + int nout) { - nbnxn_atomdata_params_init(mdlog, nbat, nb_kernel_type, - enbnxninitcombrule, ntype, nbfp, n_energygroups, - alloc, free); + nbnxn_atomdata_params_init(mdlog, &nbat->paramsDeprecated(), nb_kernel_type, + enbnxninitcombrule, ntype, nbfp, n_energygroups); const gmx_bool simple = nbnxn_kernel_pairlist_simple(nb_kernel_type); const gmx_bool bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN || nb_kernel_type == nbnxnk4xN_SIMD_2xNN); - set_lj_parameter_data(nbat, bSIMD); - if (simple) { int pack_x; @@ -736,30 +703,19 @@ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, nbat->FFormat = nbatXYZ; } - nbat->alloc(reinterpret_cast(&nbat->shift_vec), SHIFTS*sizeof(*nbat->shift_vec)); + nbat->shift_vec.resize(SHIFTS); nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM); nbat->fstride = (nbat->FFormat == nbatXYZQ ? 
STRIDE_XYZQ : DIM); - nbat->x = nullptr; - -#if GMX_SIMD - if (simple) - { - nbnxn_atomdata_init_simple_exclusion_masks(nbat); - } -#endif /* Initialize the output data structures */ - nbat->nout = nout; - snew(nbat->out, nbat->nout); - nbat->nalloc = 0; - for (int i = 0; i < nbat->nout; i++) + for (int i = 0; i < nout; i++) { - nbnxn_atomdata_output_init(&nbat->out[i], - nb_kernel_type, - nbat->nenergrp, 1<neg_2log, - nbat->alloc); + const auto &pinningPolicy = nbat->params().type.get_allocator().pinningPolicy(); + nbat->out.emplace_back(nb_kernel_type, nbat->params().nenergrp, 1 << nbat->params().neg_2log, + pinningPolicy); } + nbat->buffer_flags.flag = nullptr; nbat->buffer_flags.flag_nalloc = 0; @@ -789,7 +745,7 @@ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, } template -static void copy_lj_to_nbat_lj_comb(const real *ljparam_type, +static void copy_lj_to_nbat_lj_comb(gmx::ArrayRef ljparam_type, const int *type, int na, real *ljparam_at) { @@ -807,11 +763,20 @@ static void copy_lj_to_nbat_lj_comb(const real *ljparam_type, } } +static int numAtomsFromGrids(const nbnxn_search &nbs) +{ + const nbnxn_grid_t &lastGrid = nbs.grid.back(); + + return (lastGrid.cell0 + lastGrid.nc)*lastGrid.na_sc; +} + /* Sets the atom type in nbnxn_atomdata_t */ -static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t *nbat, - const nbnxn_search *nbs, - const int *type) +static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t::Params *params, + const nbnxn_search *nbs, + const int *type) { + params->type.resize(numAtomsFromGrids(*nbs)); + for (const nbnxn_grid_t &grid : nbs->grid) { /* Loop over all columns and copy and fill */ @@ -821,16 +786,19 @@ static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t *nbat, int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc; copy_int_to_nbat_int(nbs->a.data() + ash, grid.cxy_na[i], ncz*grid.na_sc, - type, nbat->ntype-1, nbat->type+ash); + type, params->numTypes - 1, params->type.data() + ash); } } } /* Sets the LJ combination rule parameters in nbnxn_atomdata_t */ -static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t *nbat, - const nbnxn_search *nbs) +static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t::Params *params, + const int XFormat, + const nbnxn_search *nbs) { - if (nbat->comb_rule != ljcrNONE) + params->lj_comb.resize(numAtomsFromGrids(*nbs)*2); + + if (params->comb_rule != ljcrNONE) { for (const nbnxn_grid_t &grid : nbs->grid) { @@ -840,26 +808,26 @@ static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t *nbat, int ncz = grid.cxy_ind[i+1] - grid.cxy_ind[i]; int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc; - if (nbat->XFormat == nbatX4) + if (XFormat == nbatX4) { - copy_lj_to_nbat_lj_comb(nbat->nbfp_comb, - nbat->type + ash, + copy_lj_to_nbat_lj_comb(params->nbfp_comb, + params->type.data() + ash, ncz*grid.na_sc, - nbat->lj_comb + ash*2); + params->lj_comb.data() + ash*2); } - else if (nbat->XFormat == nbatX8) + else if (XFormat == nbatX8) { - copy_lj_to_nbat_lj_comb(nbat->nbfp_comb, - nbat->type + ash, + copy_lj_to_nbat_lj_comb(params->nbfp_comb, + params->type.data() + ash, ncz*grid.na_sc, - nbat->lj_comb + ash*2); + params->lj_comb.data() + ash*2); } - else if (nbat->XFormat == nbatXYZQ) + else if (XFormat == nbatXYZQ) { - copy_lj_to_nbat_lj_comb<1>(nbat->nbfp_comb, - nbat->type + ash, + copy_lj_to_nbat_lj_comb<1>(params->nbfp_comb, + params->type.data() + ash, ncz*grid.na_sc, - nbat->lj_comb + ash*2); + params->lj_comb.data() + ash*2); } } } @@ -871,6 +839,11 @@ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t 
*nbat, const nbnxn_search *nbs, const real *charge) { + if (nbat->XFormat != nbatXYZQ) + { + nbat->paramsDeprecated().q.resize(nbat->numAtoms()); + } + for (const nbnxn_grid_t &grid : nbs->grid) { /* Loop over all columns and copy and fill */ @@ -882,7 +855,7 @@ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat, if (nbat->XFormat == nbatXYZQ) { - real *q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1; + real *q = nbat->x().data() + ash*STRIDE_XYZQ + ZZ + 1; int i; for (i = 0; i < na; i++) { @@ -898,7 +871,7 @@ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat, } else { - real *q = nbat->q + ash; + real *q = nbat->paramsDeprecated().q.data() + ash; int i; for (i = 0; i < na; i++) { @@ -925,22 +898,24 @@ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat, static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t *nbat, const nbnxn_search *nbs) { - real *q; - int stride_q, nsubc; + nbnxn_atomdata_t::Params ¶ms = nbat->paramsDeprecated(); + real *q; + int stride_q; if (nbat->XFormat == nbatXYZQ) { - q = nbat->x + ZZ + 1; + q = nbat->x().data() + ZZ + 1; stride_q = STRIDE_XYZQ; } else { - q = nbat->q; + q = params.q.data(); stride_q = 1; } for (const nbnxn_grid_t &grid : nbs->grid) { + int nsubc; if (grid.bSimple) { nsubc = 1; @@ -965,8 +940,8 @@ static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t *nbat, { int ind = c_offset + c*grid.na_c + i; /* Set atom type and charge to non-interacting */ - nbat->type[ind] = nbat->ntype - 1; - q[ind*stride_q] = 0; + params.type[ind] = params.numTypes - 1; + q[ind*stride_q] = 0; } } } @@ -1005,15 +980,17 @@ static void copy_egp_to_nbat_egps(const int *a, int na, int na_round, } /* Set the energy group indices for atoms in nbnxn_atomdata_t */ -static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat, - const nbnxn_search *nbs, - const int *atinfo) +static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t::Params *params, + const nbnxn_search *nbs, + const int *atinfo) { - if (nbat->nenergrp == 1) + if (params->nenergrp == 1) { return; } + params->energrp.resize(numAtomsFromGrids(*nbs)); + for (const nbnxn_grid_t &grid : nbs->grid) { /* Loop over all columns and copy and fill */ @@ -1023,8 +1000,9 @@ static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat, int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc; copy_egp_to_nbat_egps(nbs->a.data() + ash, grid.cxy_na[i], ncz*grid.na_sc, - nbat->na_c, nbat->neg_2log, - atinfo, nbat->energrp+(ash>>grid.na_c_2log)); + c_nbnxnCpuIClusterSize, params->neg_2log, + atinfo, + params->energrp.data() + (ash >> grid.na_c_2log)); } } } @@ -1035,7 +1013,9 @@ void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat, const t_mdatoms *mdatoms, const int *atinfo) { - nbnxn_atomdata_set_atomtypes(nbat, nbs, mdatoms->typeA); + nbnxn_atomdata_t::Params ¶ms = nbat->paramsDeprecated(); + + nbnxn_atomdata_set_atomtypes(¶ms, nbs, mdatoms->typeA); nbnxn_atomdata_set_charges(nbat, nbs, mdatoms->chargeA); @@ -1045,9 +1025,9 @@ void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat, } /* This must be done after masking types for FEP */ - nbnxn_atomdata_set_ljcombparams(nbat, nbs); + nbnxn_atomdata_set_ljcombparams(¶ms, nbat->XFormat, nbs); - nbnxn_atomdata_set_energygroups(nbat, nbs, atinfo); + nbnxn_atomdata_set_energygroups(¶ms, nbs, atinfo); } /* Copies the shift vector array to nbnxn_atomdata_t */ @@ -1135,7 +1115,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, na_fill = na; } copy_rvec_to_nbat_real(nbs->a.data() + ash, na, na_fill, x, - nbat->XFormat, nbat->x, ash); + nbat->XFormat, 
nbat->x().data(), ash); } } } @@ -1147,7 +1127,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, } static void -nbnxn_atomdata_clear_reals(real * gmx_restrict dest, +nbnxn_atomdata_clear_reals(gmx::ArrayRef dest, int i0, int i1) { for (int i = i0; i < i1; i++) @@ -1159,7 +1139,7 @@ nbnxn_atomdata_clear_reals(real * gmx_restrict dest, gmx_unused static void nbnxn_atomdata_reduce_reals(real * gmx_restrict dest, gmx_bool bDestSet, - real ** gmx_restrict src, + const real ** gmx_restrict src, int nsrc, int i0, int i1) { @@ -1191,7 +1171,7 @@ nbnxn_atomdata_reduce_reals(real * gmx_restrict dest, gmx_unused static void nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest, gmx_bool gmx_unused bDestSet, - real gmx_unused ** gmx_restrict src, + const gmx_unused real ** gmx_restrict src, int gmx_unused nsrc, int gmx_unused i0, int gmx_unused i1) { @@ -1234,7 +1214,7 @@ nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest, static void nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search *nbs, const nbnxn_atomdata_t *nbat, - nbnxn_atomdata_output_t *out, + gmx::ArrayRef out, int nfa, int a0, int a1, rvec *f) @@ -1248,7 +1228,7 @@ nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search *nbs, case nbatXYZQ: if (nfa == 1) { - const real *fnb = out[0].f; + const real *fnb = out[0].f.data(); for (int a = a0; a < a1; a++) { @@ -1277,7 +1257,7 @@ nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search *nbs, case nbatX4: if (nfa == 1) { - const real *fnb = out[0].f; + const real *fnb = out[0].f.data(); for (int a = a0; a < a1; a++) { @@ -1306,7 +1286,7 @@ nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search *nbs, case nbatX8: if (nfa == 1) { - const real *fnb = out[0].f; + const real *fnb = out[0].f.data(); for (int a = a0; a < a1; a++) { @@ -1343,14 +1323,16 @@ static inline unsigned char reverse_bits(unsigned char b) return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023; } -static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nbat, - int nth) +static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbnxn_atomdata_t *nbat, + int nth) { const nbnxn_buffer_flags_t *flags = &nbat->buffer_flags; - int next_pow2 = 1<<(gmx::log2I(nth-1)+1); + int next_pow2 = 1<<(gmx::log2I(nth-1)+1); - assert(nbat->nout == nth); /* tree-reduce currently only works for nout==nth */ + const int numOutputBuffers = nbat->out.size(); + GMX_ASSERT(numOutputBuffers == nth, + "tree-reduce currently only works for numOutputBuffers==nth"); memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep))*nth); @@ -1404,7 +1386,7 @@ static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nb index[1] = index[0] + group_size/2; /* If no second buffer, nothing to do */ - if (index[1] >= nbat->nout && group_size > 2) + if (index[1] >= numOutputBuffers && group_size > 2) { continue; } @@ -1446,14 +1428,15 @@ static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nb if (bitmask_is_set(flags->flag[b], index[1]) || group_size > 2) { + const real *fIndex1 = nbat->out[index[1]].f.data(); #if GMX_SIMD nbnxn_atomdata_reduce_reals_simd #else nbnxn_atomdata_reduce_reals #endif - (nbat->out[index[0]].f, + (nbat->out[index[0]].f.data(), bitmask_is_set(flags->flag[b], index[0]) || group_size > 2, - &(nbat->out[index[1]].f), 1, i0, i1); + &fIndex1, 1, i0, i1); } else if (!bitmask_is_set(flags->flag[b], index[0])) @@ -1470,8 +1453,8 @@ static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nb } -static void 
nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nbat, - int nth) +static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat, + int nth) { #pragma omp parallel for num_threads(nth) schedule(static) for (int th = 0; th < nth; th++) @@ -1479,8 +1462,8 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nba try { const nbnxn_buffer_flags_t *flags; - int nfptr; - real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS]; + int nfptr; + const real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS]; flags = &nbat->buffer_flags; @@ -1494,11 +1477,11 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nba int i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride; nfptr = 0; - for (int out = 1; out < nbat->nout; out++) + for (int out = 1; out < static_cast(nbat->out.size()); out++) { if (bitmask_is_set(flags->flag[b], out)) { - fptr[nfptr++] = nbat->out[out].f; + fptr[nfptr++] = nbat->out[out].f.data(); } } if (nfptr > 0) @@ -1508,7 +1491,7 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nba #else nbnxn_atomdata_reduce_reals #endif - (nbat->out[0].f, + (nbat->out[0].f.data(), bitmask_is_set(flags->flag[b], 0), fptr, nfptr, i0, i1); @@ -1527,7 +1510,7 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nba /* Add the force array(s) from nbnxn_atomdata_t to f */ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, int locality, - const nbnxn_atomdata_t *nbat, + nbnxn_atomdata_t *nbat, rvec *f, gmx_wallcycle *wcycle) { @@ -1556,7 +1539,7 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, int nth = gmx_omp_nthreads_get(emntNonbonded); - if (nbat->nout > 1) + if (nbat->out.size() > 1) { if (locality != eatAll) { @@ -1600,17 +1583,17 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, rvec *fshift) { - const nbnxn_atomdata_output_t * out = nbat->out; + gmx::ArrayRef outputBuffers = nbat->out; for (int s = 0; s < SHIFTS; s++) { rvec sum; clear_rvec(sum); - for (int th = 0; th < nbat->nout; th++) + for (const nbnxn_atomdata_output_t &out : outputBuffers) { - sum[XX] += out[th].fshift[s*DIM+XX]; - sum[YY] += out[th].fshift[s*DIM+YY]; - sum[ZZ] += out[th].fshift[s*DIM+ZZ]; + sum[XX] += out.fshift[s*DIM+XX]; + sum[YY] += out.fshift[s*DIM+YY]; + sum[ZZ] += out.fshift[s*DIM+ZZ]; } rvec_inc(fshift[s], sum); } diff --git a/src/gromacs/mdlib/nbnxn_atomdata.h b/src/gromacs/mdlib/nbnxn_atomdata.h index d336d4b546..f5a8c1a291 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.h +++ b/src/gromacs/mdlib/nbnxn_atomdata.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
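(Throughout this patch, buffers that were grown with nbnxn_realloc_void/nbnxn_atomdata_realloc become std::vector members grown with resize(), as in resizeCoordinateBuffer() earlier in nbnxn_atomdata.cpp. A minimal standalone sketch of that pattern, with illustrative names rather than the real GROMACS types:)

    #include <cstddef>
    #include <vector>

    struct AtomDataSketch
    {
        int                xstride = 4;  // e.g. an XYZQ layout: x, y, z, q per atom
        std::vector<float> x;            // was a manually managed pointer plus a nalloc counter

        void resizeCoordinateBuffer(int numAtoms)
        {
            // std::vector keeps existing elements and tracks capacity itself, so the
            // explicit old-size/new-size/alloc/free bookkeeping disappears
            x.resize(static_cast<std::size_t>(numAtoms) * xstride);
        }
    };
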
@@ -91,11 +91,8 @@ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, int enbnxninitcombrule, int ntype, const real *nbfp, int n_energygroups, - int nout, - nbnxn_alloc_t *alloc, - nbnxn_free_t *free); + int nout); -/* Copy the atom data to the non-bonded atom data structure */ void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat, const nbnxn_search *nbs, const t_mdatoms *mdatoms, @@ -119,7 +116,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, /* Add the forces stored in nbat to f, zeros the forces in nbat */ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, int locality, - const nbnxn_atomdata_t *nbat, + nbnxn_atomdata_t *nbat, rvec *f, gmx_wallcycle *wcycle); diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 22e8eee3fd..7a78dda109 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -318,7 +318,8 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, } /* HtoD x, q */ - cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4, + cu_copy_H2D_async(adat->xq + adat_begin, + static_cast(nbatom->x().data() + adat_begin * 4), adat_len * sizeof(*adat->xq), stream); if (bDoTime) @@ -612,7 +613,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, } void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, - const nbnxn_atomdata_t *nbatom, + nbnxn_atomdata_t *nbatom, int flags, int aloc, bool haveOtherWork) @@ -654,7 +655,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, } /* DtoH f */ - cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, + cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin, (adat_len)*sizeof(*adat->f), stream); /* After the non-local D2H is launched the nonlocal_done event can be diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 40bdc78d20..d9c26a0813 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -219,14 +219,14 @@ static void set_cutoff_parameters(cu_nbparam_t *nbp, } /*! Initializes the nonbonded parameter data structure. 
*/ -static void init_nbparam(cu_nbparam_t *nbp, - const interaction_const_t *ic, - const NbnxnListParameters *listParams, - const nbnxn_atomdata_t *nbat) +static void init_nbparam(cu_nbparam_t *nbp, + const interaction_const_t *ic, + const NbnxnListParameters *listParams, + const nbnxn_atomdata_t::Params &nbatParams) { int ntypes; - ntypes = nbat->ntype; + ntypes = nbatParams.numTypes; set_cutoff_parameters(nbp, ic, listParams); @@ -245,7 +245,7 @@ static void init_nbparam(cu_nbparam_t *nbp, { case eintmodNONE: case eintmodPOTSHIFT: - switch (nbat->comb_rule) + switch (nbatParams.comb_rule) { case ljcrNONE: nbp->vdwtype = evdwCuCUT; @@ -274,12 +274,12 @@ static void init_nbparam(cu_nbparam_t *nbp, { if (ic->ljpme_comb_rule == ljcrGEOM) { - assert(nbat->comb_rule == ljcrGEOM); + assert(nbatParams.comb_rule == ljcrGEOM); nbp->vdwtype = evdwCuEWALDGEOM; } else { - assert(nbat->comb_rule == ljcrLB); + assert(nbatParams.comb_rule == ljcrLB); nbp->vdwtype = evdwCuEWALDLB; } } @@ -318,14 +318,14 @@ static void init_nbparam(cu_nbparam_t *nbp, if (!useLjCombRule(nbp)) { initParamLookupTable(nbp->nbfp, nbp->nbfp_texobj, - nbat->nbfp, 2*ntypes*ntypes); + nbatParams.nbfp.data(), 2*ntypes*ntypes); } /* set up LJ-PME parameter lookup table */ if (ic->vdwtype == evdwPME) { initParamLookupTable(nbp->nbfp_comb, nbp->nbfp_comb_texobj, - nbat->nbfp_comb, 2*ntypes); + nbatParams.nbfp_comb.data(), 2*ntypes); } } @@ -412,10 +412,10 @@ static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb, const interaction_const_t *ic, const NbnxnListParameters *listParams, - const nbnxn_atomdata_t *nbat) + const nbnxn_atomdata_t::Params &nbatParams) { - init_atomdata_first(nb->atdat, nbat->ntype); - init_nbparam(nb->nbparam, ic, listParams, nbat); + init_atomdata_first(nb->atdat, nbatParams.numTypes); + init_nbparam(nb->nbparam, ic, listParams, nbatParams); /* clear energy and shift force outputs */ nbnxn_cuda_clear_e_fshift(nb); @@ -504,7 +504,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, /* pick L1 cache configuration */ nbnxn_cuda_set_cacheconfig(); - nbnxn_cuda_init_const(nb, ic, listParams, nbat); + nbnxn_cuda_init_const(nb, ic, listParams, nbat->params()); *p_nb = nb; @@ -584,7 +584,7 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb, /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) { - cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, + cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), SHIFTS * sizeof(*adat->shift_vec), ls); adat->bShiftVecUploaded = true; } @@ -638,7 +638,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, cu_atomdata_t *d_atdat = nb->atdat; cudaStream_t ls = nb->stream[eintLocal]; - natoms = nbat->natoms; + natoms = nbat->numAtoms(); realloced = false; if (bDoTime) @@ -692,12 +692,12 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, if (useLjCombRule(nb->nbparam)) { - cu_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, + cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), natoms*sizeof(*d_atdat->lj_comb), ls); } else { - cu_copy_H2D_async(d_atdat->atom_types, nbat->type, + cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), natoms*sizeof(*d_atdat->atom_types), ls); } diff --git a/src/gromacs/mdlib/nbnxn_gpu.h b/src/gromacs/mdlib/nbnxn_gpu.h index fcc7888515..b23347b6c5 100644 --- a/src/gromacs/mdlib/nbnxn_gpu.h +++ b/src/gromacs/mdlib/nbnxn_gpu.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -132,7 +132,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, */ GPU_FUNC_QUALIFIER void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, - const struct nbnxn_atomdata_t gmx_unused *nbatom, + struct nbnxn_atomdata_t gmx_unused *nbatom, int gmx_unused flags, int gmx_unused aloc, bool gmx_unused haveOtherWork) GPU_FUNC_TERM diff --git a/src/gromacs/mdlib/nbnxn_grid.cpp b/src/gromacs/mdlib/nbnxn_grid.cpp index f33a958d0b..93a93fbb64 100644 --- a/src/gromacs/mdlib/nbnxn_grid.cpp +++ b/src/gromacs/mdlib/nbnxn_grid.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -841,7 +841,7 @@ static void fill_cell(nbnxn_search *nbs, copy_rvec_to_nbat_real(nbs->a.data() + atomStart, numAtoms, grid->na_c, as_rvec_array(x.data()), - nbat->XFormat, nbat->x, atomStart); + nbat->XFormat, nbat->x().data(), atomStart); if (nbat->XFormat == nbatX4) { @@ -852,13 +852,13 @@ static void fill_cell(nbnxn_search *nbs, #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2 if (2*grid->na_cj == grid->na_c) { - calc_bounding_box_x_x4_halves(numAtoms, nbat->x + atom_to_x_index(atomStart), bb_ptr, + calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index(atomStart), bb_ptr, grid->bbj.data() + offset*2); } else #endif { - calc_bounding_box_x_x4(numAtoms, nbat->x + atom_to_x_index(atomStart), bb_ptr); + calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index(atomStart), bb_ptr); } } else if (nbat->XFormat == nbatX8) @@ -867,7 +867,7 @@ static void fill_cell(nbnxn_search *nbs, size_t offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log; nbnxn_bb_t *bb_ptr = grid->bb.data() + offset; - calc_bounding_box_x_x8(numAtoms, nbat->x + atom_to_x_index(atomStart), bb_ptr); + calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index(atomStart), bb_ptr); } #if NBNXN_BBXXXX else if (!grid->bSimple) @@ -883,13 +883,13 @@ static void fill_cell(nbnxn_search *nbs, #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB if (nbat->XFormat == nbatXYZQ) { - calc_bounding_box_xxxx_simd4(numAtoms, nbat->x + atomStart*nbat->xstride, + calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride, bb_work_aligned, pbb_ptr); } else #endif { - calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride, + calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride, pbb_ptr); } if (gmx_debug_at) @@ -907,7 +907,7 @@ static void fill_cell(nbnxn_search *nbs, /* Store the bounding boxes as xyz.xyz. 
*/ nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log); - calc_bounding_box(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride, + calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride, bb_ptr); if (gmx_debug_at) @@ -1226,13 +1226,8 @@ static void resizeForNumberOfCells(const nbnxn_grid_t &grid, */ nbs->a.resize(numNbnxnAtoms + numAtomsMoved); - /* We need padding up to a multiple of the buffer flag size: simply add */ - if (numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc) - { - nbnxn_atomdata_realloc(nbat, numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE); - } - - nbat->natoms = numNbnxnAtoms; + /* Make space in nbat for storing the atom coordinates */ + nbat->resizeCoordinateBuffer(numNbnxnAtoms); } /* Determine in which grid cells the atoms should go */ @@ -1441,8 +1436,6 @@ void nbnxn_put_on_grid(nbnxn_search_t nbs, grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c; grid->na_c_2log = get_2log(grid->na_c); - nbat->na_c = grid->na_c; - if (ddZone == 0) { grid->cell0 = 0; @@ -1510,7 +1503,12 @@ void nbnxn_put_on_grid(nbnxn_search_t nbs, if (ddZone == 0) { - nbat->natoms_local = nbat->natoms; + nbat->natoms_local = nbat->numAtoms(); + } + if (ddZone == static_cast(nbs->grid.size()) - 1) + { + /* We are done setting up all grids, we can resize the force buffers */ + nbat->resizeForceBuffers(); } nbs_cycle_stop(&nbs->cc[enbsCCgrid]); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.cpp index 7a009dc2c1..b4a3f1327f 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2017,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
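(The force-buffer sizing done by resizeForceBuffers(), called above once all grids are set up, pads the atom count up to a multiple of NBNXN_BUFFERFLAG_SIZE with the usual integer round-up idiom; a self-contained illustration, where blockSize stands in for NBNXN_BUFFERFLAG_SIZE:)

    // Round n up to the nearest multiple of blockSize (requires blockSize > 0).
    constexpr int roundUpToMultiple(int n, int blockSize)
    {
        return (n + blockSize - 1) / blockSize * blockSize;
    }

    static_assert(roundUpToMultiple(0, 8)  == 0,  "zero needs no padding");
    static_assert(roundUpToMultiple(1, 8)  == 8,  "a partial block pads to a full block");
    static_assert(roundUpToMultiple(16, 8) == 16, "an exact multiple is unchanged");
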
@@ -42,9 +42,7 @@ static void clear_f_all(const nbnxn_atomdata_t *nbat, real *f) { - int i; - - for (i = 0; i < nbat->natoms*nbat->fstride; i++) + for (int i = 0; i < nbat->numAtoms()*nbat->fstride; i++) { f[i] = 0; } @@ -103,24 +101,23 @@ reduce_energies_over_lists(const nbnxn_atomdata_t *nbat, real *Vvdw, real *Vc) { - int nb; - int i, j, ind, indr; + const int nenergrp = nbat->params().nenergrp; - for (nb = 0; nb < nlist; nb++) + for (int nb = 0; nb < nlist; nb++) { - for (i = 0; i < nbat->nenergrp; i++) + for (int i = 0; i < nenergrp; i++) { /* Reduce the diagonal terms */ - ind = i*nbat->nenergrp + i; + int ind = i*nenergrp + i; Vvdw[ind] += nbat->out[nb].Vvdw[ind]; Vc[ind] += nbat->out[nb].Vc[ind]; /* Reduce the off-diagonal terms */ - for (j = i+1; j < nbat->nenergrp; j++) + for (int j = i + 1; j < nenergrp; j++) { /* The output should contain only one off-diagonal part */ - ind = i*nbat->nenergrp + j; - indr = j*nbat->nenergrp + i; + int ind = i*nenergrp + j; + int indr = j*nenergrp + i; Vvdw[ind] += nbat->out[nb].Vvdw[ind] + nbat->out[nb].Vvdw[indr]; Vc[ind] += nbat->out[nb].Vc[ind] + nbat->out[nb].Vc[indr]; } diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp index 074f97974d..3f47f2d0b7 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp @@ -66,20 +66,10 @@ */ static void clearGroupEnergies(nbnxn_atomdata_output_t *out) { - for (int i = 0; i < out->nV; i++) - { - out->Vvdw[i] = 0; - out->Vc[i] = 0; - } - - for (int i = 0; i < out->nVS; i++) - { - out->VSvdw[i] = 0; - } - for (int i = 0; i < out->nVS; i++) - { - out->VSc[i] = 0; - } + std::fill(out->Vvdw.begin(), out->Vvdw.end(), 0.0_real); + std::fill(out->Vc.begin(), out->Vc.end(), 0.0_real); + std::fill(out->VSvdw.begin(), out->VSvdw.end(), 0.0_real); + std::fill(out->VSc.begin(), out->VSc.end(), 0.0_real); } /*! 
\brief Reduce the group-pair energy buffers produced by a SIMD kernel @@ -91,22 +81,21 @@ static void clearGroupEnergies(nbnxn_atomdata_output_t *out) * \tparam unrollj The unroll size for j-particles in the SIMD kernel * \param[in] numGroups The number of energy groups * \param[in] numGroups_2log Log2 of numGroups, rounded up - * \param[in] vVdwSimd SIMD Van der Waals energy buffers - * \param[in] vCoulombSimd SIMD Coulomb energy buffers - * \param[in,out] vVdw Van der Waals energy output buffer - * \param[in,out] vCoulomb Coulomb energy output buffer + * \param[in,out] out Struct with energy buffers */ template static void reduceGroupEnergySimdBuffers(int numGroups, int numGroups_2log, - const real * gmx_restrict vVdwSimd, - const real * gmx_restrict vCoulombSimd, - real * gmx_restrict vVdw, - real * gmx_restrict vCoulomb) + nbnxn_atomdata_output_t *out) { - const int unrollj_half = unrollj/2; + const int unrollj_half = unrollj/2; /* Energies are stored in SIMD registers with size 2^numGroups_2log */ - const int numGroupsStorage = (1 << numGroups_2log); + const int numGroupsStorage = (1 << numGroups_2log); + + const real * gmx_restrict vVdwSimd = out->VSvdw.data(); + const real * gmx_restrict vCoulombSimd = out->VSc.data(); + real * gmx_restrict vVdw = out->Vvdw.data(); + real * gmx_restrict vCoulomb = out->Vc.data(); /* The size of the SIMD energy group buffer array is: * numGroups*numGroups*numGroupsStorage*unrollj_half*simd_width @@ -133,7 +122,7 @@ reduceGroupEnergySimdBuffers(int numGroups, void nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, - const nbnxn_atomdata_t *nbat, + nbnxn_atomdata_t *nbat, const interaction_const_t *ic, rvec *shiftVectors, int forceFlags, @@ -174,6 +163,8 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, } } + const nbnxn_atomdata_t::Params &nbatParams = nbat->params(); + int vdwkt = 0; if (ic->vdwtype == evdwCUT) { @@ -181,7 +172,7 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, { case eintmodNONE: case eintmodPOTSHIFT: - switch (nbat->comb_rule) + switch (nbatParams.comb_rule) { case ljcrGEOM: vdwkt = vdwktLJCUT_COMBGEOM; break; case ljcrLB: vdwkt = vdwktLJCUT_COMBLB; break; @@ -231,7 +222,7 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, if (clearF == enbvClearFYes) { - clear_f(nbat, nb, out->f); + clear_f(nbat, nb, out->f.data()); } real *fshift_p; @@ -241,7 +232,7 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, } else { - fshift_p = out->fshift; + fshift_p = out->fshift.data(); if (clearF == enbvClearFYes) { @@ -258,7 +249,7 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_noener_ref[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p); break; #ifdef GMX_NBNXN_SIMD_2XNN @@ -266,7 +257,7 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_noener_simd_2xnn[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p); break; #endif @@ -275,7 +266,7 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_noener_simd_4xn[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p); break; #endif @@ -283,7 +274,7 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture"); } } - else if (out->nV == 1) + else if (out->Vvdw.size() == 1) { /* A single energy group (pair) */ out->Vvdw[0] = 0; @@ -295,20 +286,20 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_ener_ref[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p, - 
out->Vvdw, - out->Vc); + out->Vvdw.data(), + out->Vc.data()); break; #ifdef GMX_NBNXN_SIMD_2XNN case nbnxnk4xN_SIMD_2xNN: nbnxn_kernel_ener_simd_2xnn[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p, - out->Vvdw, - out->Vc); + out->Vvdw.data(), + out->Vc.data()); break; #endif #ifdef GMX_NBNXN_SIMD_4XN @@ -316,10 +307,10 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_ener_simd_4xn[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p, - out->Vvdw, - out->Vc); + out->Vvdw.data(), + out->Vc.data()); break; #endif default: @@ -340,10 +331,10 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_energrp_ref[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p, - out->Vvdw, - out->Vc); + out->Vvdw.data(), + out->Vc.data()); break; #ifdef GMX_NBNXN_SIMD_2XNN case nbnxnk4xN_SIMD_2xNN: @@ -351,10 +342,10 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_energrp_simd_2xnn[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p, - out->VSvdw, - out->VSc); + out->VSvdw.data(), + out->VSc.data()); break; #endif #ifdef GMX_NBNXN_SIMD_4XN @@ -363,10 +354,10 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, nbnxn_kernel_energrp_simd_4xn[coulkt][vdwkt](nbl[nb], nbat, ic, shiftVectors, - out->f, + out->f.data(), fshift_p, - out->VSvdw, - out->VSc); + out->VSvdw.data(), + out->VSc.data()); break; #endif default: @@ -378,22 +369,19 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, switch (unrollj) { case 2: - reduceGroupEnergySimdBuffers<2>(nbat->nenergrp, - nbat->neg_2log, - out->VSvdw, out->VSc, - out->Vvdw, out->Vc); + reduceGroupEnergySimdBuffers<2>(nbatParams.nenergrp, + nbatParams.neg_2log, + out); break; case 4: - reduceGroupEnergySimdBuffers<4>(nbat->nenergrp, - nbat->neg_2log, - out->VSvdw, out->VSc, - out->Vvdw, out->Vc); + reduceGroupEnergySimdBuffers<4>(nbatParams.nenergrp, + nbatParams.neg_2log, + out); break; case 8: - reduceGroupEnergySimdBuffers<8>(nbat->nenergrp, - nbat->neg_2log, - out->VSvdw, out->VSc, - out->Vvdw, out->Vc); + reduceGroupEnergySimdBuffers<8>(nbatParams.nenergrp, + nbatParams.neg_2log, + out); break; default: GMX_RELEASE_ASSERT(false, "Unsupported j-unroll size"); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h index 5f4ba0cde0..389f5e6daa 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -58,7 +58,7 @@ struct nonbonded_verlet_group_t; * within this function. 
* * \param[in,out] nbvg The group (local/non-local) to compute interaction for - * \param[in] nbat The atomdata for the interactions + * \param[in,out] nbat The atomdata for the interactions * \param[in] ic Non-bonded interaction constants * \param[in] shiftVectors The PBC shift vectors * \param[in] forceFlags Flags that tell what to compute @@ -69,7 +69,7 @@ struct nonbonded_verlet_group_t; */ void nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, - const nbnxn_atomdata_t *nbat, + nbnxn_atomdata_t *nbat, const interaction_const_t *ic, rvec *shiftVectors, int forceFlags, diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp index 0463f634e2..fcbf9d0195 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp @@ -64,13 +64,12 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu *nbl, rvec *shift_vec, int force_flags, int clearF, - real * f, + gmx::ArrayRef<real> f, real * fshift, real * Vc, real * Vvdw) { const nbnxn_sci_t *nbln; - const real *x; gmx_bool bEner; gmx_bool bEwald; const real *Ftab = nullptr; @@ -102,9 +101,6 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu *nbl, int int_bit; real fexcl; real c6, c12; - const real * shiftvec; - real * vdwparam; - int * type; const nbnxn_excl_t *excl[2]; int npair_tot, npair; @@ -117,7 +113,7 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu *nbl, if (clearF == enbvClearFYes) { - clear_f(nbat, 0, f); + clear_f(nbat, 0, f.data()); } bEner = ((force_flags & GMX_FORCE_ENERGY) != 0); @@ -128,18 +124,18 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu *nbl, Ftab = iconst->tabq_coul_F; } - rcut2 = iconst->rcoulomb*iconst->rcoulomb; - rvdw2 = iconst->rvdw*iconst->rvdw; + rcut2 = iconst->rcoulomb*iconst->rcoulomb; + rvdw2 = iconst->rvdw*iconst->rvdw; - rlist2 = nbl->rlist*nbl->rlist; + rlist2 = nbl->rlist*nbl->rlist; - type = nbat->type; - facel = iconst->epsfac; - shiftvec = shift_vec[0]; - vdwparam = nbat->nbfp; - ntype = nbat->ntype; + const int *type = nbat->params().type.data(); + facel = iconst->epsfac; + const real *shiftvec = shift_vec[0]; + const real *vdwparam = nbat->params().nbfp.data(); + ntype = nbat->params().numTypes; - x = nbat->x; + const real *x = nbat->x().data(); npair_tot = 0; nhwu = 0; diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h index a4a1b02e22..7a5a1cff26 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h @@ -39,6 +39,7 @@ #include "gromacs/math/vectypes.h" #include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/mdtypes/forcerec.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/real.h" /* Reference (slow) kernel for nb n vs n GPU type pair lists */ @@ -49,7 +50,7 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu *nbl, rvec *shift_vec, int force_flags, int clearF, - real * f, + gmx::ArrayRef<real> f, real * fshift, real * Vc, real * Vvdw); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h index 4e5f0481e5..a6d44626a7 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package.
* - * Copyright (c) 2012,2013,2014,2015,2016,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -50,7 +50,7 @@ cj = l_cj[cjind].cj; #ifdef ENERGY_GROUPS - egp_cj = nbat->energrp[cj]; + egp_cj = nbatParams.energrp[cj]; #endif for (i = 0; i < UNROLLI; i++) { @@ -274,7 +274,7 @@ #ifdef CALC_ENERGIES #ifdef ENERGY_GROUPS - Vvdw[egp_sh_i[i]+((egp_cj>>(nbat->neg_2log*j)) & egp_mask)] += VLJ; + Vvdw[egp_sh_i[i] + ((egp_cj >> (nbatParams.neg_2log*j)) & egp_mask)] += VLJ; #else Vvdw_ci += VLJ; /* 1 flop for LJ energy addition */ @@ -333,7 +333,7 @@ #ifdef CALC_ENERGIES #ifdef ENERGY_GROUPS - Vc[egp_sh_i[i]+((egp_cj>>(nbat->neg_2log*j)) & egp_mask)] += vcoul; + Vc[egp_sh_i[i] + ((egp_cj >> (nbatParams.neg_2log*j)) & egp_mask)] += vcoul; #else Vc_ci += vcoul; /* 1 flop for Coulomb energy addition */ diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h index b56f6aecdb..b50a45d9c0 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h @@ -107,11 +107,6 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) ) { const nbnxn_cj_t *l_cj; - const int *type; - const real *q; - const real *shiftvec; - const real *x; - const real *nbfp; real rcut2; #ifdef VDW_CUTOFF_CHECK real rvdw2; @@ -145,7 +140,6 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) #ifdef CALC_ENERGIES real lje_vc; #endif - const real *ljc; #endif #ifdef CALC_COUL_RF @@ -179,6 +173,8 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) swF4 = 5*ic->vdw_switch.c5; #endif + const nbnxn_atomdata_t::Params &nbatParams = nbat->params(); + #ifdef LJ_EWALD lje_coeff2 = ic->ewaldcoeff_lj*ic->ewaldcoeff_lj; lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2/6.0; @@ -186,7 +182,7 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) lje_vc = ic->sh_lj_ewald; #endif - ljc = nbat->nbfp_comb; + const real *ljc = nbatParams.nbfp_comb.data(); #endif #ifdef CALC_COUL_RF @@ -210,22 +206,22 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) #endif #ifdef ENERGY_GROUPS - egp_mask = (1<<nbat->neg_2log) - 1; + egp_mask = (1 << nbatParams.neg_2log) - 1; #endif - rcut2 = ic->rcoulomb*ic->rcoulomb; + rcut2 = ic->rcoulomb*ic->rcoulomb; #ifdef VDW_CUTOFF_CHECK - rvdw2 = ic->rvdw*ic->rvdw; + rvdw2 = ic->rvdw*ic->rvdw; #endif - ntype2 = nbat->ntype*2; - nbfp = nbat->nbfp; - q = nbat->q; - type = nbat->type; - facel = ic->epsfac; - shiftvec = shift_vec[0]; - x = nbat->x; + ntype2 = nbatParams.numTypes*2; + const real *nbfp = nbatParams.nbfp.data(); + const real *q = nbatParams.q.data(); + const int *type = nbatParams.type.data(); + facel = ic->epsfac; + const real *shiftvec = shift_vec[0]; + const real *x = nbat->x().data(); l_cj = nbl->cj.data(); @@ -265,7 +261,7 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) #else for (i = 0; i < UNROLLI; i++) { - egp_sh_i[i] = ((nbat->energrp[ci]>>(i*nbat->neg_2log)) & egp_mask)*nbat->nenergrp; + egp_sh_i[i] = ((nbatParams.energrp[ci] >> (i*nbatParams.neg_2log)) & egp_mask)*nbatParams.nenergrp; } #endif #endif @@ -303,7 +299,7 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) { int egp_ind; #ifdef ENERGY_GROUPS - egp_ind =
egp_sh_i[i] + ((nbat->energrp[ci]>>(i*nbat->neg_2log)) & egp_mask); + egp_ind = egp_sh_i[i] + ((nbatParams.energrp[ci] >> (i*nbatParams.neg_2log)) & egp_mask); #else egp_ind = 0; #endif @@ -312,7 +308,7 @@ NBK_FUNC_NAME(_VgrpF) // NOLINT(misc-definitions-in-headers) #ifdef LJ_EWALD /* LJ Ewald self interaction */ - Vvdw[egp_ind] += 0.5*nbat->nbfp[nbat->type[ci*UNROLLI+i]*(nbat->ntype + 1)*2]/6*lje_coeff6_6; + Vvdw[egp_ind] += 0.5*nbatParams.nbfp[nbatParams.type[ci*UNROLLI+i]*(nbatParams.numTypes + 1)*2]/6*lje_coeff6_6; #endif } } diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_prune.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_prune.cpp index 183c2cfed1..45b5eda20a 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_prune.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref_prune.cpp @@ -60,7 +60,7 @@ nbnxn_kernel_prune_ref(NbnxnPairlistCpu * nbl, nbnxn_cj_t * gmx_restrict cjInner = nbl->cj.data(); const real * gmx_restrict shiftvec = shift_vec[0]; - const real * gmx_restrict x = nbat->x; + const real * gmx_restrict x = nbat->x().data(); const real rlist2 = rlistInner*rlistInner; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h index 844f55d7f5..38724cb2b5 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
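The ENERGY_GROUPS code above and below repeatedly decodes per-cluster energy-group indices with shift/mask expressions such as (nbatParams.energrp[ci] >> (i*nbatParams.neg_2log)) & egp_mask. A minimal standalone sketch of that packing scheme (illustrative names, not code from this patch): each cluster stores one group index per atom, neg_2log bits wide, packed into a single int.

    #include <cassert>
    #include <vector>

    // Pack one energy-group index per atom, neg_2log bits each, into one int,
    // mirroring how the energrp buffer stores groups per cluster.
    static int packGroups(const std::vector<int> &groupPerAtom, int neg_2log)
    {
        int packed = 0;
        for (size_t i = 0; i < groupPerAtom.size(); i++)
        {
            packed |= groupPerAtom[i] << (i*neg_2log);
        }
        return packed;
    }

    // Recover the group of atom i, as the kernels do with egp_mask/neg_2log.
    static int groupOfAtom(int packed, int i, int neg_2log)
    {
        const int egp_mask = (1 << neg_2log) - 1;
        return (packed >> (i*neg_2log)) & egp_mask;
    }

    int main()
    {
        // 4 atoms per cluster, 4 energy groups -> neg_2log = 2
        const std::vector<int> groups = { 3, 0, 2, 1 };
        const int              packed = packGroups(groups, 2);
        for (int i = 0; i < 4; i++)
        {
            assert(groupOfAtom(packed, i, 2) == groups[i]);
        }
        return 0;
    }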
@@ -803,7 +803,7 @@ { int egps_j; #if UNROLLJ == 2 - egps_j = nbat->energrp[cj>>1]; + egps_j = nbatParams.energrp[cj >> 1]; egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride; #else /* We assume UNROLLI <= UNROLLJ */ @@ -811,7 +811,7 @@ for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++) { int jj; - egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi]; + egps_j = nbatParams.energrp[cj*(UNROLLJ/UNROLLI) + jdi]; for (jj = 0; jj < (UNROLLI/2); jj++) { egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h index ced88fcf03..3a289895fb 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h @@ -37,10 +37,6 @@ { using namespace gmx; const nbnxn_cj_t *l_cj; - const real *q; - const real *shiftvec; - const real *x; - real facel; int ci, ci_sh; int ish, ish3; gmx_bool do_LJ, half_LJ, do_coul; @@ -71,7 +67,6 @@ SimdBool diagonal_mask1_S0, diagonal_mask1_S2; #endif - unsigned *exclusion_filter; SimdBitMask filter_S0, filter_S2; SimdReal zero_S(0.0); @@ -132,8 +127,6 @@ #endif #ifdef LJ_COMB_LB - const real *ljc; - SimdReal hsig_i_S0, seps_i_S0; SimdReal hsig_i_S2, seps_i_S2; #else @@ -141,10 +134,6 @@ alignas(GMX_SIMD_ALIGNMENT) real pvdw_c6[2*UNROLLI*UNROLLJ]; real *pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ; #endif - -#if defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM - const real *ljc; -#endif #endif /* LJ_COMB_LB */ SimdReal minRsq_S; @@ -159,17 +148,19 @@ int npair = 0; #endif + const nbnxn_atomdata_t::Params &nbatParams = nbat->params(); + #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM - ljc = nbat->lj_comb; + const real * gmx_restrict ljc = nbatParams.lj_comb.data(); #endif #if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined FIX_LJ_C) /* No combination rule used */ - real *nbfp_ptr = nbat->nbfp_aligned; - const int *type = nbat->type; + const real * gmx_restrict nbfp_ptr = nbatParams.nbfp_aligned.data(); + const int * gmx_restrict type = nbatParams.type.data(); #endif /* Load j-i for the first i */ - diagonal_jmi_S = load(nbat->simd_2xnn_diagonal_j_minus_i); + diagonal_jmi_S = load(nbat->simdMasks.diagonal_2xnn_j_minus_i.data()); /* Generate all the diagonal masks as comparison results */ #if UNROLLI == UNROLLJ diagonal_mask_S0 = (zero_S < diagonal_jmi_S); @@ -194,9 +185,9 @@ /* Load masks for topology exclusion masking. filter_stride is static const, so the conditional will be optimized away. */ #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL - exclusion_filter = nbat->simd_exclusion_filter64; + const std::uint64_t * gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter64.data(); #else - exclusion_filter = nbat->simd_exclusion_filter; + const std::uint32_t * gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter.data(); #endif /* Here we cast the exclusion filters from unsigned * to int * or real *. 
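The diagonal_2xnn_j_minus_i/diagonal_4xn_j_minus_i data loaded above exists to mask out the lower triangle of the diagonal cluster pair, so each pair is computed once and self-interactions are skipped. A scalar sketch of the idea (illustrative, not the SIMD code itself): the mask keeps an i,j pair only where j - i > 0, which is what the zero_S < diagonal_jmi_S comparisons compute lane-wise.

    #include <array>
    #include <cstdio>

    int main()
    {
        constexpr int unroll = 4;
        // Scalar stand-in for diagonal_4xn_j_minus_i: j - i for i = 0
        const std::array<int, unroll> jMinusI = { 0, 1, 2, 3 };
        for (int i = 0; i < unroll; i++)
        {
            for (int j = 0; j < unroll; j++)
            {
                // Lane-wise equivalent of (zero_S < diagonal_jmi_S) after
                // decrementing the j-minus-i vector once per i
                const bool interact = (jMinusI[j] - i) > 0;
                std::printf("%c", interact ? '1' : '0');
            }
            std::printf("\n"); // prints 0111 / 0011 / 0001 / 0000
        }
        return 0;
    }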
@@ -305,12 +296,12 @@ rcvdw2_S = SimdReal(ic->rvdw*ic->rvdw); #endif - minRsq_S = SimdReal(NBNXN_MIN_RSQ); + minRsq_S = SimdReal(NBNXN_MIN_RSQ); - q = nbat->q; - facel = ic->epsfac; - shiftvec = shift_vec[0]; - x = nbat->x; + const real * gmx_restrict q = nbatParams.q.data(); + const real facel = ic->epsfac; + const real * gmx_restrict shiftvec = shift_vec[0]; + const real * gmx_restrict x = nbat->x().data(); #ifdef FIX_LJ_C @@ -338,13 +329,13 @@ #endif /* FIX_LJ_C */ #ifdef ENERGY_GROUPS - egps_ishift = nbat->neg_2log; + egps_ishift = nbatParams.neg_2log; egps_imask = (1<<egps_ishift) - 1; - egps_jshift = 2*nbat->neg_2log; + egps_jshift = 2*nbatParams.neg_2log; egps_jmask = (1<<egps_jshift) - 1; egps_jstride = (UNROLLJ>>1)*UNROLLJ; /* Major division is over i-particle energy groups, determine the stride */ - Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride; + Vstride_i = nbatParams.nenergrp*(1 << nbatParams.neg_2log)*egps_jstride; #endif l_cj = nbl->cj.data(); @@ -389,7 +380,7 @@ half_LJ = (((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) != 0) || !do_LJ) && do_coul; #ifdef ENERGY_GROUPS - egps_i = nbat->energrp[ci]; + egps_i = nbatParams.energrp[ci]; { int ia, egp_ia; @@ -457,7 +448,7 @@ { real c6_i; - c6_i = nbat->nbfp[nbat->type[sci+ia]*(nbat->ntype + 1)*2]/6; + c6_i = nbatParams.nbfp[nbatParams.type[sci+ia]*(nbatParams.numTypes + 1)*2]/6; #ifdef ENERGY_GROUPS vvdwtp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride] #else @@ -520,13 +511,14 @@ c12s_S2 = loadU1DualHsimd(ljc+sci2+STRIDE+2); } #elif !defined LJ_COMB_LB && !defined FIX_LJ_C - const real *nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*c_simdBestPairAlignment; - const real *nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*c_simdBestPairAlignment; + const int numTypes = nbatParams.numTypes; + const real *nbfp0 = nbfp_ptr + type[sci ]*numTypes*c_simdBestPairAlignment; + const real *nbfp1 = nbfp_ptr + type[sci+1]*numTypes*c_simdBestPairAlignment; const real *nbfp2 = nullptr, *nbfp3 = nullptr; if (!half_LJ) { - nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*c_simdBestPairAlignment; - nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*c_simdBestPairAlignment; + nbfp2 = nbfp_ptr + type[sci+2]*numTypes*c_simdBestPairAlignment; + nbfp3 = nbfp_ptr + type[sci+3]*numTypes*c_simdBestPairAlignment; } #endif #endif diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_prune.cpp b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_prune.cpp index 96323e49a4..daca1cd809 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_prune.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_prune.cpp @@ -67,7 +67,7 @@ nbnxn_kernel_prune_2xnn(NbnxnPairlistCpu * nbl, nbnxn_cj_t * gmx_restrict cjInner = nbl->cj.data(); const real * gmx_restrict shiftvec = shift_vec[0]; - const real * gmx_restrict x = nbat->x; + const real * gmx_restrict x = nbat->x().data(); const SimdReal rlist2_S(rlistInner*rlistInner); diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h index b787b9d79a..7ac6e4f62a 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h @@ -104,7 +104,7 @@ gmx_load_simd_4xn_interactions(int excl, SimdBitMask gmx_unused filter_S1, SimdBitMask gmx_unused filter_S2, SimdBitMask gmx_unused filter_S3, - real gmx_unused *simd_interaction_array, + const real gmx_unused *simd_interaction_array, gmx::SimdBool *interact_S0, gmx::SimdBool *interact_S1, gmx::SimdBool
*interact_S2, diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h index 5bc77035f0..8b84291627 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -271,7 +271,7 @@ gmx_load_simd_4xn_interactions(l_cj[cjind].excl, filter_S0, filter_S1, filter_S2, filter_S3, - nbat->simd_interaction_array, + nbat->simdMasks.interaction_array.data(), &interact_S0, &interact_S1, &interact_S2, &interact_S3); #endif /* CHECK_EXCLS */ @@ -1066,7 +1066,7 @@ { int egps_j; #if UNROLLJ == 2 - egps_j = nbat->energrp[cj>>1]; + egps_j = nbatParams.energrp[cj >> 1]; egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride; #else /* We assume UNROLLI <= UNROLLJ */ @@ -1074,7 +1074,7 @@ for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++) { int jj; - egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi]; + egps_j = nbatParams.energrp[cj*(UNROLLJ/UNROLLI) + jdi]; for (jj = 0; jj < (UNROLLI/2); jj++) { egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h index ad4a1b499d..df29ca0b56 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h @@ -37,10 +37,6 @@ { using namespace gmx; const nbnxn_cj_t *l_cj; - const real * q; - const real *shiftvec; - const real *x; - real facel; int ci, ci_sh; int ish, ish3; gmx_bool do_LJ, half_LJ, do_coul; @@ -75,11 +71,6 @@ SimdBool diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3; #endif -#if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL - std::uint64_t *exclusion_filter; -#else - std::uint32_t *exclusion_filter; -#endif SimdBitMask filter_S0, filter_S1, filter_S2, filter_S3; SimdReal zero_S(0.0); @@ -141,17 +132,10 @@ #endif #ifdef LJ_COMB_LB - const real *ljc; - SimdReal hsig_i_S0, seps_i_S0; SimdReal hsig_i_S1, seps_i_S1; SimdReal hsig_i_S2, seps_i_S2; SimdReal hsig_i_S3, seps_i_S3; -#else - -#if defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM - const real *ljc; -#endif #endif /* LJ_COMB_LB */ SimdReal minRsq_S; @@ -166,17 +150,19 @@ int npair = 0; #endif + const nbnxn_atomdata_t::Params &nbatParams = nbat->params(); + #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM - ljc = nbat->lj_comb; + const real * gmx_restrict ljc = nbatParams.lj_comb.data(); #endif #if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined FIX_LJ_C) /* No combination rule used */ - real *nbfp_ptr = nbat->nbfp_aligned; - const int *type = nbat->type; + const real * gmx_restrict nbfp_ptr = nbatParams.nbfp_aligned.data(); + const int * gmx_restrict type = nbatParams.type.data(); #endif /* Load j-i for the first i */ - diagonal_jmi_S = load(nbat->simd_4xn_diagonal_j_minus_i); + diagonal_jmi_S = 
load(nbat->simdMasks.diagonal_4xn_j_minus_i.data()); /* Generate all the diagonal masks as comparison results */ #if UNROLLI == UNROLLJ diagonal_mask_S0 = (zero_S < diagonal_jmi_S); @@ -199,7 +185,7 @@ #if UNROLLI == 2*UNROLLJ /* Load j-i for the second half of the j-cluster */ - diagonal_jmi_S = load(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ); + diagonal_jmi_S = load(nbat->simdMasks.diagonal_4xn_j_minus_i.data() + UNROLLJ); #endif diagonal_mask1_S0 = (zero_S < diagonal_jmi_S); @@ -213,9 +199,9 @@ #endif #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL - exclusion_filter = nbat->simd_exclusion_filter64; + const std::uint64_t * gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter64.data(); #else - exclusion_filter = nbat->simd_exclusion_filter; + const std::uint32_t * gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter.data(); #endif /* Here we cast the exclusion filters from unsigned * to int * or real *. @@ -326,12 +312,12 @@ rcvdw2_S = SimdReal(ic->rvdw*ic->rvdw); #endif - minRsq_S = SimdReal(NBNXN_MIN_RSQ); + minRsq_S = SimdReal(NBNXN_MIN_RSQ); - q = nbat->q; - facel = ic->epsfac; - shiftvec = shift_vec[0]; - x = nbat->x; + const real * gmx_restrict q = nbatParams.q.data(); + const real facel = ic->epsfac; + const real * gmx_restrict shiftvec = shift_vec[0]; + const real * gmx_restrict x = nbat->x().data(); #ifdef FIX_LJ_C alignas(GMX_SIMD_ALIGNMENT) real pvdw_c6[2*UNROLLI*UNROLLJ]; @@ -361,13 +347,13 @@ #endif /* FIX_LJ_C */ #ifdef ENERGY_GROUPS - egps_ishift = nbat->neg_2log; + egps_ishift = nbatParams.neg_2log; egps_imask = (1<<egps_ishift) - 1; - egps_jshift = 2*nbat->neg_2log; + egps_jshift = 2*nbatParams.neg_2log; egps_jmask = (1<<egps_jshift) - 1; egps_jstride = (UNROLLJ>>1)*UNROLLJ; /* Major division is over i-particle energy groups, determine the stride */ - Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride; + Vstride_i = nbatParams.nenergrp*(1 << nbatParams.neg_2log)*egps_jstride; #endif l_cj = nbl->cj.data(); @@ -413,7 +399,7 @@ half_LJ = (((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) != 0) || !do_LJ) && do_coul; #ifdef ENERGY_GROUPS - egps_i = nbat->energrp[ci]; + egps_i = nbatParams.energrp[ci]; { int ia, egp_ia; @@ -484,7 +470,7 @@ { real c6_i; - c6_i = nbat->nbfp[nbat->type[sci+ia]*(nbat->ntype + 1)*2]/6; + c6_i = nbatParams.nbfp[nbatParams.type[sci+ia]*(nbatParams.numTypes + 1)*2]/6; #ifdef ENERGY_GROUPS vvdwtp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride] #else @@ -559,13 +545,14 @@ c12s_S3 = setZero(); } #else - const real *nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*c_simdBestPairAlignment; - const real *nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*c_simdBestPairAlignment; + const int numTypes = nbatParams.numTypes; + const real *nbfp0 = nbfp_ptr + type[sci ]*numTypes*c_simdBestPairAlignment; + const real *nbfp1 = nbfp_ptr + type[sci+1]*numTypes*c_simdBestPairAlignment; const real *nbfp2 = nullptr, *nbfp3 = nullptr; if (!half_LJ) { - nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*c_simdBestPairAlignment; - nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*c_simdBestPairAlignment; + nbfp2 = nbfp_ptr + type[sci+2]*numTypes*c_simdBestPairAlignment; + nbfp3 = nbfp_ptr + type[sci+3]*numTypes*c_simdBestPairAlignment; } #endif #endif diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_prune.cpp b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_prune.cpp index 8a84c93484..e7f829c47b 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_prune.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_prune.cpp @@ -67,7 +67,7 @@ nbnxn_kernel_prune_4xn(NbnxnPairlistCpu
* nbl, nbnxn_cj_t * gmx_restrict cjInner = nbl->cj.data(); const real * gmx_restrict shiftvec = shift_vec[0]; - const real * gmx_restrict x = nbat->x; + const real * gmx_restrict x = nbat->x().data(); const SimdReal rlist2_S(rlistInner*rlistInner); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 22c6ba169e..8908f3fb2e 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -410,7 +410,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, } /* HtoD x, q */ - ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4, + ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4, adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr); if (bDoTime) @@ -729,7 +729,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, * (and energies/shift forces if required). */ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, - const struct nbnxn_atomdata_t *nbatom, + struct nbnxn_atomdata_t *nbatom, int flags, int aloc, bool haveOtherWork) @@ -780,7 +780,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, } /* DtoH f */ - ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float), + ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float), (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); /* kick off work */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index 9547252b74..4e1b973d67 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -145,7 +145,7 @@ static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtim /* An element of the shift_vec device buffer has the same size as one element of the host side shift_vec buffer. 
*/ - ad->shift_vec_elem_size = sizeof(*nbnxn_atomdata_t::shift_vec); + ad->shift_vec_elem_size = sizeof(*nbnxn_atomdata_t::shift_vec.data()); ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error); @@ -291,7 +291,7 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic, static void init_nbparam(cl_nbparam_t *nbp, const interaction_const_t *ic, const NbnxnListParameters *listParams, - const nbnxn_atomdata_t *nbat, + const nbnxn_atomdata_t::Params &nbatParams, const gmx_device_runtime_data_t *runData) { cl_int cl_error; @@ -299,7 +299,7 @@ static void init_nbparam(cl_nbparam_t *nbp, set_cutoff_parameters(nbp, ic, listParams); map_interaction_types_to_gpu_kernel_flavors(ic, - nbat->comb_rule, + nbatParams.comb_rule, &(nbp->eeltype), &(nbp->vdwtype)); @@ -307,11 +307,11 @@ static void init_nbparam(cl_nbparam_t *nbp, { if (ic->ljpme_comb_rule == ljcrGEOM) { - GMX_ASSERT(nbat->comb_rule == ljcrGEOM, "Combination rule mismatch!"); + GMX_ASSERT(nbatParams.comb_rule == ljcrGEOM, "Combination rule mismatch!"); } else { - GMX_ASSERT(nbat->comb_rule == ljcrLB, "Combination rule mismatch!"); + GMX_ASSERT(nbatParams.comb_rule == ljcrLB, "Combination rule mismatch!"); } } /* generate table for PME */ @@ -342,8 +342,8 @@ static void init_nbparam(cl_nbparam_t *nbp, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); } - int nnbfp = 2*nbat->ntype*nbat->ntype; - int nnbfp_comb = 2*nbat->ntype; + const int nnbfp = 2*nbatParams.numTypes*nbatParams.numTypes; + const int nnbfp_comb = 2*nbatParams.numTypes; { /* Switched from using textures to using buffers */ // TODO: decide which alternative is most efficient - textures or buffers. /* nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error); */ - nbp->nbfp_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, - nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error); + nbp->nbfp_climg2d = + clCreateBuffer(runData->context, + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + nnbfp*sizeof(cl_float), + const_cast<real *>(nbatParams.nbfp.data()), + &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -369,8 +373,12 @@ static void init_nbparam(cl_nbparam_t *nbp, // TODO: decide which alternative is most efficient - textures or buffers.
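Note on the const_cast above: with the parameters now reached through a const nbnxn_atomdata_t::Params &, the host pointer comes out of the vector's data() as a const real *, while clCreateBuffer declares its host_ptr parameter as non-const void * even though CL_MEM_COPY_HOST_PTR only reads from it. A self-contained sketch of that pattern (stand-in function, not the OpenCL API):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Stand-in for a C API like clCreateBuffer(..., void *host_ptr, ...) that
    // only reads from 'data' but is not declared const-correct.
    static void legacyCopyIn(void *data, std::size_t bytes)
    {
        std::printf("copying %zu bytes from %p\n", bytes, data);
    }

    int main()
    {
        const std::vector<float> nbfp = { 1.0f, 2.0f, 3.0f };
        // nbfp.data() is 'const float *'; the cast is safe because the callee
        // never writes through the pointer.
        legacyCopyIn(const_cast<float *>(nbfp.data()), nbfp.size()*sizeof(float));
        return 0;
    }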
/* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/ - nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, - nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error); + nbp->nbfp_comb_climg2d = + clCreateBuffer(runData->context, + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + nnbfp_comb*sizeof(cl_float), + const_cast<real *>(nbatParams.nbfp_comb.data()), + &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); } @@ -598,10 +606,10 @@ static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb) static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t *nb, const interaction_const_t *ic, const NbnxnListParameters *listParams, - const nbnxn_atomdata_t *nbat) + const nbnxn_atomdata_t::Params &nbatParams) { - init_atomdata_first(nb->atdat, nbat->ntype, nb->dev_rundata); - init_nbparam(nb->nbparam, ic, listParams, nbat, nb->dev_rundata); + init_atomdata_first(nb->atdat, nbatParams.numTypes, nb->dev_rundata); + init_nbparam(nb->nbparam, ic, listParams, nbatParams, nb->dev_rundata); } @@ -695,7 +703,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, init_timings(nb->timings); } - nbnxn_ocl_init_const(nb, ic, listParams, nbat); + nbnxn_ocl_init_const(nb, ic, listParams, nbat->params()); /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var. * TODO: decide about NVIDIA @@ -839,7 +847,7 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) { - ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0, + ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0, SHIFTS * adat->shift_vec_elem_size, ls, nullptr); adat->bShiftVecUploaded = CL_TRUE; } @@ -857,7 +865,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, cl_atomdata_t *d_atdat = nb->atdat; cl_command_queue ls = nb->stream[eintLocal]; - natoms = nbat->natoms; + natoms = nbat->numAtoms(); realloced = false; if (bDoTime) { @@ -923,12 +931,12 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, if (useLjCombRule(nb->nbparam->vdwtype)) { - ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0, + ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { - ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0, + ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms*sizeof(int), ls, bDoTime ?
timers->atdat.fetchNextEvent() : nullptr); } diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h index 9c01c7125b..1ac278db91 100644 --- a/src/gromacs/mdlib/nbnxn_pairlist.h +++ b/src/gromacs/mdlib/nbnxn_pairlist.h @@ -40,6 +40,7 @@ #include +#include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/math/vectypes.h" #include "gromacs/mdlib/nbnxn_consts.h" #include "gromacs/mdtypes/nblist.h" @@ -52,6 +53,10 @@ struct NbnxnPairlistCpuWork; struct NbnxnPairlistGpuWork; struct tMPI_Atomic; +/* Convenience type for vector with aligned memory */ +template<class T> +using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>; + /* Convenience type for vector that avoids initialization at resize() */ template<class T> using FastVector = std::vector < T, gmx::DefaultInitializationAllocator < T>>; @@ -258,16 +263,28 @@ enum { nbatXYZ, nbatXYZQ, nbatX4, nbatX8 }; -typedef struct { - real *f; /* f, size natoms*fstride */ - real *fshift; /* Shift force array, size SHIFTS*DIM */ - int nV; /* The size of *Vvdw and *Vc */ - real *Vvdw; /* Temporary Van der Waals group energy storage */ - real *Vc; /* Temporary Coulomb group energy storage */ - int nVS; /* The size of *VSvdw and *VSc */ - real *VSvdw; /* Temporary SIMD Van der Waals group energy storage */ - real *VSc; /* Temporary SIMD Coulomb group energy storage */ -} nbnxn_atomdata_output_t; +// Struct that holds force and energy output buffers +struct nbnxn_atomdata_output_t +{ + /* Constructor + * + * \param[in] nb_kernel_type Type of non-bonded kernel + * \param[in] numEnergyGroups The number of energy groups + * \param[in] simdEnergyBufferStride Stride for entries in the energy buffers for SIMD kernels + * \param[in] pinningPolicy Sets the pinning policy for all buffers used on the GPU + */ + nbnxn_atomdata_output_t(int nb_kernel_type, + int numEnergyGroups, + int simdEnergyBufferStride, + gmx::PinningPolicy pinningPolicy); + + gmx::HostVector<real> f; // f, size natoms*fstride + gmx::HostVector<real> fshift; // Shift force array, size SHIFTS*DIM + gmx::HostVector<real> Vvdw; // Temporary Van der Waals group energy storage + gmx::HostVector<real> Vc; // Temporary Coulomb group energy storage + AlignedVector<real> VSvdw; // Temporary SIMD Van der Waals group energy storage + AlignedVector<real> VSc; // Temporary SIMD Coulomb group energy storage +}; /* Block size in atoms for the non-bonded thread force-buffer reduction, * should be a multiple of all cell and x86 SIMD sizes (i.e. 2, 4 and 8).
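With the new nbnxn_atomdata_output_t above, the explicit nV/nVS size fields disappear: each buffer carries its own size, and clearing reduces to std::fill, as clearGroupEnergies does earlier in this patch. A compilable sketch with plain std::vector standing in for HostVector/AlignedVector:

    #include <algorithm>
    #include <vector>

    struct OutputSketch
    {
        std::vector<float> Vvdw;  // was: real *Vvdw + int nV
        std::vector<float> Vc;    //      (the size now travels with the buffer)
        std::vector<float> VSvdw; // was: real *VSvdw + int nVS
        std::vector<float> VSc;
    };

    static void clearGroupEnergiesSketch(OutputSketch *out)
    {
        std::fill(out->Vvdw.begin(), out->Vvdw.end(), 0.0f);
        std::fill(out->Vc.begin(), out->Vc.end(), 0.0f);
        std::fill(out->VSvdw.begin(), out->VSvdw.end(), 0.0f);
        std::fill(out->VSc.begin(), out->VSc.end(), 0.0f);
    }

    int main()
    {
        OutputSketch out;
        out.Vvdw.resize(4, 1.0f);
        out.Vc.resize(4, 1.0f);
        clearGroupEnergiesSketch(&out);
        return (out.Vvdw[0] == 0.0f && out.Vc[3] == 0.0f) ? 0 : 1;
    }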
@@ -298,49 +315,132 @@ enum { ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR }; -typedef struct nbnxn_atomdata_t { //NOLINT(clang-analyzer-optin.performance.Padding) - nbnxn_alloc_t *alloc; - nbnxn_free_t *free; - int ntype; /* The number of different atom types */ - real *nbfp; /* Lennard-Jones 6*C6 and 12*C12 params, size ntype^2*2 */ - int comb_rule; /* Combination rule, see enum above */ - real *nbfp_comb; /* LJ parameter per atom type, size ntype*2 */ - real *nbfp_aligned; /* As nbfp, but with an alignment (stride) suitable - * for the present SIMD architectures - */ - int natoms; /* Number of atoms */ - int natoms_local; /* Number of local atoms */ - int *type; /* Atom types */ - real *lj_comb; /* LJ parameters per atom for combining for pairs */ - int XFormat; /* The format of x (and q), enum */ - int FFormat; /* The format of f, enum */ - real *q; /* Charges, can be NULL if incorporated in x */ - int na_c; /* The number of atoms per cluster */ - int nenergrp; /* The number of energy groups */ - int neg_2log; /* Log2 of nenergrp */ - int *energrp; /* The energy groups per cluster, can be NULL */ - gmx_bool bDynamicBox; /* Do we need to update shift_vec every step? */ - rvec *shift_vec; /* Shift vectors, copied from t_forcerec */ - int xstride; /* stride for a coordinate in x (usually 3 or 4) */ - int fstride; /* stride for a coordinate in f (usually 3 or 4) */ - real *x; /* x and possibly q, size natoms*xstride */ - - /* j-atom minus i-atom index for generating self and Newton exclusions * cluster-cluster pairs of the diagonal, for 4xn and 2xnn kernels. +/* Struct that stores atom related data for the nbnxn module * + * Note: performance would improve slightly if the std::vector containers + * in this struct did not initialize during resize(). + */ +struct nbnxn_atomdata_t +{ //NOLINT(clang-analyzer-optin.performance.Padding) + struct Params + { + /* Constructor + * + * \param[in] pinningPolicy Sets the pinning policy for all data that might be transferred to a GPU + */ + Params(gmx::PinningPolicy pinningPolicy); + + // The number of different atom types + int numTypes; + // Lennard-Jones 6*C6 and 12*C12 parameters, size numTypes^2*2 + gmx::HostVector<real> nbfp; + // Combination rule, see enum defined above + int comb_rule; + // LJ parameters per atom type, size numTypes*2 + gmx::HostVector<real> nbfp_comb; + // As nbfp, but with a stride for the present SIMD architecture + AlignedVector<real> nbfp_aligned; + // Atom types per atom + gmx::HostVector<int> type; + // LJ parameters per atom for fast SIMD loading + gmx::HostVector<real> lj_comb; + // Charges per atom, not set with format nbatXYZQ + gmx::HostVector<real> q; + // The number of energy groups + int nenergrp; + // 2log(nenergrp) + int neg_2log; + // The energy groups, one int entry per cluster, only set when needed + gmx::HostVector<int> energrp; + }; + + // Diagonal and topology exclusion helper data for all SIMD kernels + struct SimdMasks + { + SimdMasks(); + + // Helper data for setting up diagonal exclusion masks in the SIMD 4xN kernels + AlignedVector<real> diagonal_4xn_j_minus_i; + // Helper data for setting up diagonal exclusion masks in the SIMD 2xNN kernels + AlignedVector<real> diagonal_2xnn_j_minus_i; + // Filters for topology exclusion masks for the SIMD kernels + AlignedVector<std::uint32_t> exclusion_filter; + // Filters for topology exclusion masks for double SIMD kernels without SIMD int32 logical support + AlignedVector<std::uint64_t> exclusion_filter64; + // Array of masks needed for exclusions + AlignedVector<real> interaction_array; + }; + + /* Constructor + * + * \param[in] pinningPolicy Sets
the pinning policy for all data that might be transferred to a GPU */ - real *simd_4xn_diagonal_j_minus_i; - real *simd_2xnn_diagonal_j_minus_i; - /* Filters for topology exclusion masks for the SIMD kernels. */ - uint32_t *simd_exclusion_filter; - uint64_t *simd_exclusion_filter64; //!< Used for double w/o SIMD int32 logical support - real *simd_interaction_array; /* Array of masks needed for exclusions */ - int nout; /* The number of force arrays */ - nbnxn_atomdata_output_t *out; /* Output data structures */ - int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */ - gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */ - nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. */ - gmx_bool bUseTreeReduce; /* Use tree for force reduction */ - tMPI_Atomic *syncStep; /* Synchronization step for tree reduce */ -} nbnxn_atomdata_t; + nbnxn_atomdata_t(gmx::PinningPolicy pinningPolicy); + + /* Returns a const reference to the parameters */ + const Params &params() const + { + return params_; + } + + /* Returns a non-const reference to the parameters */ + Params &paramsDeprecated() + { + return params_; + } + + /* Returns the current total number of atoms stored */ + int numAtoms() const + { + return numAtoms_; + } + + /* Return the coordinate buffer, and q with xFormat==nbatXYZQ */ + gmx::ArrayRef<const real> x() const + { + return x_; + } + + /* Return the coordinate buffer, and q with xFormat==nbatXYZQ */ + gmx::ArrayRef<real> x() + { + return x_; + } + + /* Resizes the coordinate buffer and sets the number of atoms */ + void resizeCoordinateBuffer(int numAtoms); + + /* Resizes the force buffers for the current number of atoms */ + void resizeForceBuffers(); + + private: + // The LJ and charge parameters + Params params_; + // The total number of atoms currently stored + int numAtoms_; + public: + int natoms_local; /* Number of local atoms */ + int XFormat; /* The format of x (and q), enum */ + int FFormat; /* The format of f, enum */ + gmx_bool bDynamicBox; /* Do we need to update shift_vec every step? */ + gmx::HostVector<gmx::RVec> shift_vec; /* Shift vectors, copied from t_forcerec */ + int xstride; /* stride for a coordinate in x (usually 3 or 4) */ + int fstride; /* stride for a coordinate in f (usually 3 or 4) */ + private: + gmx::HostVector<real> x_; /* x and possibly q, size natoms*xstride */ + + public: + // Masks for handling exclusions in the SIMD kernels + const SimdMasks simdMasks; + + /* Output data */ + std::vector<nbnxn_atomdata_output_t> out; /* Output data structures, 1 per thread */ + + /* Reduction related data */ + gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */ + nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc.
*/ + gmx_bool bUseTreeReduce; /* Use tree for force reduction */ + tMPI_Atomic *syncStep; /* Synchronization step for tree reduce */ +}; #endif diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp index 5b7efff423..4a5d6da7bc 100644 --- a/src/gromacs/mdlib/nbnxn_search.cpp +++ b/src/gromacs/mdlib/nbnxn_search.cpp @@ -1772,7 +1772,9 @@ static void make_fep_list(const nbnxn_search *nbs, reallocate_nblist(nlist); } - ngid = nbat->nenergrp; + const nbnxn_atomdata_t::Params &nbatParams = nbat->params(); + + ngid = nbatParams.nenergrp; if (ngid*jGrid.na_cj > gmx::index(sizeof(gid_cj)*8)) { @@ -1780,8 +1782,8 @@ static void make_fep_list(const nbnxn_search *nbs, iGrid.na_c, jGrid.na_cj, (sizeof(gid_cj)*8)/jGrid.na_cj); } - egp_shift = nbat->neg_2log; - egp_mask = (1<<nbat->neg_2log) - 1; + egp_shift = nbatParams.neg_2log; + egp_mask = (1 << egp_shift) - 1; /* Loop over the atoms in the i sub-cell */ bFEP_i_all = TRUE; @@ -1811,7 +1813,7 @@ static void make_fep_list(const nbnxn_search *nbs, if (ngid > 1) { - gid_i = (nbat->energrp[ci] >> (egp_shift*i)) & egp_mask; + gid_i = (nbatParams.energrp[ci] >> (egp_shift*i)) & egp_mask; } for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++) { @@ -1826,7 +1828,7 @@ static void make_fep_list(const nbnxn_search *nbs, fep_cj = jGrid.fep[cjr]; if (ngid > 1) { - gid_cj = nbat->energrp[cja]; + gid_cj = nbatParams.energrp[cja]; } } else if (2*jGrid.na_cj == jGrid.na_c) { @@ -1836,7 +1838,7 @@ static void make_fep_list(const nbnxn_search *nbs, fep_cj = (jGrid.fep[cjr>>1] >> ((cjr&1)*jGrid.na_cj)) & ((1<<jGrid.na_cj) - 1); if (ngid > 1) { - gid_cj = nbat->energrp[cja>>1] >> ((cja&1)*jGrid.na_cj*egp_shift) & ((1<<(jGrid.na_cj*egp_shift)) - 1); + gid_cj = nbatParams.energrp[cja>>1] >> ((cja&1)*jGrid.na_cj*egp_shift) & ((1<<(jGrid.na_cj*egp_shift)) - 1); } } else { @@ -1846,7 +1848,7 @@ static void make_fep_list(const nbnxn_search *nbs, fep_cj = jGrid.fep[cjr*2] + (jGrid.fep[cjr*2+1] << jGrid.na_c); if (ngid > 1) { - gid_cj = nbat->energrp[cja*2] + (nbat->energrp[cja*2+1] << (jGrid.na_c*egp_shift)); + gid_cj = nbatParams.energrp[cja*2] + (nbatParams.energrp[cja*2+1] << (jGrid.na_c*egp_shift)); } } @@ -2001,9 +2003,9 @@ static void make_fep_list(const nbnxn_search *nbs, bFEP_i = ((iGrid.fep[c_abs - iGrid.cell0*c_gpuNumClusterPerCell] & (1 << i)) != 0u); - xi = nbat->x[ind_i*nbat->xstride+XX] + shx; - yi = nbat->x[ind_i*nbat->xstride+YY] + shy; - zi = nbat->x[ind_i*nbat->xstride+ZZ] + shz; + xi = nbat->x()[ind_i*nbat->xstride+XX] + shx; + yi = nbat->x()[ind_i*nbat->xstride+YY] + shy; + zi = nbat->x()[ind_i*nbat->xstride+ZZ] + shz; if ((nlist->nrj + cj4_ind_end - cj4_ind_start)*c_nbnxnGpuJgroupSize*nbl->na_cj > nlist->maxnrj) { @@ -2052,9 +2054,9 @@ static void make_fep_list(const nbnxn_search *nbs, excl_pair = a_mod_wj(j)*nbl->na_ci + i; excl_bit = (1U << (gcj*c_gpuNumClusterPerCell + c)); - dx = nbat->x[ind_j*nbat->xstride+XX] - xi; - dy = nbat->x[ind_j*nbat->xstride+YY] - yi; - dz = nbat->x[ind_j*nbat->xstride+ZZ] - zi; + dx = nbat->x()[ind_j*nbat->xstride+XX] - xi; + dy = nbat->x()[ind_j*nbat->xstride+YY] - yi; + dz = nbat->x()[ind_j*nbat->xstride+ZZ] - zi; /* The unpruned GPU list has more than 2/3 * of the atom pairs beyond rlist.
Using @@ -3323,7 +3325,7 @@ static void makeClusterListWrapper(NbnxnPairlistCpu *nbl, makeClusterListSimple(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, - nbat->x, + nbat->x().data(), rlist2, rbb2, numDistanceChecks); break; @@ -3332,7 +3334,7 @@ static void makeClusterListWrapper(NbnxnPairlistCpu *nbl, makeClusterListSimd4xn(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, - nbat->x, + nbat->x().data(), rlist2, rbb2, numDistanceChecks); break; @@ -3342,7 +3344,7 @@ static void makeClusterListWrapper(NbnxnPairlistCpu *nbl, makeClusterListSimd2xnn(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, - nbat->x, + nbat->x().data(), rlist2, rbb2, numDistanceChecks); break; @@ -3368,7 +3370,7 @@ static void makeClusterListWrapper(NbnxnPairlistGpu *nbl, make_cluster_list_supersub(iGrid, jGrid, nbl, ci, cj, excludeSubDiagonal, - nbat->xstride, nbat->x, + nbat->xstride, nbat->x().data(), rlist2, rbb2, numDistanceChecks); } @@ -3739,7 +3741,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, nbl->work); icell_set_x(cell0_i+ci, shx, shy, shz, - nbat->xstride, nbat->x, + nbat->xstride, nbat->x().data(), nb_kernel_type, nbl->work); @@ -4265,11 +4267,11 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, fprintf(debug, "ns making %d nblists\n", nnbl); } - nbat->bUseBufferFlags = (nbat->nout > 1); + nbat->bUseBufferFlags = (nbat->out.size() > 1); /* We should re-init the flags before making the first list */ if (nbat->bUseBufferFlags && LOCAL_I(iloc)) { - init_buffer_flags(&nbat->buffer_flags, nbat->natoms); + init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms()); } int nzi; @@ -4360,7 +4362,7 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, */ if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0))) { - init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms); + init_buffer_flags(&nbs->work[th].buffer_flags, nbat->numAtoms()); } if (CombineNBLists && th > 0)
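Taken together, the call-site changes in this patch follow one pattern: direct struct fields become accessors (nbat->natoms becomes nbat->numAtoms(), nbat->x becomes nbat->x().data(), nbat->ntype becomes nbat->params().numTypes), and raw pointers come from .data() on the owning vectors. A simplified, self-contained sketch of that access style (names and signatures illustrative, not the real class):

    #include <vector>

    namespace sketch
    {

    struct Params
    {
        int              numTypes = 0;
        std::vector<int> type;
    };

    class AtomData
    {
        public:
            // Read-only access to the parameters, as with nbat->params()
            const Params &params() const { return params_; }
            // Atom count derived from the coordinate buffer, as with numAtoms()
            int numAtoms() const { return static_cast<int>(x_.size())/xstride; }
            // Coordinate buffer accessor, as with nbat->x()
            const std::vector<float> &x() const { return x_; }

            int xstride = 4; // XYZQ-style layout

        private:
            Params             params_;
            std::vector<float> x_;
    };

    } // namespace sketch

    // Old style: const real *x = nbat->x; loop to nbat->natoms.
    // New style, as used throughout this patch:
    static float sumX(const sketch::AtomData &nbat)
    {
        const float *x   = nbat.x().data();       // nbat->x().data() in the patch
        float        sum = 0;
        for (int i = 0; i < nbat.numAtoms(); i++) // nbat->numAtoms() in the patch
        {
            sum += x[i*nbat.xstride];
        }
        return sum;
    }

    int main()
    {
        sketch::AtomData nbat;
        return static_cast<int>(sumX(nbat));
    }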