-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
*
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2012, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
+#include "gmxpre.h"
+
+#include "nbnxn_cuda.h"
+
+#include "config.h"
-#include <stdlib.h>
#include <assert.h>
+#include <stdlib.h>
#if defined(_MSVC)
#include <limits>
#include <cuda.h>
-#include "types/simple.h"
-#include "types/nbnxn_pairlist.h"
-#include "types/nb_verlet.h"
-#include "types/ishift.h"
-#include "types/force_flags.h"
-#include "../nbnxn_consts.h"
-
#ifdef TMPI_ATOMICS
#include "thread_mpi/atomic.h"
#endif
+#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+
#include "nbnxn_cuda_types.h"
-#include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda.h"
-#include "nbnxn_cuda_data_mgmt.h"
#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
#define USE_TEXOBJ
#endif

/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
texture<float, 1, cudaReadModeElementType> nbfp_texref;

/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;

/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;

/* Convenience defines */
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE         (NBNXN_GPU_CLUSTER_SIZE)
/***** The kernels come here *****/
-#include "nbnxn_cuda_kernel_utils.cuh"
-
-/* Generate all combinations of kernels through multiple inclusion:
- F, F + E, F + prune, F + E + prune. */
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
+
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ */
/** Force only **/
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL
/* do we exceed the grid x dimension limit? */
if (nwork_units > max_grid_x_size)
{
- gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
+ gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
              "The number of nonbonded work units (=number of super-clusters) exceeds the "
"maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
}
/* Constant arrays listing all kernel function pointers and enabling selection
   of a kernel in an elegant manner. */

/*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 *  electrostatics and VDW type.
 *
 *  Note that the row- and column-order of function pointers has to match the
 *  order of corresponding enumerated electrostatics and vdw types, resp.,
 *  defined in nbnxn_cuda_types.h.
 */

/*! Force-only kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
{
    { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
    { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
};

/*! Force + energy kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
{
    { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda            },
    { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda             },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda        },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda             },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda      }
};

/*! Force + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
{
    { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda            },
    { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda             },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda        },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda             },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda      }
};

/*! Force + energy + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
{
    { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
    { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
};
/*! Return a pointer to the kernel flavor (indexed by electrostatics type,
 *  VDW type, energy output and pair-list pruning) to be executed at the
 *  current step. The indices must be valid values of the eelCu*/evdwCu*
 *  enums defined in nbnxn_cuda_types.h. */
static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
                                                       int  evdwtype,
                                                       bool bDoEne,
                                                       bool bDoPrune)
{
    nbnxn_cu_kfunc_ptr_t res;

    assert(eeltype < eelCuNR);
    /* Fix: the vdw index must be checked against the vdw enum count
     * (evdwCuNR), not the electrostatics enum count (eelCuNR). */
    assert(evdwtype < evdwCuNR);

    if (bDoEne)
    {
        if (bDoPrune)
        {
            res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
        }
        else
        {
            res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
        }
    }
    else
    {
        if (bDoPrune)
        {
            res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
        }
        else
        {
            res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
        }
    }

    return res;
}
/*! Calculates the amount of shared memory required by the CUDA kernel in use.
 *
 *  Shared memory is used for i-atom x+q pre-loading, per-warp j-cluster
 *  index staging and, on pre-sm_30 hardware only, force-reduction buffers.
 */
static inline int calc_shmem_required()
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */

    /* i-atom x+q in shared memory */
    shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
    /* cj in shared memory, for both warps separately */
    shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
#ifdef IATYPE_SHMEM
    /* i-atom types in shared memory */
    shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
#endif
#if __CUDA_ARCH__ < 300
    /* force reduction buffers in shared memory */
    shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
#endif

    return shmem;
}
-/*! As we execute nonbonded workload in separate streams, before launching
+/*! As we execute nonbonded workload in separate streams, before launching
    the kernel we need to make sure that the following operations have completed:
- atomdata allocation and related H2D transfers (every nstlist step);
- pair list H2D transfer (every nstlist step);
However, for the sake of having a future-proof implementation, we use the
misc_ops_done event to record the point in time when the above operations
are finished and synchronize with this event in the non-local stream.
-*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
+ */
+void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
const nbnxn_atomdata_t *nbatom,
- int flags,
- int iloc)
+ int flags,
+ int iloc)
{
- cudaError_t stat;
- int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+ cudaError_t stat;
+ int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* CUDA kernel launch-related stuff */
- int shmem, nblock;
- dim3 dim_block, dim_grid;
+ int shmem, nblock;
+ dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_nbparam_t *nbp = cu_nb->nbparam;
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *t = cu_nb->timers;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = cu_nb->atdat;
+ cu_nbparam_t *nbp = cu_nb->nbparam;
+ cu_plist_t *plist = cu_nb->plist[iloc];
+ cu_timers_t *t = cu_nb->timers;
+ cudaStream_t stream = cu_nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- bool bDoTime = cu_nb->bDoTime;
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bDoTime = cu_nb->bDoTime;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* HtoD x, q */
cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
- adat_len * sizeof(*adat->xq), stream);
+ adat_len * sizeof(*adat->xq), stream);
if (bDoTime)
{
}
/* get the pointer to the kernel flavor we need to use */
- nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
+ nb_kernel = select_nbnxn_kernel(nbp->eeltype,
+ nbp->vdwtype,
+ bCalcEner,
plist->bDoPrune || always_prune);
/* kernel launch config */
nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
dim_block = dim3(CL_SIZE, CL_SIZE, 1);
dim_grid = dim3(nblock, 1, 1);
- shmem = calc_shmem_required(cu_nb->kernel_ver);
+ shmem = calc_shmem_required();
if (debug)
{
NCL_PER_SUPERCL, plist->na_c);
}
- nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
+ nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
CU_LAUNCH_ERR("k_calc_nb");
if (bDoTime)
}
}
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
const nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc)
+ int flags,
+ int aloc)
{
cudaError_t stat;
- int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
- int iloc = -1;
+ int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
+ int iloc = -1;
/* determine interaction locality from atom locality */
if (LOCAL_A(aloc))
gmx_incons(stmp);
}
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_timers_t *t = cu_nb->timers;
- bool bDoTime = cu_nb->bDoTime;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = cu_nb->atdat;
+ cu_timers_t *t = cu_nb->timers;
+ bool bDoTime = cu_nb->bDoTime;
+ cudaStream_t stream = cu_nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch copy-back if there was no work to do */
if (cu_nb->plist[iloc]->nsci == 0)
*(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
}
- /* With DD the local D2H transfer can only start after the non-local
+ /* With DD the local D2H transfer can only start after the non-local
has been launched. */
if (iloc == eintLocal && cu_nb->bUseTwoStreams)
{
}
/* DtoH f */
- cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
+ cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
(adat_len)*sizeof(*adat->f), stream);
/* After the non-local D2H is launched the nonlocal_done event can be
* polling wait for the GPU.
*/
static inline bool atomic_cas(volatile unsigned int *ptr,
- unsigned int oldval,
- unsigned int newval)
+ unsigned int oldval,
+ unsigned int newval)
{
assert(ptr);
real *e_lj, real *e_el, rvec *fshift)
{
/* NOTE: only implemented for single-precision at this time */
- cudaError_t stat;
- int i, adat_end, iloc = -1;
+ cudaError_t stat;
+ int i, adat_end, iloc = -1;
volatile unsigned int *poll_word;
/* determine interaction locality from atom locality */
gmx_incons(stmp);
}
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *timers = cu_nb->timers;
- wallclock_gpu_t *timings = cu_nb->timings;
- nb_staging nbst = cu_nb->nbst;
+ cu_plist_t *plist = cu_nb->plist[iloc];
+ cu_timers_t *timers = cu_nb->timers;
+ wallclock_gpu_t *timings = cu_nb->timings;
+ nb_staging nbst = cu_nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
- bCalcEner = (bCalcEner || always_ener) && !never_ener;
+ bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* don't launch wait/update timers & counters if there was no work to do
stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
}
- else
+ else
{
/* Busy-wait until we get the signal pattern set in last byte
* of the l/nl float vector. This pattern corresponds to a floating
* The polling uses atomic compare-exchange.
*/
poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
- while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
+ while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern))
+ {
+ }
}
/* timing data accumulation */
/* X/q H2D and F D2H timings */
timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
- timers->stop_nb_h2d[iloc]);
+ timers->stop_nb_h2d[iloc]);
timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
- timers->stop_nb_d2h[iloc]);
+ timers->stop_nb_d2h[iloc]);
/* only count atdat and pair-list H2D at pair-search step */
if (plist->bDoPrune)
{
timings->pl_h2d_c++;
timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
- timers->stop_atdat);
+ timers->stop_atdat);
}
timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
- timers->stop_pl_h2d[iloc]);
+ timers->stop_pl_h2d[iloc]);
}
}
}
/*! Return the reference to the nbfp texture (LJ C6/C12 parameters). */
const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
{
    return nbfp_texref;
}

/*! Return the reference to the nbfp_comb texture (LJ-PME combination-rule
 *  parameters). */
const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
{
    return nbfp_comb_texref;
}

/*! Return the reference to the coulomb_tab texture (Ewald coulomb force
 *  table). */
const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
{
    return coulomb_tab_texref;
}
/*! Set up the cache configuration for the non-bonded kernels,
cudaError_t stat;
for (int i = 0; i < eelCuNR; i++)
- for (int j = 0; j < nEnergyKernelTypes; j++)
- for (int k = 0; k < nPruneKernelTypes; k++)
+ {
+ for (int j = 0; j < evdwCuNR; j++)
+ {
+ if (devinfo->prop.major >= 3)
{
- /* Legacy kernel 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
- CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
-
- if (devinfo->prop.major >= 3)
- {
- /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
- }
- else
- {
- /* On Fermi prefer L1 gives 2% higher performance */
- /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
- }
- CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
}
+ else
+ {
+ /* On Fermi prefer L1 gives 2% higher performance */
+ /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ }
+ CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ }
+ }
}