# includes: Nothing to build, just installation
file(GLOB ROOT_LEGACY_HEADERS *.h)
file(GLOB ROOT_LEGACY_HEADERS_PRIVATE thread_mpi.h tmpi.h gmx_hash.h
- gmx_ga2la.h gpu_utils.h pmalloc_cuda.h nbnxn_cuda_data_mgmt.h)
+ gmx_ga2la.h gpu_utils.h pmalloc_cuda.h)
file(GLOB TYPES_LEGACY_HEADERS types/*.h)
file(GLOB TYPES_LEGACY_HEADERS_PRIVATE types/commrec.h)
list(REMOVE_ITEM ROOT_LEGACY_HEADERS ${ROOT_LEGACY_HEADERS_PRIVATE})
* message to fplog/stderr.
*/
-gmx_bool uses_simple_tables(int cutoff_scheme,
- nonbonded_verlet_t *nbv,
- int group);
+gmx_bool uses_simple_tables(int cutoff_scheme,
+ struct nonbonded_verlet_t *nbv,
+ int group);
/* Returns whether simple tables (i.e. not for use with GPUs) are used
* with the type of kernel indicated.
*/
float *cycles_pme);
/* Call all the force routines */
+void free_gpu_resources(const t_forcerec *fr,
+ const t_commrec *cr);
+
#ifdef __cplusplus
}
#endif
t_inputrec *inputrec,
t_nrnb nrnb[], gmx_wallcycle_t wcycle,
gmx_walltime_accounting_t walltime_accounting,
- wallclock_gpu_t *gputimes,
+ struct nonbonded_verlet_t *nbv,
gmx_bool bWriteStat);
void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr);
gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags);
/* Routine in sim_util.c */
+gmx_bool use_GPU(const struct nonbonded_verlet_t *nbv);
+
#ifdef __cplusplus
}
#endif
#include "types/inputrec.h"
#include "types/nrnb.h"
#include "types/nblist.h"
-#include "types/nbnxn_pairlist.h"
#include "types/nsgrid.h"
#include "types/forcerec.h"
#include "types/fcdata.h"
#include "genborn.h"
#include "qmmmrec.h"
#include "../../topology/idef.h"
-#include "nb_verlet.h"
#include "interaction_const.h"
#include "hw_info.h"
/* Abstract type for PME that is defined only in the routine that use them. */
typedef struct gmx_pme *gmx_pme_t;
-
-
+struct nonbonded_verlet_t;
/* Structure describing the data in a single table */
typedef struct
rvec *shift_vec;
/* The neighborlists including tables */
- int nnblists;
- int *gid2nblists;
- t_nblists *nblists;
+ int nnblists;
+ int *gid2nblists;
+ t_nblists *nblists;
- int cutoff_scheme; /* group- or Verlet-style cutoff */
- gmx_bool bNonbonded; /* true if nonbonded calculations are *not* turned off */
- nonbonded_verlet_t *nbv;
+ int cutoff_scheme; /* group- or Verlet-style cutoff */
+ gmx_bool bNonbonded; /* true if nonbonded calculations are *not* turned off */
+ struct nonbonded_verlet_t *nbv;
/* The wall tables (if used) */
int nwall;
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/imd/imd.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "types/nbnxn_cuda_types_ext.h"
#include "gpu_utils.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "pmalloc_cuda.h"
+#include "nb_verlet.h"
t_forcerec *mk_forcerec(void)
{
fr->excl_load[t] = i;
}
}
+
+/* Releases the nbnxn data held in GPU memory and then destroys the CUDA
+ * context of this rank.
+ *
+ * This function may be called on every rank, whether or not GPUs are used
+ * in this run (PME ranks have no knowledge of that): any rank that is not
+ * a PP rank with a GPU-enabled nonbonded_verlet_t returns immediately.
+ */
+void free_gpu_resources(const t_forcerec *fr,
+                        const t_commrec  *cr)
+{
+    char gpu_err_str[STRLEN];
+
+    /* Only PP ranks that actually run non-bondeds on a GPU own resources. */
+    if (!(cr->duty & DUTY_PP) || !fr || !fr->nbv || !fr->nbv->bUseGPU)
+    {
+        return;
+    }
+
+    /* free nbnxn data in GPU memory */
+    nbnxn_cuda_free(fr->nbv->cu_nbv);
+
+    /* With tMPI some ranks may be sharing a GPU and its context, so all of
+     * them must finish deallocation before the context is destroyed in
+     * free_gpu().
+     * Note: as only PP ranks need to free GPU resources, it is safe to
+     * not call the barrier on PME ranks.
+     */
+#ifdef GMX_THREAD_MPI
+    if (PAR(cr))
+    {
+        gmx_barrier(cr);
+    }
+#endif /* GMX_THREAD_MPI */
+
+    /* uninitialize GPU (by destroying the context) */
+    if (!free_gpu(gpu_err_str))
+    {
+        gmx_warning("On rank %d failed to free GPU #%d: %s",
+                    cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+    }
+}
#define NB_VERLET_H
#include "nbnxn_pairlist.h"
-#include "nbnxn_cuda_types_ext.h"
+#include "types/nbnxn_cuda_types_ext.h"
#ifdef __cplusplus
extern "C" {
enbvClearFNo, enbvClearFYes
};
-typedef struct {
+typedef struct nonbonded_verlet_group_t {
nbnxn_pairlist_set_t nbl_lists; /* pair list(s) */
nbnxn_atomdata_t *nbat; /* atom data */
int kernel_type; /* non-bonded kernel - see enum above */
} nonbonded_verlet_group_t;
/* non-bonded data structure with Verlet-type cut-off */
-typedef struct {
+typedef struct nonbonded_verlet_t {
nbnxn_search_t nbs; /* n vs n atom pair searching data */
int ngrp; /* number of interaction groups */
nonbonded_verlet_group_t grp[2]; /* local and non-local interaction group */
#include "gmx_omp_nthreads.h"
#include "thread_mpi/atomic.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/smalloc.h"
#define _nbnxn_atomdata_h
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#ifdef __cplusplus
extern "C" {
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014 by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define NBNXN_GPU_CLUSTER_SIZE 8
/* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
- * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ * To change this, also change nbnxn_cj4_t in gromacs/mdlib/nbnxn_pairlist.h.
*/
#define NBNXN_GPU_JGROUP_SIZE 4
#define NBNXN_GPU_JGROUP_SIZE_2LOG 2
#include <cuda.h>
#include "types/simple.h"
-#include "types/nbnxn_pairlist.h"
-#include "types/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "types/force_flags.h"
#include "../nbnxn_consts.h"
#include "nbnxn_cuda_types.h"
#include "../../gmxlib/cuda_tools/cudautils.cuh"
#include "nbnxn_cuda.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/cstringutil.h"
#include "tables.h"
#include "typedefs.h"
#include "types/enums.h"
-#include "types/nb_verlet.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "types/interaction_const.h"
#include "types/force_flags.h"
#include "../nbnxn_consts.h"
#include "nbnxn_cuda_types.h"
#include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "pmalloc_cuda.h"
#include "gpu_utils.h"
/*! Re-generate the GPU Ewald force table, resets rlist, and update the
* electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
- const interaction_const_t *ic)
+void nbnxn_cuda_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
+ const interaction_const_t *ic)
{
- cu_nbparam_t *nbp = cu_nb->nbparam;
+ if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_CUDA)
+ {
+ return;
+ }
+ nbnxn_cuda_ptr_t cu_nb = nbv->cu_nbv;
+ cu_nbparam_t *nbp = cu_nb->nbparam;
set_cutoff_parameters(nbp, ic);
return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
}
-void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
+/* Re-initializes the GPU timing counters attached to nbv.
+ *
+ * nbv may be NULL: reset_all_counters() calls this unconditionally and is
+ * handed NULL when no GPU is in use, so a missing struct or a missing
+ * cu_nbv is treated as "nothing to reset". Counters are only touched when
+ * timing is enabled (bDoTime).
+ */
+void nbnxn_cuda_reset_timings(nonbonded_verlet_t* nbv)
{
- if (cu_nb->bDoTime)
+ if (nbv && nbv->cu_nbv && nbv->cu_nbv->bDoTime)
{
- init_timings(cu_nb->timings);
+ init_timings(nbv->cu_nbv->timings);
}
}
#include "types/interaction_const.h"
#include "types/nbnxn_cuda_types_ext.h"
#include "types/hw_info.h"
-#include "types/nb_verlet.h"
#ifdef GMX_GPU
#define FUNC_TERM ;
extern "C" {
#endif
+struct nonbonded_verlet_group_t;
+struct nbnxn_pairlist_t;
+struct nbnxn_atomdata_t;
+
/** Initializes the data structures related to CUDA nonbonded calculations. */
FUNC_QUALIFIER
void nbnxn_cuda_init(FILE gmx_unused *fplog,
/** Initializes simulation constant data. */
FUNC_QUALIFIER
-void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const interaction_const_t gmx_unused *ic,
- const nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM
+void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t gmx_unused cu_nb,
+ const interaction_const_t gmx_unused *ic,
+ const struct nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM
/** Initializes pair-list data for GPU, called at every pair search step. */
FUNC_QUALIFIER
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const nbnxn_pairlist_t gmx_unused *h_nblist,
- int gmx_unused iloc) FUNC_TERM
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t gmx_unused cu_nb,
+ const struct nbnxn_pairlist_t gmx_unused *h_nblist,
+ int gmx_unused iloc) FUNC_TERM
/** Initializes atom-data on the GPU, called at every pair search step. */
FUNC_QUALIFIER
-void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM
+void nbnxn_cuda_init_atomdata(const nbnxn_cuda_ptr_t gmx_unused cu_nb,
+ const struct nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM
/*! \brief Update parameters during PP-PME load balancing. */
FUNC_QUALIFIER
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const interaction_const_t gmx_unused *ic) FUNC_TERM
+void nbnxn_cuda_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
+ const interaction_const_t gmx_unused *ic) FUNC_TERM
/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
FUNC_QUALIFIER
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t gmx_unused cu_nb,
+ const struct nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM
/** Clears GPU outputs: nonbonded force, shift force and energy. */
FUNC_QUALIFIER
/** Resets nonbonded GPU timings. */
FUNC_QUALIFIER
-void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t gmx_unused cu_nb) FUNC_TERM
+void nbnxn_cuda_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) FUNC_TERM
/** Calculates the minimum size of proximity lists to improve SM load balance
* with CUDA non-bonded kernels. */
#define NBNXN_CUDA_TYPES_H
#include "types/interaction_const.h"
-#include "types/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "types/nbnxn_cuda_types_ext.h"
#include "../../gmxlib/cuda_tools/cudautils.cuh"
#include "nbnxn_simd.h"
#include "domdec.h"
#include "gromacs/timing/cyclecounter.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
/* Bounding box calculations are (currently) always in single precision, so
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define _nbnxn_kernel_common_h
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#ifdef __cplusplus
extern "C" {
#include "nbnxn_kernel_gpu_ref.h"
#include "../nbnxn_consts.h"
#include "nbnxn_kernel_common.h"
+#include "gromacs/mdlib/nb_verlet.h"
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define _nbnxn_kernel_gpu_ref_h
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#ifdef __cplusplus
extern "C" {
#include "nbnxn_kernel_ref.h"
#include "../nbnxn_consts.h"
#include "nbnxn_kernel_common.h"
+#include "gromacs/mdlib/nb_verlet.h"
/*! \brief Typedefs for declaring lookup tables of kernel functions.
*/
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define _nbnxn_kernel_ref_h
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#ifdef __cplusplus
extern "C" {
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/mdlib/nbnxn_simd.h"
#ifdef GMX_NBNXN_SIMD_2XNN
#include "typedefs.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_simd.h"
#ifdef __cplusplus
#include "typedefs.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_simd.h"
#ifdef GMX_NBNXN_SIMD_4XN
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/mdlib/nbnxn_simd.h"
#ifdef __cplusplus
#ifndef _nbnxn_pairlist_h
#define _nbnxn_pairlist_h
-#include "nblist.h"
+#include "thread_mpi/atomic.h"
+#include "types/nblist.h"
#ifdef __cplusplus
extern "C" {
*/
} nbnxn_excl_t;
-typedef struct {
+typedef struct nbnxn_pairlist_t {
gmx_cache_protect_t cp0;
nbnxn_alloc_t *alloc;
ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR
};
-/* TODO: Remove need for forward declare */
-struct tMPI_Atomic;
-
-typedef struct {
+typedef struct nbnxn_atomdata_t {
nbnxn_alloc_t *alloc;
nbnxn_free_t *free;
int ntype; /* The number of different atom types */
gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */
nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. */
gmx_bool bUseTreeReduce; /* Use tree for force reduction */
- struct tMPI_Atomic *syncStep; /* Synchronization step for tree reduce */
+ tMPI_Atomic_t *syncStep; /* Synchronization step for tree reduce */
} nbnxn_atomdata_t;
#ifdef __cplusplus
#include "ns.h"
#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/smalloc.h"
#define _nbnxn_search_h
#include "typedefs.h"
+#include "nbnxn_pairlist.h"
#ifdef __cplusplus
extern "C" {
#include "../gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/timing/wallcycle.h"
#include "gmx_omp_nthreads.h"
-#include "nbnxn_cuda_data_mgmt.h"
#include "nbnxn_cuda/nbnxn_cuda.h"
+#include "nb_verlet.h"
+
void print_time(FILE *out,
gmx_walltime_accounting_t walltime_accounting,
gmx_int64_t step,
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
+/* Tells whether the non-bonded Verlet data requests GPU execution.
+ * A NULL nbv is accepted and yields false.
+ */
+gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
+{
+    /* The NULL test comes first, so nbv is never dereferenced when absent. */
+    return !(nbv == NULL || !nbv->bUseGPU);
+}
+
void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
t_inputrec *inputrec,
gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
t_inputrec *inputrec,
t_nrnb nrnb[], gmx_wallcycle_t wcycle,
gmx_walltime_accounting_t walltime_accounting,
- wallclock_gpu_t *gputimes,
+ nonbonded_verlet_t *nbv,
gmx_bool bWriteStat)
{
int i, j;
if (SIMMASTER(cr))
{
+ wallclock_gpu_t* gputimes = use_GPU(nbv) ?
+ nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL;
wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
elapsed_time_over_all_ranks,
wcycle, gputimes);
#include "membed.h"
#include "types/nlistheuristics.h"
#include "types/iteratedconstraints.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "gromacs/fileio/confio.h"
#include "gromacs/fileio/mdoutf.h"
gmx_int64_t *step_rel, t_inputrec *ir,
gmx_wallcycle_t wcycle, t_nrnb *nrnb,
gmx_walltime_accounting_t walltime_accounting,
- nbnxn_cuda_ptr_t cu_nbv)
+ struct nonbonded_verlet_t *nbv)
{
char sbuf[STEPSTRSIZE];
md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
gmx_step_str(step, sbuf));
- if (cu_nbv)
- {
- nbnxn_cuda_reset_timings(cu_nbv);
- }
+ nbnxn_cuda_reset_timings(nbv);
wallcycle_stop(wcycle, ewcRUN);
wallcycle_reset_all(wcycle);
*/
if ((Flags & MD_TUNEPME) &&
EEL_PME(fr->eeltype) &&
- ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+ ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
!bRerunMD)
{
pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
{
/* Reset all the counters related to performance over the run */
reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
- fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+ use_GPU(fr->nbv) ? fr->nbv : NULL);
wcycle_set_reset_counters(wcycle, -1);
if (!(cr->duty & DUTY_PME))
{
if (pme_loadbal != NULL)
{
pme_loadbal_done(pme_loadbal, cr, fplog,
- fr->nbv != NULL && fr->nbv->bUseGPU);
+ use_GPU(fr->nbv));
}
if (shellfc && fplog)
#include "calcgrid.h"
#include "pme.h"
#include "domdec.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "force.h"
#include "macros.h"
#include "md_logging.h"
#include "pme_loadbal.h"
#include "gromacs/math/vec.h"
+#include "gromacs/legacyheaders/sim_util.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/smalloc.h"
pme_lb->cur = pme_lb->start - 1;
}
-gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
- t_commrec *cr,
- FILE *fp_err,
- FILE *fp_log,
- t_inputrec *ir,
- t_state *state,
- double cycles,
- interaction_const_t *ic,
- nonbonded_verlet_t *nbv,
- gmx_pme_t *pmedata,
- gmx_int64_t step)
+gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
+ t_commrec *cr,
+ FILE *fp_err,
+ FILE *fp_log,
+ t_inputrec *ir,
+ t_state *state,
+ double cycles,
+ interaction_const_t *ic,
+ struct nonbonded_verlet_t *nbv,
+ gmx_pme_t *pmedata,
+ gmx_int64_t step)
{
gmx_bool OK;
pme_setup_t *set;
}
bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
- if (pme_lb->cutoff_scheme == ecutsVERLET &&
- nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
- {
- nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
-
- /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
- * also sharing texture references. To keep the code simple, we don't
- * treat texture references as shared resources, but this means that
- * the coulomb_tab texture ref will get updated by multiple threads.
- * Hence, to ensure that the non-bonded kernels don't start before all
- * texture binding operations are finished, we need to wait for all ranks
- * to arrive here before continuing.
- *
- * Note that we could omit this barrier if GPUs are not shared (or
- * texture objects are used), but as this is initialization code, there
- * is not point in complicating things.
- */
+ nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab texture ref will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used), but as this is initialization code, there
+ * is no point in complicating things.
+ */
#ifdef GMX_THREAD_MPI
- if (PAR(cr))
- {
- gmx_barrier(cr);
- }
-#endif /* GMX_THREAD_MPI */
+ if (PAR(cr) && use_GPU(nbv))
+ {
+ gmx_barrier(cr);
}
+#endif /* GMX_THREAD_MPI */
/* Usually we won't need the simple tables with GPUs.
* But we do with hybrid acceleration and with free energy.
* factors as well as DD load balancing.
* Returns TRUE the load balancing continues, FALSE is the balancing is done.
*/
-gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
- t_commrec *cr,
- FILE *fp_err,
- FILE *fp_log,
- t_inputrec *ir,
- t_state *state,
- double cycles,
- interaction_const_t *ic,
- nonbonded_verlet_t *nbv,
- gmx_pme_t *pmedata,
- gmx_int64_t step);
+gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
+ t_commrec *cr,
+ FILE *fp_err,
+ FILE *fp_log,
+ t_inputrec *ir,
+ t_state *state,
+ double cycles,
+ interaction_const_t *ic,
+ struct nonbonded_verlet_t *nbv,
+ gmx_pme_t *pmedata,
+ gmx_int64_t step);
/* Restart the PME load balancing discarding all timings gathered up till now */
void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n);
#endif
#include "gpu_utils.h"
-#include "nbnxn_cuda_data_mgmt.h"
typedef struct {
gmx_integrator_t *func;
}
}
-/* Frees GPU memory and destroys the CUDA context.
- *
- * Note that this function needs to be called even if GPUs are not used
- * in this run because the PME ranks have no knowledge of whether GPUs
- * are used or not, but all ranks need to enter the barrier below.
- */
-static void free_gpu_resources(const t_forcerec *fr,
- const t_commrec *cr)
-{
- gmx_bool bIsPPrankUsingGPU;
- char gpu_err_str[STRLEN];
-
- bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU;
-
- if (bIsPPrankUsingGPU)
- {
- /* free nbnxn data in GPU memory */
- nbnxn_cuda_free(fr->nbv->cu_nbv);
-
- /* With tMPI we need to wait for all ranks to finish deallocation before
- * destroying the context in free_gpu() as some ranks may be sharing
- * GPU and context.
- * Note: as only PP ranks need to free GPU resources, so it is safe to
- * not call the barrier on PME ranks.
- */
-#ifdef GMX_THREAD_MPI
- if (PAR(cr))
- {
- gmx_barrier(cr);
- }
-#endif /* GMX_THREAD_MPI */
-
- /* uninitialize GPU (by destroying the context) */
- if (!free_gpu(gpu_err_str))
- {
- gmx_warning("On rank %d failed to free GPU #%d: %s",
- cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
- }
- }
-}
-
int mdrunner(gmx_hw_opt_t *hw_opt,
FILE *fplog, t_commrec *cr, int nfile,
const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
*/
finish_run(fplog, cr,
inputrec, nrnb, wcycle, walltime_accounting,
- fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
- nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+ fr ? fr->nbv : NULL,
EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));