From: Roland Schulz Date: Tue, 20 May 2014 04:56:53 +0000 (-0400) Subject: Move some verlet headers to mdlib X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=6400ed5de8d26899d750a2652f7bf9e54125c540;p=alexxy%2Fgromacs.git Move some verlet headers to mdlib This change removes all dependencies on those (explicit) types from files outside of mdlib (the only ones were in programs/mdrun). Change-Id: Ia2b90303249e3e7454ac869afb5cbe0cf290a400 --- diff --git a/src/gromacs/legacyheaders/CMakeLists.txt b/src/gromacs/legacyheaders/CMakeLists.txt index 586797fab9..aea277ee6f 100644 --- a/src/gromacs/legacyheaders/CMakeLists.txt +++ b/src/gromacs/legacyheaders/CMakeLists.txt @@ -35,7 +35,7 @@ # includes: Nothing to build, just installation file(GLOB ROOT_LEGACY_HEADERS *.h) file(GLOB ROOT_LEGACY_HEADERS_PRIVATE thread_mpi.h tmpi.h gmx_hash.h - gmx_ga2la.h gpu_utils.h pmalloc_cuda.h nbnxn_cuda_data_mgmt.h) + gmx_ga2la.h gpu_utils.h pmalloc_cuda.h) file(GLOB TYPES_LEGACY_HEADERS types/*.h) file(GLOB TYPES_LEGACY_HEADERS_PRIVATE types/commrec.h) list(REMOVE_ITEM ROOT_LEGACY_HEADERS ${ROOT_LEGACY_HEADERS_PRIVATE}) diff --git a/src/gromacs/legacyheaders/force.h b/src/gromacs/legacyheaders/force.h index 087e5d1ecb..1d3fb6bc79 100644 --- a/src/gromacs/legacyheaders/force.h +++ b/src/gromacs/legacyheaders/force.h @@ -149,9 +149,9 @@ gmx_bool nbnxn_acceleration_supported(FILE *fplog, * message to fplog/stderr. */ -gmx_bool uses_simple_tables(int cutoff_scheme, - nonbonded_verlet_t *nbv, - int group); +gmx_bool uses_simple_tables(int cutoff_scheme, + struct nonbonded_verlet_t *nbv, + int group); /* Returns whether simple tables (i.e. not for use with GPUs) are used * with the type of kernel indicated. 
*/ @@ -285,6 +285,9 @@ extern void do_force_lowlevel(FILE *fplog, float *cycles_pme); /* Call all the force routines */ +void free_gpu_resources(const t_forcerec *fr, + const t_commrec *cr); + #ifdef __cplusplus } #endif diff --git a/src/gromacs/legacyheaders/sim_util.h b/src/gromacs/legacyheaders/sim_util.h index e21a7d278b..5a404cc400 100644 --- a/src/gromacs/legacyheaders/sim_util.h +++ b/src/gromacs/legacyheaders/sim_util.h @@ -109,7 +109,7 @@ void finish_run(FILE *log, t_commrec *cr, t_inputrec *inputrec, t_nrnb nrnb[], gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, - wallclock_gpu_t *gputimes, + struct nonbonded_verlet_t *nbv, gmx_bool bWriteStat); void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr); @@ -139,6 +139,8 @@ void init_md(FILE *fplog, gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags); /* Routine in sim_util.c */ +gmx_bool use_GPU(const struct nonbonded_verlet_t *nbv); + #ifdef __cplusplus } #endif diff --git a/src/gromacs/legacyheaders/typedefs.h b/src/gromacs/legacyheaders/typedefs.h index 242435d4e9..72661f7e95 100644 --- a/src/gromacs/legacyheaders/typedefs.h +++ b/src/gromacs/legacyheaders/typedefs.h @@ -52,7 +52,6 @@ #include "types/inputrec.h" #include "types/nrnb.h" #include "types/nblist.h" -#include "types/nbnxn_pairlist.h" #include "types/nsgrid.h" #include "types/forcerec.h" #include "types/fcdata.h" diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h index 9be9520633..16b0e87841 100644 --- a/src/gromacs/legacyheaders/types/forcerec.h +++ b/src/gromacs/legacyheaders/types/forcerec.h @@ -39,7 +39,6 @@ #include "genborn.h" #include "qmmmrec.h" #include "../../topology/idef.h" -#include "nb_verlet.h" #include "interaction_const.h" #include "hw_info.h" @@ -52,8 +51,7 @@ extern "C" { /* Abstract type for PME that is defined only in the routine that use them. 
*/ typedef struct gmx_pme *gmx_pme_t; - - +struct nonbonded_verlet_t; /* Structure describing the data in a single table */ typedef struct @@ -315,13 +313,13 @@ typedef struct { rvec *shift_vec; /* The neighborlists including tables */ - int nnblists; - int *gid2nblists; - t_nblists *nblists; + int nnblists; + int *gid2nblists; + t_nblists *nblists; - int cutoff_scheme; /* group- or Verlet-style cutoff */ - gmx_bool bNonbonded; /* true if nonbonded calculations are *not* turned off */ - nonbonded_verlet_t *nbv; + int cutoff_scheme; /* group- or Verlet-style cutoff */ + gmx_bool bNonbonded; /* true if nonbonded calculations are *not* turned off */ + struct nonbonded_verlet_t *nbv; /* The wall tables (if used) */ int nwall; diff --git a/src/gromacs/mdlib/domdec.c b/src/gromacs/mdlib/domdec.c index a70ab1c522..b94aa743fc 100644 --- a/src/gromacs/mdlib/domdec.c +++ b/src/gromacs/mdlib/domdec.c @@ -69,6 +69,7 @@ #include "gromacs/fileio/gmxfio.h" #include "gromacs/fileio/pdbio.h" #include "gromacs/imd/imd.h" +#include "gromacs/mdlib/nb_verlet.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/pbcutil/pbc.h" #include "gromacs/pulling/pull.h" diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c index f2620a9de7..031d517886 100644 --- a/src/gromacs/mdlib/forcerec.c +++ b/src/gromacs/mdlib/forcerec.c @@ -76,8 +76,9 @@ #include "types/nbnxn_cuda_types_ext.h" #include "gpu_utils.h" -#include "nbnxn_cuda_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h" #include "pmalloc_cuda.h" +#include "nb_verlet.h" t_forcerec *mk_forcerec(void) { @@ -3321,3 +3322,44 @@ void forcerec_set_excl_load(t_forcerec *fr, fr->excl_load[t] = i; } } + +/* Frees GPU memory and destroys the CUDA context. + * + * Note that this function needs to be called even if GPUs are not used + * in this run because the PME ranks have no knowledge of whether GPUs + * are used or not, but all ranks need to enter the barrier below. 
+ */ +void free_gpu_resources(const t_forcerec *fr, + const t_commrec *cr) +{ + gmx_bool bIsPPrankUsingGPU; + char gpu_err_str[STRLEN]; + + bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU; + + if (bIsPPrankUsingGPU) + { + /* free nbnxn data in GPU memory */ + nbnxn_cuda_free(fr->nbv->cu_nbv); + + /* With tMPI we need to wait for all ranks to finish deallocation before + * destroying the context in free_gpu() as some ranks may be sharing + * GPU and context. + * Note: as only PP ranks need to free GPU resources, so it is safe to + * not call the barrier on PME ranks. + */ +#ifdef GMX_THREAD_MPI + if (PAR(cr)) + { + gmx_barrier(cr); + } +#endif /* GMX_THREAD_MPI */ + + /* uninitialize GPU (by destroying the context) */ + if (!free_gpu(gpu_err_str)) + { + gmx_warning("On rank %d failed to free GPU #%d: %s", + cr->nodeid, get_current_gpu_device_id(), gpu_err_str); + } + } +} diff --git a/src/gromacs/legacyheaders/types/nb_verlet.h b/src/gromacs/mdlib/nb_verlet.h similarity index 97% rename from src/gromacs/legacyheaders/types/nb_verlet.h rename to src/gromacs/mdlib/nb_verlet.h index e1e8ab0ca8..4f51797e29 100644 --- a/src/gromacs/legacyheaders/types/nb_verlet.h +++ b/src/gromacs/mdlib/nb_verlet.h @@ -37,7 +37,7 @@ #define NB_VERLET_H #include "nbnxn_pairlist.h" -#include "nbnxn_cuda_types_ext.h" +#include "types/nbnxn_cuda_types_ext.h" #ifdef __cplusplus extern "C" { @@ -88,7 +88,7 @@ enum { enbvClearFNo, enbvClearFYes }; -typedef struct { +typedef struct nonbonded_verlet_group_t { nbnxn_pairlist_set_t nbl_lists; /* pair list(s) */ nbnxn_atomdata_t *nbat; /* atom data */ int kernel_type; /* non-bonded kernel - see enum above */ @@ -96,7 +96,7 @@ typedef struct { } nonbonded_verlet_group_t; /* non-bonded data structure with Verlet-type cut-off */ -typedef struct { +typedef struct nonbonded_verlet_t { nbnxn_search_t nbs; /* n vs n atom pair searching data */ int ngrp; /* number of interaction groups */ nonbonded_verlet_group_t grp[2]; /* 
local and non-local interaction group */ diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c index 1babe769f7..5e4dfef0c0 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.c +++ b/src/gromacs/mdlib/nbnxn_atomdata.c @@ -49,6 +49,7 @@ #include "gmx_omp_nthreads.h" #include "thread_mpi/atomic.h" +#include "gromacs/mdlib/nb_verlet.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/utility/gmxomp.h" #include "gromacs/utility/smalloc.h" diff --git a/src/gromacs/mdlib/nbnxn_atomdata.h b/src/gromacs/mdlib/nbnxn_atomdata.h index 5855e5b50d..efe86a5b3a 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.h +++ b/src/gromacs/mdlib/nbnxn_atomdata.h @@ -37,6 +37,7 @@ #define _nbnxn_atomdata_h #include "typedefs.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" #ifdef __cplusplus extern "C" { diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h index f5bd3d01ab..719e47b8f4 100644 --- a/src/gromacs/mdlib/nbnxn_consts.h +++ b/src/gromacs/mdlib/nbnxn_consts.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014 by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -63,7 +63,7 @@ extern "C" { #define NBNXN_GPU_CLUSTER_SIZE 8 /* With GPU kernels we group cluster pairs in 4 to optimize memory usage. - * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h. + * To change this, also change nbnxn_cj4_t in gromacs/mdlib/nbnxn_pairlist.h. 
*/ #define NBNXN_GPU_JGROUP_SIZE 4 #define NBNXN_GPU_JGROUP_SIZE_2LOG 2 diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 40d86e1b08..fa2eb36b1c 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -44,8 +44,8 @@ #include #include "types/simple.h" -#include "types/nbnxn_pairlist.h" -#include "types/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" +#include "gromacs/mdlib/nb_verlet.h" #include "types/force_flags.h" #include "../nbnxn_consts.h" @@ -56,7 +56,7 @@ #include "nbnxn_cuda_types.h" #include "../../gmxlib/cuda_tools/cudautils.cuh" #include "nbnxn_cuda.h" -#include "nbnxn_cuda_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/utility/cstringutil.h" diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 37679494fa..57fd906c10 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -44,7 +44,7 @@ #include "tables.h" #include "typedefs.h" #include "types/enums.h" -#include "types/nb_verlet.h" +#include "gromacs/mdlib/nb_verlet.h" #include "types/interaction_const.h" #include "types/force_flags.h" #include "../nbnxn_consts.h" @@ -52,7 +52,7 @@ #include "nbnxn_cuda_types.h" #include "../../gmxlib/cuda_tools/cudautils.cuh" -#include "nbnxn_cuda_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h" #include "pmalloc_cuda.h" #include "gpu_utils.h" @@ -426,10 +426,15 @@ static void init_nbparam(cu_nbparam_t *nbp, /*! Re-generate the GPU Ewald force table, resets rlist, and update the * electrostatic type switching to twin cut-off (or back) if needed. 
*/ -void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb, - const interaction_const_t *ic) +void nbnxn_cuda_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, + const interaction_const_t *ic) { - cu_nbparam_t *nbp = cu_nb->nbparam; + if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_CUDA) + { + return; + } + nbnxn_cuda_ptr_t cu_nb = nbv->cu_nbv; + cu_nbparam_t *nbp = cu_nb->nbparam; set_cutoff_parameters(nbp, ic); @@ -1077,11 +1082,11 @@ wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb) return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL; } -void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb) +void nbnxn_cuda_reset_timings(nonbonded_verlet_t* nbv) { - if (cu_nb->bDoTime) + if (nbv->cu_nbv && nbv->cu_nbv->bDoTime) { - init_timings(cu_nb->timings); + init_timings(nbv->cu_nbv->timings); } } diff --git a/src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h similarity index 84% rename from src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h rename to src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h index 919e1355b0..f77a91583b 100644 --- a/src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h @@ -40,7 +40,6 @@ #include "types/interaction_const.h" #include "types/nbnxn_cuda_types_ext.h" #include "types/hw_info.h" -#include "types/nb_verlet.h" #ifdef GMX_GPU #define FUNC_TERM ; @@ -54,6 +53,10 @@ extern "C" { #endif +struct nonbonded_verlet_group_t; +struct nbnxn_pairlist_t; +struct nbnxn_atomdata_t; + /** Initializes the data structures related to CUDA nonbonded calculations. */ FUNC_QUALIFIER void nbnxn_cuda_init(FILE gmx_unused *fplog, @@ -66,30 +69,30 @@ void nbnxn_cuda_init(FILE gmx_unused *fplog, /** Initializes simulation constant data. 
*/ FUNC_QUALIFIER -void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t gmx_unused cu_nb, - const interaction_const_t gmx_unused *ic, - const nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM +void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t gmx_unused cu_nb, + const interaction_const_t gmx_unused *ic, + const struct nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM /** Initializes pair-list data for GPU, called at every pair search step. */ FUNC_QUALIFIER -void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t gmx_unused cu_nb, - const nbnxn_pairlist_t gmx_unused *h_nblist, - int gmx_unused iloc) FUNC_TERM +void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t gmx_unused cu_nb, + const struct nbnxn_pairlist_t gmx_unused *h_nblist, + int gmx_unused iloc) FUNC_TERM /** Initializes atom-data on the GPU, called at every pair search step. */ FUNC_QUALIFIER -void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t gmx_unused cu_nb, - const nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM +void nbnxn_cuda_init_atomdata(const nbnxn_cuda_ptr_t gmx_unused cu_nb, + const struct nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM /*! \brief Update parameters during PP-PME load balancing. */ FUNC_QUALIFIER -void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t gmx_unused cu_nb, - const interaction_const_t gmx_unused *ic) FUNC_TERM +void nbnxn_cuda_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv, + const interaction_const_t gmx_unused *ic) FUNC_TERM /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */ FUNC_QUALIFIER -void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t gmx_unused cu_nb, - const nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM +void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t gmx_unused cu_nb, + const struct nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM /** Clears GPU outputs: nonbonded force, shift force and energy. 
*/ FUNC_QUALIFIER @@ -113,7 +116,7 @@ wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t gmx_unused cu_nb) /** Resets nonbonded GPU timings. */ FUNC_QUALIFIER -void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t gmx_unused cu_nb) FUNC_TERM +void nbnxn_cuda_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) FUNC_TERM /** Calculates the minimum size of proximity lists to improve SM load balance * with CUDA non-bonded kernels. */ diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h index 0fa40d2466..74df69eb34 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h @@ -47,7 +47,7 @@ #define NBNXN_CUDA_TYPES_H #include "types/interaction_const.h" -#include "types/nbnxn_pairlist.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" #include "types/nbnxn_cuda_types_ext.h" #include "../../gmxlib/cuda_tools/cudautils.cuh" diff --git a/src/gromacs/mdlib/nbnxn_internal.h b/src/gromacs/mdlib/nbnxn_internal.h index fc42e60935..352253ec9d 100644 --- a/src/gromacs/mdlib/nbnxn_internal.h +++ b/src/gromacs/mdlib/nbnxn_internal.h @@ -40,6 +40,7 @@ #include "nbnxn_simd.h" #include "domdec.h" #include "gromacs/timing/cyclecounter.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" /* Bounding box calculations are (currently) always in single precision, so diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h index 7855b310fe..89b365f55b 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -37,6 +37,7 @@ #define _nbnxn_kernel_common_h #include "typedefs.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" #ifdef __cplusplus extern "C" { diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c index 898d300da1..a606329e56 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c @@ -47,6 +47,7 @@ #include "nbnxn_kernel_gpu_ref.h" #include "../nbnxn_consts.h" #include "nbnxn_kernel_common.h" +#include "gromacs/mdlib/nb_verlet.h" #define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) #define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE) diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h index 18f4e9d01f..2fda7440a6 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -37,6 +37,7 @@ #define _nbnxn_kernel_gpu_ref_h #include "typedefs.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" #ifdef __cplusplus extern "C" { diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c index 4638d1b6a4..ca00dc79ac 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c @@ -48,6 +48,7 @@ #include "nbnxn_kernel_ref.h" #include "../nbnxn_consts.h" #include "nbnxn_kernel_common.h" +#include "gromacs/mdlib/nb_verlet.h" /*! \brief Typedefs for declaring lookup tables of kernel functions. */ diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h index bfcfee5b77..16e864d9a3 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -37,6 +37,7 @@ #define _nbnxn_kernel_ref_h #include "typedefs.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" #ifdef __cplusplus extern "C" { diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c index a6f53dffa6..135f9605c4 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c @@ -41,6 +41,7 @@ #include "typedefs.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/mdlib/nbnxn_simd.h" #ifdef GMX_NBNXN_SIMD_2XNN diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h index c7ec9bcbd7..6baaa8aadb 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h @@ -39,6 +39,7 @@ #include "typedefs.h" +#include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_simd.h" #ifdef __cplusplus diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c index de84c80532..8e1db2d10a 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c @@ -41,6 +41,7 @@ #include "typedefs.h" +#include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_simd.h" #ifdef GMX_NBNXN_SIMD_4XN diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h index e6e475765a..7fcc431a3a 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h @@ -39,6 +39,7 @@ #include "typedefs.h" +#include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/mdlib/nbnxn_simd.h" 
#ifdef __cplusplus diff --git a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h similarity index 98% rename from src/gromacs/legacyheaders/types/nbnxn_pairlist.h rename to src/gromacs/mdlib/nbnxn_pairlist.h index dec56d38f1..f03009d9a8 100644 --- a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h +++ b/src/gromacs/mdlib/nbnxn_pairlist.h @@ -36,7 +36,8 @@ #ifndef _nbnxn_pairlist_h #define _nbnxn_pairlist_h -#include "nblist.h" +#include "thread_mpi/atomic.h" +#include "types/nblist.h" #ifdef __cplusplus extern "C" { @@ -124,7 +125,7 @@ typedef struct { */ } nbnxn_excl_t; -typedef struct { +typedef struct nbnxn_pairlist_t { gmx_cache_protect_t cp0; nbnxn_alloc_t *alloc; @@ -222,10 +223,7 @@ enum { ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR }; -/* TODO: Remove need for forward declare */ -struct tMPI_Atomic; - -typedef struct { +typedef struct nbnxn_atomdata_t { nbnxn_alloc_t *alloc; nbnxn_free_t *free; int ntype; /* The number of different atom types */ @@ -269,7 +267,7 @@ typedef struct { gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */ nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. 
*/ gmx_bool bUseTreeReduce; /* Use tree for force reduction */ - struct tMPI_Atomic *syncStep; /* Synchronization step for tree reduce */ + tMPI_Atomic_t *syncStep; /* Synchronization step for tree reduce */ } nbnxn_atomdata_t; #ifdef __cplusplus diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c index 0826017bab..620dc0dd74 100644 --- a/src/gromacs/mdlib/nbnxn_search.c +++ b/src/gromacs/mdlib/nbnxn_search.c @@ -56,6 +56,7 @@ #include "ns.h" #include "gromacs/pbcutil/ishift.h" +#include "gromacs/mdlib/nb_verlet.h" #include "gromacs/pbcutil/pbc.h" #include "gromacs/utility/smalloc.h" diff --git a/src/gromacs/mdlib/nbnxn_search.h b/src/gromacs/mdlib/nbnxn_search.h index 6b3ab7c8d2..500c7188ac 100644 --- a/src/gromacs/mdlib/nbnxn_search.h +++ b/src/gromacs/mdlib/nbnxn_search.h @@ -37,6 +37,7 @@ #define _nbnxn_search_h #include "typedefs.h" +#include "nbnxn_pairlist.h" #ifdef __cplusplus extern "C" { diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c index 06bc1a858b..ada853a849 100644 --- a/src/gromacs/mdlib/sim_util.c +++ b/src/gromacs/mdlib/sim_util.c @@ -79,6 +79,7 @@ #include "../gmxlib/nonbonded/nb_free_energy.h" #include "gromacs/legacyheaders/types/commrec.h" +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/pbcutil/mshift.h" #include "gromacs/timing/wallcycle.h" @@ -94,9 +95,10 @@ #include "gmx_omp_nthreads.h" -#include "nbnxn_cuda_data_mgmt.h" #include "nbnxn_cuda/nbnxn_cuda.h" +#include "nb_verlet.h" + void print_time(FILE *out, gmx_walltime_accounting_t walltime_accounting, gmx_int64_t step, @@ -797,6 +799,11 @@ static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists, wallcycle_sub_stop(wcycle, ewcsNONBONDED); } +gmx_bool use_GPU(const nonbonded_verlet_t *nbv) +{ + return nbv != NULL && nbv->bUseGPU; +} + void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, t_inputrec *inputrec, gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle, @@ 
-2645,7 +2652,7 @@ void finish_run(FILE *fplog, t_commrec *cr, t_inputrec *inputrec, t_nrnb nrnb[], gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, - wallclock_gpu_t *gputimes, + nonbonded_verlet_t *nbv, gmx_bool bWriteStat) { int i, j; @@ -2709,6 +2716,8 @@ void finish_run(FILE *fplog, t_commrec *cr, if (SIMMASTER(cr)) { + wallclock_gpu_t* gputimes = use_GPU(nbv) ? + nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL; wallcycle_print(fplog, cr->nnodes, cr->npmenodes, elapsed_time_over_all_ranks, wcycle, gputimes); diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index 934443bcf3..011b84a3d8 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -76,7 +76,7 @@ #include "membed.h" #include "types/nlistheuristics.h" #include "types/iteratedconstraints.h" -#include "nbnxn_cuda_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h" #include "gromacs/fileio/confio.h" #include "gromacs/fileio/mdoutf.h" @@ -103,7 +103,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr, gmx_int64_t *step_rel, t_inputrec *ir, gmx_wallcycle_t wcycle, t_nrnb *nrnb, gmx_walltime_accounting_t walltime_accounting, - nbnxn_cuda_ptr_t cu_nbv) + struct nonbonded_verlet_t *nbv) { char sbuf[STEPSTRSIZE]; @@ -111,10 +111,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr, md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n", gmx_step_str(step, sbuf)); - if (cu_nbv) - { - nbnxn_cuda_reset_timings(cu_nbv); - } + nbnxn_cuda_reset_timings(nbv); wallcycle_stop(wcycle, ewcRUN); wallcycle_reset_all(wcycle); @@ -477,7 +474,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], */ if ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && - ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) && + ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) && !bRerunMD) { pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); @@ -1919,7 +1916,7 @@ 
double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], { /* Reset all the counters related to performance over the run */ reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, - fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL); + use_GPU(fr->nbv) ? fr->nbv : NULL); wcycle_set_reset_counters(wcycle, -1); if (!(cr->duty & DUTY_PME)) { @@ -1974,7 +1971,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], if (pme_loadbal != NULL) { pme_loadbal_done(pme_loadbal, cr, fplog, - fr->nbv != NULL && fr->nbv->bUseGPU); + use_GPU(fr->nbv)); } if (shellfc && fplog) diff --git a/src/programs/mdrun/pme_loadbal.c b/src/programs/mdrun/pme_loadbal.c index 2996bc6210..75c6c78917 100644 --- a/src/programs/mdrun/pme_loadbal.c +++ b/src/programs/mdrun/pme_loadbal.c @@ -39,13 +39,14 @@ #include "calcgrid.h" #include "pme.h" #include "domdec.h" -#include "nbnxn_cuda_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h" #include "force.h" #include "macros.h" #include "md_logging.h" #include "pme_loadbal.h" #include "gromacs/math/vec.h" +#include "gromacs/legacyheaders/sim_util.h" #include "gromacs/pbcutil/pbc.h" #include "gromacs/utility/cstringutil.h" #include "gromacs/utility/smalloc.h" @@ -428,17 +429,17 @@ static void switch_to_stage1(pme_load_balancing_t pme_lb) pme_lb->cur = pme_lb->start - 1; } -gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, - t_commrec *cr, - FILE *fp_err, - FILE *fp_log, - t_inputrec *ir, - t_state *state, - double cycles, - interaction_const_t *ic, - nonbonded_verlet_t *nbv, - gmx_pme_t *pmedata, - gmx_int64_t step) +gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, + t_commrec *cr, + FILE *fp_err, + FILE *fp_log, + t_inputrec *ir, + t_state *state, + double cycles, + interaction_const_t *ic, + struct nonbonded_verlet_t *nbv, + gmx_pme_t *pmedata, + gmx_int64_t step) { gmx_bool OK; pme_setup_t *set; @@ -690,30 +691,26 @@ gmx_bool 
pme_load_balance(pme_load_balancing_t pme_lb, } bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0); - if (pme_lb->cutoff_scheme == ecutsVERLET && - nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA) - { - nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic); - - /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore - * also sharing texture references. To keep the code simple, we don't - * treat texture references as shared resources, but this means that - * the coulomb_tab texture ref will get updated by multiple threads. - * Hence, to ensure that the non-bonded kernels don't start before all - * texture binding operations are finished, we need to wait for all ranks - * to arrive here before continuing. - * - * Note that we could omit this barrier if GPUs are not shared (or - * texture objects are used), but as this is initialization code, there - * is not point in complicating things. - */ + nbnxn_cuda_pme_loadbal_update_param(nbv, ic); + + /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore + * also sharing texture references. To keep the code simple, we don't + * treat texture references as shared resources, but this means that + * the coulomb_tab texture ref will get updated by multiple threads. + * Hence, to ensure that the non-bonded kernels don't start before all + * texture binding operations are finished, we need to wait for all ranks + * to arrive here before continuing. + * + * Note that we could omit this barrier if GPUs are not shared (or + * texture objects are used), but as this is initialization code, there + * is not point in complicating things. + */ #ifdef GMX_THREAD_MPI - if (PAR(cr)) - { - gmx_barrier(cr); - } -#endif /* GMX_THREAD_MPI */ + if (PAR(cr) && use_GPU(nbv)) + { + gmx_barrier(cr); } +#endif /* GMX_THREAD_MPI */ /* Usually we won't need the simple tables with GPUs. * But we do with hybrid acceleration and with free energy. 
diff --git a/src/programs/mdrun/pme_loadbal.h b/src/programs/mdrun/pme_loadbal.h index eddb4d67eb..a96881eac3 100644 --- a/src/programs/mdrun/pme_loadbal.h +++ b/src/programs/mdrun/pme_loadbal.h @@ -60,17 +60,17 @@ void pme_loadbal_init(pme_load_balancing_t *pme_lb_p, * factors as well as DD load balancing. * Returns TRUE the load balancing continues, FALSE is the balancing is done. */ -gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, - t_commrec *cr, - FILE *fp_err, - FILE *fp_log, - t_inputrec *ir, - t_state *state, - double cycles, - interaction_const_t *ic, - nonbonded_verlet_t *nbv, - gmx_pme_t *pmedata, - gmx_int64_t step); +gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, + t_commrec *cr, + FILE *fp_err, + FILE *fp_log, + t_inputrec *ir, + t_state *state, + double cycles, + interaction_const_t *ic, + struct nonbonded_verlet_t *nbv, + gmx_pme_t *pmedata, + gmx_int64_t step); /* Restart the PME load balancing discarding all timings gathered up till now */ void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n); diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index 9a00d35da4..4281e4e035 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -97,7 +97,6 @@ #endif #include "gpu_utils.h" -#include "nbnxn_cuda_data_mgmt.h" typedef struct { gmx_integrator_t *func; @@ -1037,47 +1036,6 @@ static void override_nsteps_cmdline(FILE *fplog, } } -/* Frees GPU memory and destroys the CUDA context. - * - * Note that this function needs to be called even if GPUs are not used - * in this run because the PME ranks have no knowledge of whether GPUs - * are used or not, but all ranks need to enter the barrier below. 
- */ -static void free_gpu_resources(const t_forcerec *fr, - const t_commrec *cr) -{ - gmx_bool bIsPPrankUsingGPU; - char gpu_err_str[STRLEN]; - - bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU; - - if (bIsPPrankUsingGPU) - { - /* free nbnxn data in GPU memory */ - nbnxn_cuda_free(fr->nbv->cu_nbv); - - /* With tMPI we need to wait for all ranks to finish deallocation before - * destroying the context in free_gpu() as some ranks may be sharing - * GPU and context. - * Note: as only PP ranks need to free GPU resources, so it is safe to - * not call the barrier on PME ranks. - */ -#ifdef GMX_THREAD_MPI - if (PAR(cr)) - { - gmx_barrier(cr); - } -#endif /* GMX_THREAD_MPI */ - - /* uninitialize GPU (by destroying the context) */ - if (!free_gpu(gpu_err_str)) - { - gmx_warning("On rank %d failed to free GPU #%d: %s", - cr->nodeid, get_current_gpu_device_id(), gpu_err_str); - } - } -} - int mdrunner(gmx_hw_opt_t *hw_opt, FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, @@ -1793,8 +1751,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt, */ finish_run(fplog, cr, inputrec, nrnb, wcycle, walltime_accounting, - fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ? - nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL, + fr ? fr->nbv : NULL, EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));