From: Roland Schulz <roland@utk.edu>
Date: Tue, 20 May 2014 04:56:53 +0000 (-0400)
Subject: Move some verlet headers to mdlib
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=6400ed5de8d26899d750a2652f7bf9e54125c540;p=alexxy%2Fgromacs.git

Move some verlet headers to mdlib

This change removes all dependencies on those (explicit) types
from files outside of mdlib (the only ones were in programs/mdrun).

Change-Id: Ia2b90303249e3e7454ac869afb5cbe0cf290a400
---

diff --git a/src/gromacs/legacyheaders/CMakeLists.txt b/src/gromacs/legacyheaders/CMakeLists.txt
index 586797fab9..aea277ee6f 100644
--- a/src/gromacs/legacyheaders/CMakeLists.txt
+++ b/src/gromacs/legacyheaders/CMakeLists.txt
@@ -35,7 +35,7 @@
 # includes: Nothing to build, just installation
 file(GLOB ROOT_LEGACY_HEADERS          *.h)
 file(GLOB ROOT_LEGACY_HEADERS_PRIVATE  thread_mpi.h tmpi.h gmx_hash.h 
-     gmx_ga2la.h gpu_utils.h pmalloc_cuda.h nbnxn_cuda_data_mgmt.h)
+     gmx_ga2la.h gpu_utils.h pmalloc_cuda.h)
 file(GLOB TYPES_LEGACY_HEADERS         types/*.h)
 file(GLOB TYPES_LEGACY_HEADERS_PRIVATE types/commrec.h)
 list(REMOVE_ITEM ROOT_LEGACY_HEADERS   ${ROOT_LEGACY_HEADERS_PRIVATE})
diff --git a/src/gromacs/legacyheaders/force.h b/src/gromacs/legacyheaders/force.h
index 087e5d1ecb..1d3fb6bc79 100644
--- a/src/gromacs/legacyheaders/force.h
+++ b/src/gromacs/legacyheaders/force.h
@@ -149,9 +149,9 @@ gmx_bool nbnxn_acceleration_supported(FILE             *fplog,
  * message to fplog/stderr.
  */
 
-gmx_bool uses_simple_tables(int                 cutoff_scheme,
-                            nonbonded_verlet_t *nbv,
-                            int                 group);
+gmx_bool uses_simple_tables(int                        cutoff_scheme,
+                            struct nonbonded_verlet_t *nbv,
+                            int                        group);
 /* Returns whether simple tables (i.e. not for use with GPUs) are used
  * with the type of kernel indicated.
  */
@@ -285,6 +285,9 @@ extern void do_force_lowlevel(FILE         *fplog,
                               float        *cycles_pme);
 /* Call all the force routines */
 
+void free_gpu_resources(const t_forcerec *fr,
+                        const t_commrec  *cr);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gromacs/legacyheaders/sim_util.h b/src/gromacs/legacyheaders/sim_util.h
index e21a7d278b..5a404cc400 100644
--- a/src/gromacs/legacyheaders/sim_util.h
+++ b/src/gromacs/legacyheaders/sim_util.h
@@ -109,7 +109,7 @@ void finish_run(FILE *log, t_commrec *cr,
                 t_inputrec *inputrec,
                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                 gmx_walltime_accounting_t walltime_accounting,
-                wallclock_gpu_t *gputimes,
+                struct nonbonded_verlet_t *nbv,
                 gmx_bool bWriteStat);
 
 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr);
@@ -139,6 +139,8 @@ void init_md(FILE *fplog,
              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags);
 /* Routine in sim_util.c */
 
+gmx_bool use_GPU(const struct nonbonded_verlet_t *nbv);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gromacs/legacyheaders/typedefs.h b/src/gromacs/legacyheaders/typedefs.h
index 242435d4e9..72661f7e95 100644
--- a/src/gromacs/legacyheaders/typedefs.h
+++ b/src/gromacs/legacyheaders/typedefs.h
@@ -52,7 +52,6 @@
 #include "types/inputrec.h"
 #include "types/nrnb.h"
 #include "types/nblist.h"
-#include "types/nbnxn_pairlist.h"
 #include "types/nsgrid.h"
 #include "types/forcerec.h"
 #include "types/fcdata.h"
diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h
index 9be9520633..16b0e87841 100644
--- a/src/gromacs/legacyheaders/types/forcerec.h
+++ b/src/gromacs/legacyheaders/types/forcerec.h
@@ -39,7 +39,6 @@
 #include "genborn.h"
 #include "qmmmrec.h"
 #include "../../topology/idef.h"
-#include "nb_verlet.h"
 #include "interaction_const.h"
 #include "hw_info.h"
 
@@ -52,8 +51,7 @@ extern "C" {
 
 /* Abstract type for PME that is defined only in the routine that use them. */
 typedef struct gmx_pme *gmx_pme_t;
-
-
+struct nonbonded_verlet_t;
 
 /* Structure describing the data in a single table */
 typedef struct
@@ -315,13 +313,13 @@ typedef struct {
     rvec        *shift_vec;
 
     /* The neighborlists including tables */
-    int                 nnblists;
-    int                *gid2nblists;
-    t_nblists          *nblists;
+    int                        nnblists;
+    int                       *gid2nblists;
+    t_nblists                 *nblists;
 
-    int                 cutoff_scheme; /* group- or Verlet-style cutoff */
-    gmx_bool            bNonbonded;    /* true if nonbonded calculations are *not* turned off */
-    nonbonded_verlet_t *nbv;
+    int                        cutoff_scheme; /* group- or Verlet-style cutoff */
+    gmx_bool                   bNonbonded;    /* true if nonbonded calculations are *not* turned off */
+    struct nonbonded_verlet_t *nbv;
 
     /* The wall tables (if used) */
     int            nwall;
diff --git a/src/gromacs/mdlib/domdec.c b/src/gromacs/mdlib/domdec.c
index a70ab1c522..b94aa743fc 100644
--- a/src/gromacs/mdlib/domdec.c
+++ b/src/gromacs/mdlib/domdec.c
@@ -69,6 +69,7 @@
 #include "gromacs/fileio/gmxfio.h"
 #include "gromacs/fileio/pdbio.h"
 #include "gromacs/imd/imd.h"
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/pull.h"
diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c
index f2620a9de7..031d517886 100644
--- a/src/gromacs/mdlib/forcerec.c
+++ b/src/gromacs/mdlib/forcerec.c
@@ -76,8 +76,9 @@
 
 #include "types/nbnxn_cuda_types_ext.h"
 #include "gpu_utils.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 #include "pmalloc_cuda.h"
+#include "nb_verlet.h"
 
 t_forcerec *mk_forcerec(void)
 {
@@ -3321,3 +3322,44 @@ void forcerec_set_excl_load(t_forcerec           *fr,
         fr->excl_load[t] = i;
     }
 }
+
+/* Frees the nbnxn data in GPU memory and uninitializes the GPU.
+ *
+ * Only acts on ranks that have DUTY_PP set and whose fr->nbv reports
+ * bUseGPU; on every other rank (including a NULL fr or fr->nbv) it is a
+ * no-op, so the thread-MPI barrier below is entered by those ranks only.
+ */
+void free_gpu_resources(const t_forcerec *fr,
+                        const t_commrec  *cr)
+{
+    gmx_bool bIsPPrankUsingGPU;
+    char     gpu_err_str[STRLEN];
+
+    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+
+    if (bIsPPrankUsingGPU)
+    {
+        /* free nbnxn data in GPU memory */
+        nbnxn_cuda_free(fr->nbv->cu_nbv);
+
+        /* With tMPI we need to wait for all ranks to finish deallocation before
+         * destroying the context in free_gpu() as some ranks may be sharing
+         * GPU and context.
+         * Note: only PP ranks need to free GPU resources, so it is safe to
+         * not call the barrier on PME ranks.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif  /* GMX_THREAD_MPI */
+
+        /* uninitialize GPU (by destroying the context) */
+        if (!free_gpu(gpu_err_str))
+        {
+            gmx_warning("On rank %d failed to free GPU #%d: %s",
+                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+        }
+    }
+}
diff --git a/src/gromacs/legacyheaders/types/nb_verlet.h b/src/gromacs/mdlib/nb_verlet.h
similarity index 97%
rename from src/gromacs/legacyheaders/types/nb_verlet.h
rename to src/gromacs/mdlib/nb_verlet.h
index e1e8ab0ca8..4f51797e29 100644
--- a/src/gromacs/legacyheaders/types/nb_verlet.h
+++ b/src/gromacs/mdlib/nb_verlet.h
@@ -37,7 +37,7 @@
 #define NB_VERLET_H
 
 #include "nbnxn_pairlist.h"
-#include "nbnxn_cuda_types_ext.h"
+#include "types/nbnxn_cuda_types_ext.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -88,7 +88,7 @@ enum {
     enbvClearFNo, enbvClearFYes
 };
 
-typedef struct {
+typedef struct nonbonded_verlet_group_t {
     nbnxn_pairlist_set_t  nbl_lists;   /* pair list(s)                       */
     nbnxn_atomdata_t     *nbat;        /* atom data                          */
     int                   kernel_type; /* non-bonded kernel - see enum above */
@@ -96,7 +96,7 @@ typedef struct {
 } nonbonded_verlet_group_t;
 
 /* non-bonded data structure with Verlet-type cut-off */
-typedef struct {
+typedef struct nonbonded_verlet_t {
     nbnxn_search_t           nbs;             /* n vs n atom pair searching data       */
     int                      ngrp;            /* number of interaction groups          */
     nonbonded_verlet_group_t grp[2];          /* local and non-local interaction group */
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c
index 1babe769f7..5e4dfef0c0 100644
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -49,6 +49,7 @@
 #include "gmx_omp_nthreads.h"
 #include "thread_mpi/atomic.h"
 
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/gmxomp.h"
 #include "gromacs/utility/smalloc.h"
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.h b/src/gromacs/mdlib/nbnxn_atomdata.h
index 5855e5b50d..efe86a5b3a 100644
--- a/src/gromacs/mdlib/nbnxn_atomdata.h
+++ b/src/gromacs/mdlib/nbnxn_atomdata.h
@@ -37,6 +37,7 @@
 #define _nbnxn_atomdata_h
 
 #include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h
index f5bd3d01ab..719e47b8f4 100644
--- a/src/gromacs/mdlib/nbnxn_consts.h
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014 by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -63,7 +63,7 @@ extern "C" {
 #define NBNXN_GPU_CLUSTER_SIZE         8
 
 /* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
- * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ * To change this, also change nbnxn_cj4_t in gromacs/mdlib/nbnxn_pairlist.h.
  */
 #define NBNXN_GPU_JGROUP_SIZE       4
 #define NBNXN_GPU_JGROUP_SIZE_2LOG  2
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 40d86e1b08..fa2eb36b1c 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -44,8 +44,8 @@
 #include <cuda.h>
 
 #include "types/simple.h"
-#include "types/nbnxn_pairlist.h"
-#include "types/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nb_verlet.h"
 #include "types/force_flags.h"
 #include "../nbnxn_consts.h"
 
@@ -56,7 +56,7 @@
 #include "nbnxn_cuda_types.h"
 #include "../../gmxlib/cuda_tools/cudautils.cuh"
 #include "nbnxn_cuda.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/cstringutil.h"
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 37679494fa..57fd906c10 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -44,7 +44,7 @@
 #include "tables.h"
 #include "typedefs.h"
 #include "types/enums.h"
-#include "types/nb_verlet.h"
+#include "gromacs/mdlib/nb_verlet.h"
 #include "types/interaction_const.h"
 #include "types/force_flags.h"
 #include "../nbnxn_consts.h"
@@ -52,7 +52,7 @@
 
 #include "nbnxn_cuda_types.h"
 #include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 #include "pmalloc_cuda.h"
 #include "gpu_utils.h"
 
@@ -426,10 +426,15 @@ static void init_nbparam(cu_nbparam_t              *nbp,
 
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
-                                         const interaction_const_t *ic)
+void nbnxn_cuda_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                         const interaction_const_t   *ic)
 {
-    cu_nbparam_t *nbp = cu_nb->nbparam;
+    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_CUDA)
+    {
+        return;
+    }
+    nbnxn_cuda_ptr_t cu_nb = nbv->cu_nbv;
+    cu_nbparam_t    *nbp   = cu_nb->nbparam;
 
     set_cutoff_parameters(nbp, ic);
 
@@ -1077,11 +1082,11 @@ wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
     return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
 }
 
-void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
+void nbnxn_cuda_reset_timings(nonbonded_verlet_t* nbv)
 {
-    if (cu_nb->bDoTime)
+    if (nbv && nbv->cu_nbv && nbv->cu_nbv->bDoTime) /* nbv may be NULL: callers pass NULL when no GPU is used */
     {
-        init_timings(cu_nb->timings);
+        init_timings(nbv->cu_nbv->timings);
     }
 }
 }
 
diff --git a/src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h
similarity index 84%
rename from src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h
rename to src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h
index 919e1355b0..f77a91583b 100644
--- a/src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h
@@ -40,7 +40,6 @@
 #include "types/interaction_const.h"
 #include "types/nbnxn_cuda_types_ext.h"
 #include "types/hw_info.h"
-#include "types/nb_verlet.h"
 
 #ifdef GMX_GPU
 #define FUNC_TERM ;
@@ -54,6 +53,10 @@
 extern "C" {
 #endif
 
+struct nonbonded_verlet_group_t;
+struct nbnxn_pairlist_t;
+struct nbnxn_atomdata_t;
+
 /** Initializes the data structures related to CUDA nonbonded calculations. */
 FUNC_QUALIFIER
 void nbnxn_cuda_init(FILE gmx_unused                 *fplog,
@@ -66,30 +69,30 @@ void nbnxn_cuda_init(FILE gmx_unused                 *fplog,
 
 /** Initializes simulation constant data. */
 FUNC_QUALIFIER
-void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t               gmx_unused  cu_nb,
-                           const interaction_const_t      gmx_unused *ic,
-                           const nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM
+void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t               gmx_unused         cu_nb,
+                           const interaction_const_t      gmx_unused        *ic,
+                           const struct nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM
 
 /** Initializes pair-list data for GPU, called at every pair search step. */
 FUNC_QUALIFIER
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t       gmx_unused  cu_nb,
-                              const nbnxn_pairlist_t gmx_unused *h_nblist,
-                              int                    gmx_unused  iloc) FUNC_TERM
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t       gmx_unused         cu_nb,
+                              const struct nbnxn_pairlist_t gmx_unused *h_nblist,
+                              int                    gmx_unused         iloc) FUNC_TERM
 
 /** Initializes atom-data on the GPU, called at every pair search step. */
 FUNC_QUALIFIER
-void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t       gmx_unused  cu_nb,
-                              const nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM
+void nbnxn_cuda_init_atomdata(const nbnxn_cuda_ptr_t       gmx_unused   cu_nb,
+                              const struct nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM
 
 /*! \brief Update parameters during PP-PME load balancing. */
 FUNC_QUALIFIER
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t          gmx_unused  cu_nb,
-                                         const interaction_const_t gmx_unused *ic) FUNC_TERM
+void nbnxn_cuda_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
+                                         const interaction_const_t gmx_unused       *ic) FUNC_TERM
 
 /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
 FUNC_QUALIFIER
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t       gmx_unused  cu_nb,
-                                const nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t       gmx_unused         cu_nb,
+                                const struct nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM
 
 /** Clears GPU outputs: nonbonded force, shift force and energy. */
 FUNC_QUALIFIER
@@ -113,7 +116,7 @@ wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t gmx_unused cu_nb)
 
 /** Resets nonbonded GPU timings. */
 FUNC_QUALIFIER
-void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t gmx_unused cu_nb) FUNC_TERM
+void nbnxn_cuda_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) FUNC_TERM
 
 /** Calculates the minimum size of proximity lists to improve SM load balance
  *  with CUDA non-bonded kernels. */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 0fa40d2466..74df69eb34 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -47,7 +47,7 @@
 #define NBNXN_CUDA_TYPES_H
 
 #include "types/interaction_const.h"
-#include "types/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 #include "types/nbnxn_cuda_types_ext.h"
 #include "../../gmxlib/cuda_tools/cudautils.cuh"
 
diff --git a/src/gromacs/mdlib/nbnxn_internal.h b/src/gromacs/mdlib/nbnxn_internal.h
index fc42e60935..352253ec9d 100644
--- a/src/gromacs/mdlib/nbnxn_internal.h
+++ b/src/gromacs/mdlib/nbnxn_internal.h
@@ -40,6 +40,7 @@
 #include "nbnxn_simd.h"
 #include "domdec.h"
 #include "gromacs/timing/cyclecounter.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 
 
 /* Bounding box calculations are (currently) always in single precision, so
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h
index 7855b310fe..89b365f55b 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -37,6 +37,7 @@
 #define _nbnxn_kernel_common_h
 
 #include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
index 898d300da1..a606329e56 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
@@ -47,6 +47,7 @@
 #include "nbnxn_kernel_gpu_ref.h"
 #include "../nbnxn_consts.h"
 #include "nbnxn_kernel_common.h"
+#include "gromacs/mdlib/nb_verlet.h"
 
 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h
index 18f4e9d01f..2fda7440a6 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -37,6 +37,7 @@
 #define _nbnxn_kernel_gpu_ref_h
 
 #include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
index 4638d1b6a4..ca00dc79ac 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
@@ -48,6 +48,7 @@
 #include "nbnxn_kernel_ref.h"
 #include "../nbnxn_consts.h"
 #include "nbnxn_kernel_common.h"
+#include "gromacs/mdlib/nb_verlet.h"
 
 /*! \brief Typedefs for declaring lookup tables of kernel functions.
  */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h
index bfcfee5b77..16e864d9a3 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -37,6 +37,7 @@
 #define _nbnxn_kernel_ref_h
 
 #include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
index a6f53dffa6..135f9605c4 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
@@ -41,6 +41,7 @@
 
 #include "typedefs.h"
 
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 #include "gromacs/mdlib/nbnxn_simd.h"
 
 #ifdef GMX_NBNXN_SIMD_2XNN
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h
index c7ec9bcbd7..6baaa8aadb 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h
@@ -39,6 +39,7 @@
 
 #include "typedefs.h"
 
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/mdlib/nbnxn_simd.h"
 
 #ifdef __cplusplus
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
index de84c80532..8e1db2d10a 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
@@ -41,6 +41,7 @@
 
 #include "typedefs.h"
 
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/mdlib/nbnxn_simd.h"
 
 #ifdef GMX_NBNXN_SIMD_4XN
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h
index e6e475765a..7fcc431a3a 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h
@@ -39,6 +39,7 @@
 
 #include "typedefs.h"
 
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 #include "gromacs/mdlib/nbnxn_simd.h"
 
 #ifdef __cplusplus
diff --git a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h
similarity index 98%
rename from src/gromacs/legacyheaders/types/nbnxn_pairlist.h
rename to src/gromacs/mdlib/nbnxn_pairlist.h
index dec56d38f1..f03009d9a8 100644
--- a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h
+++ b/src/gromacs/mdlib/nbnxn_pairlist.h
@@ -36,7 +36,8 @@
 #ifndef _nbnxn_pairlist_h
 #define _nbnxn_pairlist_h
 
-#include "nblist.h"
+#include "thread_mpi/atomic.h"
+#include "types/nblist.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -124,7 +125,7 @@ typedef struct {
                             */
 } nbnxn_excl_t;
 
-typedef struct {
+typedef struct nbnxn_pairlist_t {
     gmx_cache_protect_t cp0;
 
     nbnxn_alloc_t      *alloc;
@@ -222,10 +223,7 @@ enum {
     ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR
 };
 
-/* TODO: Remove need for forward declare */
-struct tMPI_Atomic;
-
-typedef struct {
+typedef struct nbnxn_atomdata_t {
     nbnxn_alloc_t           *alloc;
     nbnxn_free_t            *free;
     int                      ntype;           /* The number of different atom types                 */
@@ -269,7 +267,7 @@ typedef struct {
     gmx_bool                 bUseBufferFlags;        /* Use the flags or operate on all atoms     */
     nbnxn_buffer_flags_t     buffer_flags;           /* Flags for buffer zeroing+reduc.  */
     gmx_bool                 bUseTreeReduce;         /* Use tree for force reduction */
-    struct tMPI_Atomic      *syncStep;               /* Synchronization step for tree reduce */
+    tMPI_Atomic_t           *syncStep;               /* Synchronization step for tree reduce */
 } nbnxn_atomdata_t;
 
 #ifdef __cplusplus
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c
index 0826017bab..620dc0dd74 100644
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -56,6 +56,7 @@
 #include "ns.h"
 
 #include "gromacs/pbcutil/ishift.h"
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/utility/smalloc.h"
 
diff --git a/src/gromacs/mdlib/nbnxn_search.h b/src/gromacs/mdlib/nbnxn_search.h
index 6b3ab7c8d2..500c7188ac 100644
--- a/src/gromacs/mdlib/nbnxn_search.h
+++ b/src/gromacs/mdlib/nbnxn_search.h
@@ -37,6 +37,7 @@
 #define _nbnxn_search_h
 
 #include "typedefs.h"
+#include "nbnxn_pairlist.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c
index 06bc1a858b..ada853a849 100644
--- a/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.c
@@ -79,6 +79,7 @@
 #include "../gmxlib/nonbonded/nb_free_energy.h"
 
 #include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/timing/wallcycle.h"
@@ -94,9 +95,10 @@
 
 #include "gmx_omp_nthreads.h"
 
-#include "nbnxn_cuda_data_mgmt.h"
 #include "nbnxn_cuda/nbnxn_cuda.h"
 
+#include "nb_verlet.h"
+
 void print_time(FILE                     *out,
                 gmx_walltime_accounting_t walltime_accounting,
                 gmx_int64_t               step,
@@ -797,6 +799,11 @@ static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 }
 
+gmx_bool use_GPU(const nonbonded_verlet_t *nbv) /* TRUE only for a non-NULL nbv with bUseGPU set */
+{
+    return nbv != NULL && nbv->bUseGPU;
+}
+
 void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                          t_inputrec *inputrec,
                          gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
@@ -2645,7 +2652,7 @@ void finish_run(FILE *fplog, t_commrec *cr,
                 t_inputrec *inputrec,
                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                 gmx_walltime_accounting_t walltime_accounting,
-                wallclock_gpu_t *gputimes,
+                nonbonded_verlet_t *nbv,
                 gmx_bool bWriteStat)
 {
     int     i, j;
@@ -2709,6 +2716,8 @@ void finish_run(FILE *fplog, t_commrec *cr,
 
     if (SIMMASTER(cr))
     {
+        wallclock_gpu_t* gputimes = use_GPU(nbv) ?
+            nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL;
         wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
                         elapsed_time_over_all_ranks,
                         wcycle, gputimes);
diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp
index 934443bcf3..011b84a3d8 100644
--- a/src/programs/mdrun/md.cpp
+++ b/src/programs/mdrun/md.cpp
@@ -76,7 +76,7 @@
 #include "membed.h"
 #include "types/nlistheuristics.h"
 #include "types/iteratedconstraints.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 
 #include "gromacs/fileio/confio.h"
 #include "gromacs/fileio/mdoutf.h"
@@ -103,7 +103,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
                                gmx_int64_t *step_rel, t_inputrec *ir,
                                gmx_wallcycle_t wcycle, t_nrnb *nrnb,
                                gmx_walltime_accounting_t walltime_accounting,
-                               nbnxn_cuda_ptr_t cu_nbv)
+                               struct nonbonded_verlet_t *nbv)
 {
     char sbuf[STEPSTRSIZE];
 
@@ -111,10 +111,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
     md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
                   gmx_step_str(step, sbuf));
 
-    if (cu_nbv)
-    {
-        nbnxn_cuda_reset_timings(cu_nbv);
-    }
+    nbnxn_cuda_reset_timings(nbv);
 
     wallcycle_stop(wcycle, ewcRUN);
     wallcycle_reset_all(wcycle);
@@ -477,7 +474,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
      */
     if ((Flags & MD_TUNEPME) &&
         EEL_PME(fr->eeltype) &&
-        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+        ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
         !bRerunMD)
     {
         pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
@@ -1919,7 +1916,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         {
             /* Reset all the counters related to performance over the run */
             reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
-                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+                               use_GPU(fr->nbv) ? fr->nbv : NULL);
             wcycle_set_reset_counters(wcycle, -1);
             if (!(cr->duty & DUTY_PME))
             {
@@ -1974,7 +1971,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     if (pme_loadbal != NULL)
     {
         pme_loadbal_done(pme_loadbal, cr, fplog,
-                         fr->nbv != NULL && fr->nbv->bUseGPU);
+                         use_GPU(fr->nbv));
     }
 
     if (shellfc && fplog)
diff --git a/src/programs/mdrun/pme_loadbal.c b/src/programs/mdrun/pme_loadbal.c
index 2996bc6210..75c6c78917 100644
--- a/src/programs/mdrun/pme_loadbal.c
+++ b/src/programs/mdrun/pme_loadbal.c
@@ -39,13 +39,14 @@
 #include "calcgrid.h"
 #include "pme.h"
 #include "domdec.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 #include "force.h"
 #include "macros.h"
 #include "md_logging.h"
 #include "pme_loadbal.h"
 
 #include "gromacs/math/vec.h"
+#include "gromacs/legacyheaders/sim_util.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/smalloc.h"
@@ -428,17 +429,17 @@ static void switch_to_stage1(pme_load_balancing_t pme_lb)
     pme_lb->cur = pme_lb->start - 1;
 }
 
-gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
-                          t_commrec           *cr,
-                          FILE                *fp_err,
-                          FILE                *fp_log,
-                          t_inputrec          *ir,
-                          t_state             *state,
-                          double               cycles,
-                          interaction_const_t *ic,
-                          nonbonded_verlet_t  *nbv,
-                          gmx_pme_t           *pmedata,
-                          gmx_int64_t          step)
+gmx_bool pme_load_balance(pme_load_balancing_t        pme_lb,
+                          t_commrec                  *cr,
+                          FILE                       *fp_err,
+                          FILE                       *fp_log,
+                          t_inputrec                 *ir,
+                          t_state                    *state,
+                          double                      cycles,
+                          interaction_const_t        *ic,
+                          struct nonbonded_verlet_t  *nbv,
+                          gmx_pme_t                  *pmedata,
+                          gmx_int64_t                 step)
 {
     gmx_bool     OK;
     pme_setup_t *set;
@@ -690,30 +691,26 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
     }
 
     bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
-    if (pme_lb->cutoff_scheme == ecutsVERLET &&
-        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
-    {
-        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
-
-        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
-         * also sharing texture references. To keep the code simple, we don't
-         * treat texture references as shared resources, but this means that
-         * the coulomb_tab texture ref will get updated by multiple threads.
-         * Hence, to ensure that the non-bonded kernels don't start before all
-         * texture binding operations are finished, we need to wait for all ranks
-         * to arrive here before continuing.
-         *
-         * Note that we could omit this barrier if GPUs are not shared (or
-         * texture objects are used), but as this is initialization code, there
-         * is not point in complicating things.
-         */
+    nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
+
+    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+     * also sharing texture references. To keep the code simple, we don't
+     * treat texture references as shared resources, but this means that
+     * the coulomb_tab texture ref will get updated by multiple threads.
+     * Hence, to ensure that the non-bonded kernels don't start before all
+     * texture binding operations are finished, we need to wait for all ranks
+     * to arrive here before continuing.
+     *
+     * Note that we could omit this barrier if GPUs are not shared (or
+     * texture objects are used), but as this is initialization code, there
+     * is no point in complicating things.
+     */
 #ifdef GMX_THREAD_MPI
-        if (PAR(cr))
-        {
-            gmx_barrier(cr);
-        }
-#endif  /* GMX_THREAD_MPI */
+    if (PAR(cr) && use_GPU(nbv))
+    {
+        gmx_barrier(cr);
     }
+#endif  /* GMX_THREAD_MPI */
 
     /* Usually we won't need the simple tables with GPUs.
      * But we do with hybrid acceleration and with free energy.
diff --git a/src/programs/mdrun/pme_loadbal.h b/src/programs/mdrun/pme_loadbal.h
index eddb4d67eb..a96881eac3 100644
--- a/src/programs/mdrun/pme_loadbal.h
+++ b/src/programs/mdrun/pme_loadbal.h
@@ -60,17 +60,17 @@ void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
  * factors as well as DD load balancing.
  * Returns TRUE the load balancing continues, FALSE is the balancing is done.
  */
-gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
-                          t_commrec           *cr,
-                          FILE                *fp_err,
-                          FILE                *fp_log,
-                          t_inputrec          *ir,
-                          t_state             *state,
-                          double               cycles,
-                          interaction_const_t *ic,
-                          nonbonded_verlet_t  *nbv,
-                          gmx_pme_t           *pmedata,
-                          gmx_int64_t          step);
+gmx_bool pme_load_balance(pme_load_balancing_t        pme_lb,
+                          t_commrec                  *cr,
+                          FILE                       *fp_err,
+                          FILE                       *fp_log,
+                          t_inputrec                 *ir,
+                          t_state                    *state,
+                          double                      cycles,
+                          interaction_const_t        *ic,
+                          struct nonbonded_verlet_t  *nbv,
+                          gmx_pme_t                  *pmedata,
+                          gmx_int64_t                 step);
 
 /* Restart the PME load balancing discarding all timings gathered up till now */
 void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n);
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 9a00d35da4..4281e4e035 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -97,7 +97,6 @@
 #endif
 
 #include "gpu_utils.h"
-#include "nbnxn_cuda_data_mgmt.h"
 
 typedef struct {
     gmx_integrator_t *func;
@@ -1037,47 +1036,6 @@ static void override_nsteps_cmdline(FILE            *fplog,
     }
 }
 
-/* Frees GPU memory and destroys the CUDA context.
- *
- * Note that this function needs to be called even if GPUs are not used
- * in this run because the PME ranks have no knowledge of whether GPUs
- * are used or not, but all ranks need to enter the barrier below.
- */
-static void free_gpu_resources(const t_forcerec *fr,
-                               const t_commrec  *cr)
-{
-    gmx_bool bIsPPrankUsingGPU;
-    char     gpu_err_str[STRLEN];
-
-    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU;
-
-    if (bIsPPrankUsingGPU)
-    {
-        /* free nbnxn data in GPU memory */
-        nbnxn_cuda_free(fr->nbv->cu_nbv);
-
-        /* With tMPI we need to wait for all ranks to finish deallocation before
-         * destroying the context in free_gpu() as some ranks may be sharing
-         * GPU and context.
-         * Note: as only PP ranks need to free GPU resources, so it is safe to
-         * not call the barrier on PME ranks.
-         */
-#ifdef GMX_THREAD_MPI
-        if (PAR(cr))
-        {
-            gmx_barrier(cr);
-        }
-#endif  /* GMX_THREAD_MPI */
-
-        /* uninitialize GPU (by destroying the context) */
-        if (!free_gpu(gpu_err_str))
-        {
-            gmx_warning("On rank %d failed to free GPU #%d: %s",
-                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
-        }
-    }
-}
-
 int mdrunner(gmx_hw_opt_t *hw_opt,
              FILE *fplog, t_commrec *cr, int nfile,
              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
@@ -1793,8 +1751,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
      */
     finish_run(fplog, cr,
                inputrec, nrnb, wcycle, walltime_accounting,
-               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
-               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+               fr ? fr->nbv : NULL,
                EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));