Separate management of GPU contexts from modules
authorMark Abraham <mark.j.abraham@gmail.com>
Fri, 3 Nov 2017 01:33:01 +0000 (02:33 +0100)
committerAleksei Iupinov <a.yupinov@gmail.com>
Fri, 24 Nov 2017 14:16:05 +0000 (15:16 +0100)
Tasks from modules might share GPU contexts across either tasks, or
thread-MPI ranks, so init and free operations can't be the
responsibility of the modules themselves.

Simplified the error reporting for init and free. Knowledge of the
rank ID might help in diagnosing issues in some cases, but should
(later) be the responsibility of a proper framework to catch errors
during initialization across MPI ranks.

Moved the GPU profiling cleanup back to where it was intended to be,
before some earlier refactoring had left it somewhere not-quite-right.

Change-Id: I682a1b1c7058cbebb41805dba05e688cbee18c2a

12 files changed:
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-internal.h
src/gromacs/ewald/pme.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/gpu_utils/gpu_utils.cu
src/gromacs/gpu_utils/gpu_utils.h
src/gromacs/gpu_utils/gpu_utils_ocl.cpp
src/gromacs/mdlib/force.h
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/forcerec.h
src/gromacs/taskassignment/taskassignment.h
src/programs/mdrun/runner.cpp

index 6e1d9992bede96cabfd1da6548348564160a8ca9..9487fd0c5de7bd9a364b7b87ed50fb123480822b 100644 (file)
@@ -282,11 +282,8 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error)
  *
  * \param[in,out] pme       The PME structure.
  * \param[in,out] gpuInfo   The GPU information structure.
- * \param[in]     mdlog     The logger.
- * \param[in]     cr        The communication structure.
  */
-static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog,
-                         const t_commrec *cr)
+static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
 {
     std::string errorString;
     bool        canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString);
@@ -308,8 +305,6 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::
 
     pme_gpu_set_testing(pmeGPU, false);
 
-    // GPU initialization
-    init_gpu(mdlog, cr->nodeid, gpuInfo);
     pmeGPU->deviceInfo = gpuInfo;
 
     pme_gpu_init_internal(pmeGPU);
@@ -395,7 +390,7 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr)
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
 {
     if (!pme_gpu_active(pme))
     {
@@ -405,7 +400,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog
     if (!pme->gpu)
     {
         /* First-time initialization */
-        pme_gpu_init(pme, gpuInfo, mdlog, cr);
+        pme_gpu_init(pme, gpuInfo);
     }
     else
     {
index be50ead6afc23a28f83b146608752f9a691cf98b..33c83103a852b2f22f1f4c4a632074c5c5dabeb5 100644 (file)
@@ -55,7 +55,6 @@
 struct gmx_hw_info_t;
 struct gmx_gpu_opt_t;
 struct gmx_pme_t;                              // only used in pme_gpu_reinit
-struct t_commrec;
 struct gmx_wallclock_gpu_pme_t;
 struct pme_atomcomm_t;
 struct t_complex;
@@ -636,11 +635,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx:
  *
  * \param[in,out] pme       The PME structure.
  * \param[in,out] gpuInfo   The GPU information structure.
- * \param[in]     mdlog     The logger.
- * \param[in]     cr        The communication structure.
  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
  */
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo);
 
 /*! \libinternal \brief
  * Destroys the PME GPU data at the end of the run.
index 344d8553cc5d162b2a93a1d93e34d5101dc5bc94..c90254b0a34c7e16d0f66a692706eb0b7434cccc 100644 (file)
@@ -520,7 +520,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
                         PmeRunMode           runMode,
                         PmeGpu              *pmeGPU,
                         gmx_device_info_t   *gpuInfo,
-                        const gmx::MDLogger &mdlog)
+                        const gmx::MDLogger  & /*mdlog*/)
 {
     int               use_threads, sum_use_threads, i;
     ivec              ndata;
@@ -853,7 +853,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
     pme->lb_buf2       = nullptr;
     pme->lb_buf_nalloc = 0;
 
-    pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr);
+    pme_gpu_reinit(pme.get(), gpuInfo);
 
     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
 
index f7c7b0f141f48f9b660da98ac8e99321085d5d70..f9ea663ce6f292348b84857e0bfcb009c90bb3a6 100644 (file)
@@ -52,6 +52,7 @@
 #include "gromacs/ewald/pme-solve.h"
 #include "gromacs/ewald/pme-spread.h"
 #include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/pbcutil/pbc.h"
@@ -103,6 +104,10 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
                                       )
 {
     const MDLogger dummyLogger;
+    if (gpuInfo)
+    {
+        init_gpu(dummyLogger, gpuInfo);
+    }
     const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::GPU;
     t_commrec      dummyCommrec  = {0};
     gmx_pme_t     *pmeDataRaw    = gmx_pme_init(&dummyCommrec, 1, 1, inputRec, atomCount, false, false, true,
index fb509f5d37e16b964b275d4a828796948d6274b7..4e2200794089385cc0310117b7e24ca7064de543 100644 (file)
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/logger.h"
 #include "gromacs/utility/programcontext.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/snprintf.h"
+#include "gromacs/utility/stringutil.h"
 
 #if HAVE_NVML
 #include <nvml.h>
@@ -507,20 +509,18 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused
 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 }
 
-void init_gpu(const gmx::MDLogger &mdlog, int rank,
-              gmx_device_info_t *deviceInfo)
+void init_gpu(const gmx::MDLogger &mdlog,
+              gmx_device_info_t   *deviceInfo)
 {
     cudaError_t stat;
-    char        sbuf[STRLEN];
 
     assert(deviceInfo);
 
     stat = cudaSetDevice(deviceInfo->id);
     if (stat != cudaSuccess)
     {
-        snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d",
-                 rank, deviceInfo->id);
-        CU_RET_ERR(stat, sbuf);
+        auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
+        CU_RET_ERR(stat, message.c_str());
     }
 
     if (debug)
@@ -534,13 +534,9 @@ void init_gpu(const gmx::MDLogger &mdlog, int rank,
     init_gpu_application_clocks(mdlog, deviceInfo);
 }
 
-gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo,
-                       char                    *result_str)
+void free_gpu(const gmx_device_info_t *deviceInfo)
 {
     cudaError_t  stat;
-    gmx_bool     reset_gpu_application_clocks_status = true;
-
-    assert(result_str);
 
     if (debug)
     {
@@ -552,12 +548,17 @@ gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo,
 
     if (deviceInfo != nullptr)
     {
-        reset_gpu_application_clocks_status = reset_gpu_application_clocks(deviceInfo);
+        if (!reset_gpu_application_clocks(deviceInfo))
+        {
+            gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
+        }
     }
 
     stat = cudaDeviceReset();
-    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
-    return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
+    if (stat != cudaSuccess)
+    {
+        gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
+    }
 }
 
 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
index 8f596304629b040b187576636d7bf997cdaf5a78..b1cc07b51f9b656dcd9d46f6575537638ae33d93 100644 (file)
@@ -114,7 +114,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU
  * the patterns here are the same as elsewhere in this header.
  *
  *  param[in]    mdlog        log file to write to
- *  param[in]    rank         MPI rank of this process (for error output)
  * \param[inout] deviceInfo   device info of the GPU to initialize
  *
  * Issues a fatal error for any critical errors that occur during
@@ -122,7 +121,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU
  */
 GPU_FUNC_QUALIFIER
 void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
-              int GPU_FUNC_ARGUMENT(rank),
               gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
 
 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
@@ -130,15 +128,14 @@ void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
  * The context is explicitly destroyed and therefore all data uploaded to the GPU
  * is lost. This should only be called when none of this data is required anymore.
  *
+ * Calls gmx_warning upon errors.
+ *
  * \param[in]  deviceInfo   device info of the GPU to clean up for
- * \param[out] result_str   the message related to the error that occurred
- *                          during the initialization (if there was any).
  *
  * \returns                 true if no error occurs during the freeing.
  */
 CUDA_FUNC_QUALIFIER
-gmx_bool free_cuda_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo),
-                       char *CUDA_FUNC_ARGUMENT(result_str)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
+void free_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo)) CUDA_FUNC_TERM
 
 /*! \brief Return a pointer to the device info for \c deviceId
  *
index 5195baccc5d97ffcede987b83d7eb21aa52d1bf4..c121f813b3904ed3870f1307c4acbd9682e92fc3 100644 (file)
@@ -401,7 +401,6 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
 
 //! This function is documented in the header file
 void init_gpu(const gmx::MDLogger               & /*mdlog*/,
-              int                               /* rank */,
               gmx_device_info_t                *deviceInfo)
 {
     assert(deviceInfo);
index 87d4fe94961a048b1ad5bbd594fbac7285cf654b..2d83e17a252b8e66234483d3d2f65fa0732f01ed 100644 (file)
@@ -216,7 +216,6 @@ void do_force_lowlevel(t_forcerec   *fr,
 /* Call all the force routines */
 
 void free_gpu_resources(const t_forcerec            *fr,
-                        const t_commrec             *cr,
-                        const gmx_device_info_t     *deviceInfo);
+                        const t_commrec             *cr);
 
 #endif
index 05c59c2f2937752d83b48a44dfdab5bb5191519e..0d9cfb49c7412291e6695655c50b1d10d95697fe 100644 (file)
@@ -2134,19 +2134,15 @@ init_interaction_const(FILE                       *fp,
     *interaction_const = ic;
 }
 
-/* TODO deviceInfo should be logically const, but currently
- * init_gpu modifies it to set up NVML support. This could
- * happen during the detection phase, and deviceInfo could
- * the become const. */
-static void init_nb_verlet(const gmx::MDLogger &mdlog,
-                           nonbonded_verlet_t **nb_verlet,
-                           gmx_bool             bFEP_NonBonded,
-                           const t_inputrec    *ir,
-                           const t_forcerec    *fr,
-                           const t_commrec     *cr,
-                           gmx_device_info_t   *deviceInfo,
-                           const gmx_mtop_t    *mtop,
-                           matrix               box)
+static void init_nb_verlet(const gmx::MDLogger     &mdlog,
+                           nonbonded_verlet_t     **nb_verlet,
+                           gmx_bool                 bFEP_NonBonded,
+                           const t_inputrec        *ir,
+                           const t_forcerec        *fr,
+                           const t_commrec         *cr,
+                           const gmx_device_info_t *deviceInfo,
+                           const gmx_mtop_t        *mtop,
+                           matrix                   box)
 {
     nonbonded_verlet_t *nbv;
     char               *env;
@@ -2161,12 +2157,6 @@ static void init_nb_verlet(const gmx::MDLogger &mdlog,
 
     GMX_RELEASE_ASSERT(!(nbv->emulateGpu == EmulateGpuNonbonded::Yes && nbv->bUseGPU), "When GPU emulation is active, there cannot be a GPU assignment");
 
-    if (nbv->bUseGPU)
-    {
-        /* Use the assigned GPU. */
-        init_gpu(mdlog, cr->nodeid, deviceInfo);
-    }
-
     nbv->nbs             = nullptr;
     nbv->min_ci_balanced = 0;
 
@@ -2319,20 +2309,20 @@ gmx_bool usingGpu(nonbonded_verlet_t *nbv)
     return nbv != nullptr && nbv->bUseGPU;
 }
 
-void init_forcerec(FILE                *fp,
-                   const gmx::MDLogger &mdlog,
-                   t_forcerec          *fr,
-                   t_fcdata            *fcd,
-                   const t_inputrec    *ir,
-                   const gmx_mtop_t    *mtop,
-                   const t_commrec     *cr,
-                   matrix               box,
-                   const char          *tabfn,
-                   const char          *tabpfn,
-                   const t_filenm      *tabbfnm,
-                   gmx_device_info_t   *deviceInfo,
-                   gmx_bool             bNoSolvOpt,
-                   real                 print_force)
+void init_forcerec(FILE                    *fp,
+                   const gmx::MDLogger     &mdlog,
+                   t_forcerec              *fr,
+                   t_fcdata                *fcd,
+                   const t_inputrec        *ir,
+                   const gmx_mtop_t        *mtop,
+                   const t_commrec         *cr,
+                   matrix                   box,
+                   const char              *tabfn,
+                   const char              *tabpfn,
+                   const t_filenm          *tabbfnm,
+                   const gmx_device_info_t *deviceInfo,
+                   gmx_bool                 bNoSolvOpt,
+                   real                     print_force)
 {
     int            i, m, negp_pp, negptable, egi, egj;
     real           rtab;
@@ -3156,27 +3146,26 @@ void init_forcerec(FILE                *fp,
     }
 }
 
-/* Frees GPU memory and destroys the GPU context.
+/* Frees GPU memory and sets a tMPI node barrier.
  *
  * Note that this function needs to be called even if GPUs are not used
  * in this run because the PME ranks have no knowledge of whether GPUs
  * are used or not, but all ranks need to enter the barrier below.
+ * \todo Remove physical node barrier from this function after making sure
+ * that it's not needed anymore (with a shared GPU run).
  */
 void free_gpu_resources(const t_forcerec        *fr,
-                        const t_commrec         *cr,
-                        const gmx_device_info_t *deviceInfo)
+                        const t_commrec         *cr)
 {
-    gmx_bool bIsPPrankUsingGPU;
-    char     gpu_err_str[STRLEN];
+    bool isPPrankUsingGPU = fr && fr->nbv && fr->nbv->bUseGPU;
 
-    bIsPPrankUsingGPU = thisRankHasDuty(cr, DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+    /* stop the GPU profiler (only CUDA) */
+    stopGpuProfiler();
 
-    if (bIsPPrankUsingGPU)
+    if (isPPrankUsingGPU)
     {
         /* free nbnxn data in GPU memory */
         nbnxn_gpu_free(fr->nbv->gpu_nbv);
-        /* stop the GPU profiler (only CUDA) */
-        stopGpuProfiler();
     }
 
     /* With tMPI we need to wait for all ranks to finish deallocation before
@@ -3189,20 +3178,8 @@ void free_gpu_resources(const t_forcerec        *fr,
      * Note: it is safe to not call the barrier on the ranks which do not use GPU,
      * but it is easier and more futureproof to call it on the whole node.
      */
-#if GMX_THREAD_MPI
-    if (PAR(cr) || MULTISIM(cr))
+    if (GMX_THREAD_MPI && (PAR(cr) || MULTISIM(cr)))
     {
         gmx_barrier_physical_node(cr);
     }
-#endif  /* GMX_THREAD_MPI */
-
-    if (bIsPPrankUsingGPU)
-    {
-        /* uninitialize GPU (by destroying the context) */
-        if (!free_cuda_gpu(deviceInfo, gpu_err_str))
-        {
-            gmx_warning("On rank %d failed to free GPU #%d: %s",
-                        cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
-        }
-    }
 }
index 5dd6a73fca4cfed23357d5271bdd3ddceea521a3..9906a755a118d0eaf068a11dcff4a947004dfed6 100644 (file)
@@ -111,20 +111,20 @@ void init_interaction_const_tables(FILE                   *fp,
  * \param[in]  bNoSolvOpt  Do not use solvent optimization
  * \param[in]  print_force Print forces for atoms with force >= print_force
  */
-void init_forcerec(FILE                   *fplog,
-                   const gmx::MDLogger    &mdlog,
-                   t_forcerec             *fr,
-                   t_fcdata               *fcd,
-                   const t_inputrec       *ir,
-                   const gmx_mtop_t       *mtop,
-                   const t_commrec        *cr,
-                   matrix                  box,
-                   const char             *tabfn,
-                   const char             *tabpfn,
-                   const t_filenm         *tabbfnm,
-                   gmx_device_info_t      *deviceInfo,
-                   gmx_bool                bNoSolvOpt,
-                   real                    print_force);
+void init_forcerec(FILE                    *fplog,
+                   const gmx::MDLogger     &mdlog,
+                   t_forcerec              *fr,
+                   t_fcdata                *fcd,
+                   const t_inputrec        *ir,
+                   const gmx_mtop_t        *mtop,
+                   const t_commrec         *cr,
+                   matrix                   box,
+                   const char              *tabfn,
+                   const char              *tabpfn,
+                   const t_filenm          *tabbfnm,
+                   const gmx_device_info_t *deviceInfo,
+                   gmx_bool                 bNoSolvOpt,
+                   real                     print_force);
 
 /*! \brief Divide exclusions over threads
  *
index 294230543ebb3ff69430fd71983ba6cfb5139ecc..62123e082405d0935ecf228398de2552584a02c2 100644 (file)
@@ -118,6 +118,13 @@ runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
                   const t_commrec            *cr,
                   const std::vector<GpuTask> &gpuTasksOnThisRank);
 
+//! Function for whether the task of \c mapping has value \c TaskType.
+template<GpuTask TaskType>
+bool hasTaskType(const GpuTaskMapping &mapping)
+{
+    return mapping.task_ == TaskType;
+}
+
 } // namespace
 
 #endif
index 53c55ff09fe9d25ef69b513ab9e31902911f42d8..b266ecb41547f674dba7523d96b1dc15fe5255b3 100644 (file)
@@ -84,6 +84,7 @@
 #include "gromacs/mdlib/mdrun.h"
 #include "gromacs/mdlib/minimize.h"
 #include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
 #include "gromacs/mdlib/nbnxn_search.h"
 #include "gromacs/mdlib/nbnxn_tuning.h"
 #include "gromacs/mdlib/qmmm.h"
@@ -1009,22 +1010,25 @@ int Mdrunner::mdrunner()
                                        cr, mdlog);
 
     gmx_device_info_t *nonbondedDeviceInfo = nullptr;
-    int                nonbondedDeviceId   = -1;
+
     if (thisRankHasDuty(cr, DUTY_PP))
     {
-        if (!gpuTaskAssignment.empty())
+        // This works because only one task of each type is currently permitted.
+        auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+                                             hasTaskType<GpuTask::Nonbonded>);
+        if (nbGpuTaskMapping != gpuTaskAssignment.end())
         {
-            GMX_RELEASE_ASSERT(gpuTaskAssignment.size() == 1, "A valid GPU assignment can only have one task per rank");
-            GMX_RELEASE_ASSERT(gpuTaskAssignment[0].task_ == gmx::GpuTask::Nonbonded, "A valid GPU assignment can only include short-ranged tasks");
-            nonbondedDeviceId   = gpuTaskAssignment[0].deviceId_;
+            int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
             nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
-        }
-    }
+            init_gpu(mdlog, nonbondedDeviceInfo);
 
-    if (DOMAINDECOMP(cr))
-    {
-        /* When we share GPUs over ranks, we need to know this for the DLB */
-        dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            if (DOMAINDECOMP(cr))
+            {
+                /* When we share GPUs over ranks, we need to know this for the DLB */
+                dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            }
+
+        }
     }
 
     /* getting number of PP/PME threads
@@ -1306,8 +1310,16 @@ int Mdrunner::mdrunner()
         pmedata = nullptr;
     }
 
-    /* Free GPU memory and context */
-    free_gpu_resources(fr, cr, nonbondedDeviceInfo);
+    // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+    // before we destroy the GPU context(s) in free_gpu_resources().
+    // Pinned buffers are associated with contexts in CUDA.
+    // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+    mdAtoms.reset(nullptr);
+    globalState.reset(nullptr);
+
+    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+    free_gpu_resources(fr, cr);
+    free_gpu(nonbondedDeviceInfo);
 
     if (doMembed)
     {