Merge branch release-2018 into release-2019
[alexxy/gromacs.git] / src / gromacs / ewald / pme.cpp
index 8b861b3163a6ae1f0f99593f468acc665f309529..717eadbfad9453d3e300af6466410df170772e1c 100644 (file)
 
 #include "config.h"
 
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
+#include <cassert>
 #include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 
 #include <algorithm>
+#include <list>
 
+#include "gromacs/domdec/domdec.h"
 #include "gromacs/ewald/ewald-utils.h"
 #include "gromacs/fft/parallel_3dfft.h"
 #include "gromacs/fileio/pdbio.h"
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/topology.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "pme-spline-work.h"
 #include "pme-spread.h"
 
+/*! \brief Help build a descriptive message in \c error if there are
+ * \c errorReasons why PME on GPU is not supported.
+ *
+ * \returns Whether the lack of errorReasons indicate there is support. */
+static bool
+addMessageIfNotSupported(const std::list<std::string> &errorReasons,
+                         std::string                  *error)
+{
+    bool foundErrorReasons = errorReasons.empty();
+    if (!foundErrorReasons && error)
+    {
+        std::string regressionTestMarker = "PME GPU does not support";
+        // this prefix is tested for in the regression tests script gmxtest.pl
+        *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+    }
+    return foundErrorReasons;
+}
+
+bool pme_gpu_supports_build(std::string *error)
+{
+    std::list<std::string> errorReasons;
+    if (GMX_DOUBLE)
+    {
+        errorReasons.emplace_back("double precision");
+    }
+    if (GMX_GPU == GMX_GPU_NONE)
+    {
+        errorReasons.emplace_back("non-GPU build of GROMACS");
+    }
+    return addMessageIfNotSupported(errorReasons, error);
+}
+
+bool pme_gpu_supports_input(const t_inputrec &ir, const gmx_mtop_t &mtop, std::string *error)
+{
+    std::list<std::string> errorReasons;
+    if (!EEL_PME(ir.coulombtype))
+    {
+        errorReasons.emplace_back("systems that do not use PME for electrostatics");
+    }
+    if (ir.pme_order != 4)
+    {
+        errorReasons.emplace_back("interpolation orders other than 4");
+    }
+    if (ir.efep != efepNO)
+    {
+        if (gmx_mtop_has_perturbed_charges(mtop))
+        {
+            errorReasons.emplace_back("free energy calculations with perturbed charges (multiple grids)");
+        }
+    }
+    if (EVDW_PME(ir.vdwtype))
+    {
+        errorReasons.emplace_back("Lennard-Jones PME");
+    }
+    if (ir.cutoff_scheme == ecutsGROUP)
+    {
+        errorReasons.emplace_back("group cutoff scheme");
+    }
+    if (!EI_DYNAMICS(ir.eI))
+    {
+        errorReasons.emplace_back("not a dynamical integrator");
+    }
+    return addMessageIfNotSupported(errorReasons, error);
+}
+
+/*! \brief \libinternal
+ * Finds out if PME with given inputs is possible to run on GPU.
+ * This function is an internal final check, validating the whole PME structure on creation,
+ * but it still duplicates the preliminary checks from the above (externally exposed) pme_gpu_supports_input() - just in case.
+ *
+ * \param[in]  pme          The PME structure.
+ * \param[out] error        The error message if the input is not supported on GPU.
+ * \returns                 True if this PME input is possible to run on GPU, false otherwise.
+ */
+static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error)
+{
+    std::list<std::string> errorReasons;
+    if (pme->nnodes != 1)
+    {
+        errorReasons.emplace_back("PME decomposition");
+    }
+    if (pme->pme_order != 4)
+    {
+        errorReasons.emplace_back("interpolation orders other than 4");
+    }
+    if (pme->bFEP)
+    {
+        errorReasons.emplace_back("free energy calculations (multiple grids)");
+    }
+    if (pme->doLJ)
+    {
+        errorReasons.emplace_back("Lennard-Jones PME");
+    }
+    if (GMX_DOUBLE)
+    {
+        errorReasons.emplace_back("double precision");
+    }
+    if (GMX_GPU == GMX_GPU_NONE)
+    {
+        errorReasons.emplace_back("non-GPU build of GROMACS");
+    }
+
+    return addMessageIfNotSupported(errorReasons, error);
+}
+
+PmeRunMode pme_run_mode(const gmx_pme_t *pme)
+{
+    GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer");
+    return pme->runMode;
+}
+
+gmx::PinningPolicy pme_get_pinning_policy()
+{
+    return gmx::PinningPolicy::PinnedIfSupported;
+}
+
 /*! \brief Number of bytes in a cache line.
  *
  * Must also be a multiple of the SIMD and SIMD4 register size, to
@@ -177,7 +295,7 @@ static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
 
     /* pme_solve is roughly double the cost of an fft */
 
-    return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
+    return (n1 + n2 + 3*n3)/static_cast<double>(6*pme->nkx*pme->nky*pme->nkz);
 }
 
 /*! \brief Initialize atom communication data structure */
@@ -298,10 +416,7 @@ init_overlap_comm(pme_overlap_t *  ol,
                   int              ndata,
                   int              commplainsize)
 {
-    int              b, i;
-    pme_grid_comm_t *pgc;
     gmx_bool         bCont;
-    int              fft_start, fft_end, send_index1, recv_index1;
 #if GMX_MPI
     MPI_Status       stat;
 
@@ -318,13 +433,13 @@ init_overlap_comm(pme_overlap_t *  ol,
      * that belong to higher nodes (modulo nnodes)
      */
 
-    snew(ol->s2g0, ol->nnodes+1);
-    snew(ol->s2g1, ol->nnodes);
+    ol->s2g0.resize(ol->nnodes + 1);
+    ol->s2g1.resize(ol->nnodes);
     if (debug)
     {
         fprintf(debug, "PME slab boundaries:");
     }
-    for (i = 0; i < nnodes; i++)
+    for (int i = 0; i < nnodes; i++)
     {
         /* s2g0 the local interpolation grid start.
          * s2g1 the local interpolation grid end.
@@ -332,8 +447,8 @@ init_overlap_comm(pme_overlap_t *  ol,
          * spatially uniform along dimension x or y, we need to round
          * s2g0 down and s2g1 up.
          */
-        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
-        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
+        ol->s2g0[i] = (i * ndata + 0) / nnodes;
+        ol->s2g1[i] = ((i + 1) * ndata + nnodes - 1) / nnodes + norder - 1;
 
         if (debug)
         {
@@ -347,55 +462,50 @@ init_overlap_comm(pme_overlap_t *  ol,
     }
 
     /* Determine with how many nodes we need to communicate the grid overlap */
-    b = 0;
+    int testRankCount = 0;
     do
     {
-        b++;
+        testRankCount++;
         bCont = FALSE;
-        for (i = 0; i < nnodes; i++)
+        for (int i = 0; i < nnodes; i++)
         {
-            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
-                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
+            if ((i + testRankCount <  nnodes && ol->s2g1[i] > ol->s2g0[i + testRankCount]) ||
+                (i + testRankCount >= nnodes && ol->s2g1[i] > ol->s2g0[i + testRankCount - nnodes] + ndata))
             {
                 bCont = TRUE;
             }
         }
     }
-    while (bCont && b < nnodes);
-    ol->noverlap_nodes = b - 1;
-
-    snew(ol->send_id, ol->noverlap_nodes);
-    snew(ol->recv_id, ol->noverlap_nodes);
-    for (b = 0; b < ol->noverlap_nodes; b++)
-    {
-        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
-        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
-    }
-    snew(ol->comm_data, ol->noverlap_nodes);
+    while (bCont && testRankCount < nnodes);
 
+    ol->comm_data.resize(testRankCount - 1);
     ol->send_size = 0;
-    for (b = 0; b < ol->noverlap_nodes; b++)
+
+    for (size_t b = 0; b < ol->comm_data.size(); b++)
     {
-        pgc = &ol->comm_data[b];
+        pme_grid_comm_t *pgc = &ol->comm_data[b];
+
         /* Send */
-        fft_start        = ol->s2g0[ol->send_id[b]];
-        fft_end          = ol->s2g0[ol->send_id[b]+1];
-        if (ol->send_id[b] < nodeid)
+        pgc->send_id = (ol->nodeid + (b + 1)) % ol->nnodes;
+        int fft_start = ol->s2g0[pgc->send_id];
+        int fft_end   = ol->s2g0[pgc->send_id + 1];
+        if (pgc->send_id < nodeid)
         {
             fft_start += ndata;
             fft_end   += ndata;
         }
-        send_index1       = ol->s2g1[nodeid];
-        send_index1       = std::min(send_index1, fft_end);
-        pgc->send_index0  = fft_start;
-        pgc->send_nindex  = std::max(0, send_index1 - pgc->send_index0);
-        ol->send_size    += pgc->send_nindex;
+        int send_index1  = ol->s2g1[nodeid];
+        send_index1      = std::min(send_index1, fft_end);
+        pgc->send_index0 = fft_start;
+        pgc->send_nindex = std::max(0, send_index1 - pgc->send_index0);
+        ol->send_size   += pgc->send_nindex;
 
         /* We always start receiving to the first index of our slab */
+        pgc->recv_id     = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
         fft_start        = ol->s2g0[ol->nodeid];
-        fft_end          = ol->s2g0[ol->nodeid+1];
-        recv_index1      = ol->s2g1[ol->recv_id[b]];
-        if (ol->recv_id[b] > nodeid)
+        fft_end          = ol->s2g0[ol->nodeid + 1];
+        int recv_index1  = ol->s2g1[pgc->recv_id];
+        if (pgc->recv_id > nodeid)
         {
             recv_index1 -= ndata;
         }
@@ -406,30 +516,17 @@ init_overlap_comm(pme_overlap_t *  ol,
 
 #if GMX_MPI
     /* Communicate the buffer sizes to receive */
-    for (b = 0; b < ol->noverlap_nodes; b++)
+    for (size_t b = 0; b < ol->comm_data.size(); b++)
     {
-        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
-                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
+        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->comm_data[b].send_id, b,
+                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->comm_data[b].recv_id, b,
                      ol->mpi_comm, &stat);
     }
 #endif
 
     /* For non-divisible grid we need pme_order iso pme_order-1 */
-    snew(ol->sendbuf, norder*commplainsize);
-    snew(ol->recvbuf, norder*commplainsize);
-}
-
-/*! \brief Destroy data structure for communication */
-static void
-destroy_overlap_comm(const pme_overlap_t *ol)
-{
-    sfree(ol->s2g0);
-    sfree(ol->s2g1);
-    sfree(ol->send_id);
-    sfree(ol->recv_id);
-    sfree(ol->comm_data);
-    sfree(ol->sendbuf);
-    sfree(ol->recvbuf);
+    ol->sendbuf.resize(norder * commplainsize);
+    ol->recvbuf.resize(norder * commplainsize);
 }
 
 int minimalPmeGridSize(int pmeOrder)
@@ -451,7 +548,7 @@ int minimalPmeGridSize(int pmeOrder)
 
 bool gmx_pme_check_restrictions(int pme_order,
                                 int nkx, int nky, int nkz,
-                                int nnodes_major,
+                                int numPmeDomainsAlongX,
                                 bool useThreads,
                                 bool errorsAreFatal)
 {
@@ -465,7 +562,7 @@ bool gmx_pme_check_restrictions(int pme_order,
         std::string message = gmx::formatString(
                     "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
                     pme_order, PME_ORDER_MAX);
-        GMX_THROW(InconsistentInputError(message));
+        GMX_THROW(gmx::InconsistentInputError(message));
     }
 
     const int minGridSize = minimalPmeGridSize(pme_order);
@@ -480,21 +577,21 @@ bool gmx_pme_check_restrictions(int pme_order,
         std::string message = gmx::formatString(
                     "The PME grid sizes need to be >= 2*(pme_order-1) (%d)",
                     minGridSize);
-        GMX_THROW(InconsistentInputError(message));
+        GMX_THROW(gmx::InconsistentInputError(message));
     }
 
     /* Check for a limitation of the (current) sum_fftgrid_dd code.
      * We only allow multiple communication pulses in dim 1, not in dim 0.
      */
-    if (useThreads && (nkx < nnodes_major*pme_order &&
-                       nkx != nnodes_major*(pme_order - 1)))
+    if (useThreads && (nkx < numPmeDomainsAlongX*pme_order &&
+                       nkx != numPmeDomainsAlongX*(pme_order - 1)))
     {
         if (!errorsAreFatal)
         {
             return false;
         }
         gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
-                  nkx/(double)nnodes_major, pme_order);
+                  nkx/static_cast<double>(numPmeDomainsAlongX), pme_order);
     }
 
     return true;
@@ -506,21 +603,21 @@ static int div_round_up(int enumerator, int denominator)
     return (enumerator + denominator - 1)/denominator;
 }
 
-gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
-                        int                  nnodes_major,
-                        int                  nnodes_minor,
-                        const t_inputrec    *ir,
-                        int                  homenr,
-                        gmx_bool             bFreeEnergy_q,
-                        gmx_bool             bFreeEnergy_lj,
-                        gmx_bool             bReproducible,
-                        real                 ewaldcoeff_q,
-                        real                 ewaldcoeff_lj,
-                        int                  nthread,
-                        PmeRunMode           runMode,
-                        PmeGpu              *pmeGpu,
-                        gmx_device_info_t   *gpuInfo,
-                        const gmx::MDLogger  & /*mdlog*/)
+gmx_pme_t *gmx_pme_init(const t_commrec         *cr,
+                        const NumPmeDomains     &numPmeDomains,
+                        const t_inputrec        *ir,
+                        int                      homenr,
+                        gmx_bool                 bFreeEnergy_q,
+                        gmx_bool                 bFreeEnergy_lj,
+                        gmx_bool                 bReproducible,
+                        real                     ewaldcoeff_q,
+                        real                     ewaldcoeff_lj,
+                        int                      nthread,
+                        PmeRunMode               runMode,
+                        PmeGpu                  *pmeGpu,
+                        const gmx_device_info_t *gpuInfo,
+                        PmeGpuProgramHandle      pmeGpuProgram,
+                        const gmx::MDLogger      & /*mdlog*/)
 {
     int               use_threads, sum_use_threads, i;
     ivec              ndata;
@@ -530,7 +627,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
         fprintf(debug, "Creating PME data structures.\n");
     }
 
-    unique_cptr<gmx_pme_t, gmx_pme_destroy> pme(new gmx_pme_t());
+    gmx::unique_cptr<gmx_pme_t, gmx_pme_destroy> pme(new gmx_pme_t());
 
     pme->sum_qgrid_tmp       = nullptr;
     pme->sum_qgrid_dd_tmp    = nullptr;
@@ -540,17 +637,17 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
     pme->nnodes              = 1;
     pme->bPPnode             = TRUE;
 
-    pme->nnodes_major        = nnodes_major;
-    pme->nnodes_minor        = nnodes_minor;
+    pme->nnodes_major        = numPmeDomains.x;
+    pme->nnodes_minor        = numPmeDomains.y;
 
 #if GMX_MPI
-    if (nnodes_major*nnodes_minor > 1)
+    if (numPmeDomains.x*numPmeDomains.y > 1)
     {
         pme->mpi_comm = cr->mpi_comm_mygroup;
 
         MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
-        if (pme->nnodes != nnodes_major*nnodes_minor)
+        if (pme->nnodes != numPmeDomains.x*numPmeDomains.y)
         {
             gmx_incons("PME rank count mismatch");
         }
@@ -576,7 +673,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
     }
     else
     {
-        if (nnodes_minor == 1)
+        if (numPmeDomains.y == 1)
         {
 #if GMX_MPI
             pme->mpi_comm_d[0] = pme->mpi_comm;
@@ -587,7 +684,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
             pme->nodeid_minor = 0;
 
         }
-        else if (nnodes_major == 1)
+        else if (numPmeDomains.x == 1)
         {
 #if GMX_MPI
             pme->mpi_comm_d[0] = MPI_COMM_NULL;
@@ -599,16 +696,16 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
         }
         else
         {
-            if (pme->nnodes % nnodes_major != 0)
+            if (pme->nnodes % numPmeDomains.x != 0)
             {
-                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
+                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of domains along x");
             }
             pme->ndecompdim = 2;
 
 #if GMX_MPI
-            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
+            MPI_Comm_split(pme->mpi_comm, pme->nodeid % numPmeDomains.y,
                            pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
-            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
+            MPI_Comm_split(pme->mpi_comm, pme->nodeid/numPmeDomains.y,
                            pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
 
             MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
@@ -706,7 +803,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
                     "\n",
-                    (int)((imbal-1)*100 + 0.5),
+                    gmx::roundToInt((imbal-1)*100),
                     pme->nkx, pme->nky, pme->nnodes_major,
                     pme->nky, pme->nkz, pme->nnodes_minor);
         }
@@ -740,7 +837,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
     /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
      * Note that gmx_pme_check_restrictions checked for this already.
      */
-    if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
+    if (pme->bUseThreads && (pme->overlap[0].comm_data.size() > 1))
     {
         gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
     }
@@ -816,7 +913,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
                           pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
                           pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
             /* This routine will allocate the grid data to fit the FFTs */
-            const auto allocateRealGridForGpu = (pme->runMode == PmeRunMode::Mixed) ? gmx::PinningPolicy::CanBePinned : gmx::PinningPolicy::CannotBePinned;
+            const auto allocateRealGridForGpu = (pme->runMode == PmeRunMode::Mixed) ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned;
             gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
                                     &pme->fftgrid[i], &pme->cfftgrid[i],
                                     pme->mpi_comm_d,
@@ -837,7 +934,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
     }
 
     /* Use atc[0] for spreading */
-    init_atomcomm(pme.get(), &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
+    init_atomcomm(pme.get(), &pme->atc[0], numPmeDomains.x > 1 ? 0 : 1, TRUE);
     if (pme->ndecompdim >= 2)
     {
         init_atomcomm(pme.get(), &pme->atc[1], 1, FALSE);
@@ -853,7 +950,21 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
     pme->lb_buf2       = nullptr;
     pme->lb_buf_nalloc = 0;
 
-    pme_gpu_reinit(pme.get(), gpuInfo);
+    if (pme_gpu_active(pme.get()))
+    {
+        if (!pme->gpu)
+        {
+            // Initial check of validity of the data
+            std::string errorString;
+            bool        canRunOnGpu = pme_gpu_check_restrictions(pme.get(), &errorString);
+            if (!canRunOnGpu)
+            {
+                GMX_THROW(gmx::NotImplementedError(errorString));
+            }
+        }
+
+        pme_gpu_reinit(pme.get(), gpuInfo, pmeGpuProgram);
+    }
 
     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
 
@@ -862,7 +973,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
 }
 
 void gmx_pme_reinit(struct gmx_pme_t **pmedata,
-                    t_commrec *        cr,
+                    const t_commrec   *cr,
                     struct gmx_pme_t * pme_src,
                     const t_inputrec * ir,
                     const ivec         grid_size,
@@ -902,9 +1013,10 @@ void gmx_pme_reinit(struct gmx_pme_t **pmedata,
         // so we don't expect the actual logging.
         // TODO: when PME is an object, it should take reference to mdlog on construction and save it.
         GMX_ASSERT(pmedata, "Invalid PME pointer");
-        *pmedata = gmx_pme_init(cr, pme_src->nnodes_major, pme_src->nnodes_minor,
+        NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
+        *pmedata = gmx_pme_init(cr, numPmeDomains,
                                 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj,
-                                pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, dummyLogger);
+                                pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, nullptr, dummyLogger);
         //TODO this is mostly passing around current values
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
@@ -923,7 +1035,7 @@ void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V
     {
         gmx_incons("gmx_pme_calc_energy called in parallel");
     }
-    if (pme->bFEP_q > 1)
+    if (pme->bFEP_q)
     {
         gmx_incons("gmx_pme_calc_energy with free energy");
     }
@@ -953,7 +1065,7 @@ void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V
 
 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
 static void
-calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
+calc_initial_lb_coeffs(struct gmx_pme_t *pme, const real *local_c6, const real *local_sigma)
 {
     int  i;
     for (i = 0; i < pme->atc[0].n; ++i)
@@ -968,7 +1080,7 @@ calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
 
 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
 static void
-calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
+calc_next_lb_coeffs(struct gmx_pme_t *pme, const real *local_sigma)
 {
     int  i;
 
@@ -984,9 +1096,9 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                real chargeA[],  real chargeB[],
                real c6A[],      real c6B[],
                real sigmaA[],   real sigmaB[],
-               matrix box,      t_commrec *cr,
+               matrix box,      const t_commrec *cr,
                int  maxshift_x, int maxshift_y,
-               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
+               t_nrnb *nrnb,    gmx_wallcycle *wcycle,
                matrix vir_q,    matrix vir_lj,
                real *energy_q,  real *energy_lj,
                real lambda_q,   real lambda_lj,
@@ -1013,9 +1125,9 @@ int gmx_pme_do(struct gmx_pme_t *pme,
     gmx_bool             bFirst, bDoSplines;
     int                  fep_state;
     int                  fep_states_lj           = pme->bFEP_lj ? 2 : 1;
-    const gmx_bool       bCalcEnerVir            = flags & GMX_PME_CALC_ENER_VIR;
-    const gmx_bool       bBackFFT                = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
-    const gmx_bool       bCalcF                  = flags & GMX_PME_CALC_F;
+    const gmx_bool       bCalcEnerVir            = (flags & GMX_PME_CALC_ENER_VIR) != 0;
+    const gmx_bool       bBackFFT                = (flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT)) != 0;
+    const gmx_bool       bCalcF                  = (flags & GMX_PME_CALC_F) != 0;
 
     /* We could be passing lambda!=1 while no q or LJ is actually perturbed */
     if (!pme->bFEP_q)
@@ -1118,13 +1230,12 @@ int gmx_pme_do(struct gmx_pme_t *pme,
         {
             fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
                     cr->nnodes, cr->nodeid);
-            fprintf(debug, "Grid = %p\n", (void*)grid);
+            fprintf(debug, "Grid = %p\n", static_cast<void*>(grid));
             if (grid == nullptr)
             {
                 gmx_fatal(FARGS, "No grid!");
             }
         }
-        where();
 
         if (pme->nnodes == 1)
         {
@@ -1134,7 +1245,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
         {
             wallcycle_start(wcycle, ewcPME_REDISTXF);
             do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
-            where();
 
             wallcycle_stop(wcycle, ewcPME_REDISTXF);
         }
@@ -1168,7 +1278,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                 if (pme->nnodes > 1)
                 {
                     gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
-                    where();
                 }
 #endif
 
@@ -1205,7 +1314,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                     {
                         wallcycle_stop(wcycle, ewcPME_FFT);
                     }
-                    where();
 
                     /* solve in k-space for our local cells */
                     if (thread == 0)
@@ -1232,7 +1340,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                     if (thread == 0)
                     {
                         wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
-                        where();
                         inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                     }
                 }
@@ -1242,7 +1349,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                     /* do 3d-invfft */
                     if (thread == 0)
                     {
-                        where();
                         wallcycle_start(wcycle, ewcPME_FFT);
                     }
                     gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
@@ -1251,7 +1357,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                     {
                         wallcycle_stop(wcycle, ewcPME_FFT);
 
-                        where();
 
                         if (pme->nodeid == 0)
                         {
@@ -1283,7 +1388,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
             }
 #endif
-            where();
 
             unwrap_periodic_pmegrid(pme, grid);
         }
@@ -1292,7 +1396,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
         {
             /* interpolate forces for our local atoms */
 
-            where();
 
             /* If we are running without parallelization,
              * atc->f is the actual force array, not a buffer,
@@ -1312,7 +1415,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             }
 
-            where();
 
             inc_nrnb(nrnb, eNR_GATHERFBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
@@ -1399,7 +1501,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                 {
                     local_c6[i] = atc->coefficient[i];
                 }
-                where();
 
                 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
                 local_sigma = pme->lb_buf2;
@@ -1407,7 +1508,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                 {
                     local_sigma[i] = atc->coefficient[i];
                 }
-                where();
 
                 wallcycle_stop(wcycle, ewcPME_REDISTXF);
             }
@@ -1422,7 +1522,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                 pfft_setup = pme->pfft_setup[grid_index];
                 calc_next_lb_coeffs(pme, local_sigma);
                 grid = pmegrid->grid.grid;
-                where();
 
                 if (flags & GMX_PME_SPREAD)
                 {
@@ -1445,7 +1544,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                         if (pme->nnodes > 1)
                         {
                             gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
-                            where();
                         }
 #endif
                         copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
@@ -1472,7 +1570,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                             {
                                 wallcycle_stop(wcycle, ewcPME_FFT);
                             }
-                            where();
                         }
                     }
                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
@@ -1501,7 +1598,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                         if (thread == 0)
                         {
                             wallcycle_stop(wcycle, ewcLJPME);
-                            where();
                             inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                         }
                     }
@@ -1529,7 +1625,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                     pfft_setup = pme->pfft_setup[grid_index];
                     grid       = pmegrid->grid.grid;
                     calc_next_lb_coeffs(pme, local_sigma);
-                    where();
 #pragma omp parallel num_threads(pme->nthread) private(thread)
                     {
                         try
@@ -1538,7 +1633,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                             /* do 3d-invfft */
                             if (thread == 0)
                             {
-                                where();
                                 wallcycle_start(wcycle, ewcPME_FFT);
                             }
 
@@ -1548,7 +1642,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                             {
                                 wallcycle_stop(wcycle, ewcPME_FFT);
 
-                                where();
 
                                 if (pme->nodeid == 0)
                                 {
@@ -1571,14 +1664,12 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                         gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
                     }
 #endif
-                    where();
 
                     unwrap_periodic_pmegrid(pme, grid);
 
                     if (bCalcF)
                     {
                         /* interpolate forces for our local atoms */
-                        where();
                         bClearF = (bFirst && PAR(cr));
                         scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
                         scale  *= lb_scale_factor[grid_index-2];
@@ -1595,7 +1686,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
                             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                         }
 
-                        where();
 
                         inc_nrnb(nrnb, eNR_GATHERFBSP,
                                  pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
@@ -1633,7 +1723,6 @@ int gmx_pme_do(struct gmx_pme_t *pme,
 
         wallcycle_stop(wcycle, ewcPME_REDISTXF);
     }
-    where();
 
     if (bCalcEnerVir)
     {
@@ -1740,9 +1829,6 @@ void gmx_pme_destroy(gmx_pme_t *pme)
         sfree(pme->bsp_mod[i]);
     }
 
-    destroy_overlap_comm(&pme->overlap[0]);
-    destroy_overlap_comm(&pme->overlap[1]);
-
     sfree(pme->lb_buf1);
     sfree(pme->lb_buf2);
 
@@ -1757,6 +1843,8 @@ void gmx_pme_destroy(gmx_pme_t *pme)
     sfree(pme->sum_qgrid_tmp);
     sfree(pme->sum_qgrid_dd_tmp);
 
+    destroy_pme_spline_work(pme->spline_work);
+
     if (pme_gpu_active(pme) && pme->gpu)
     {
         pme_gpu_destroy(pme->gpu);