Add class ListedForces
diff --git a/src/gromacs/listed_forces/manage_threading.cpp b/src/gromacs/listed_forces/manage_threading.cpp
index f663cbebf3aac109c61d02b063fabc5974673c2e..f666b5a012ecdb5499348fa29fa67533b631caa9 100644
@@ -40,6 +40,7 @@
  * interactions.
  *
  * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Berk Hess <hess@kth.se>
  * \ingroup module_listed_forces
  */
 #include "gmxpre.h"
 #include <string>
 
 #include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/topology/ifunc.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/smalloc.h"
 
 #include "listed_internal.h"
 #include "utilities.h"
@@ -348,25 +347,21 @@ static void calc_bonded_reduction_mask(int                           natoms,
     GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE,
                "We need at least nthreads bits in the mask");
 
-    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
+    const int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
 
-    if (nblock > f_thread->block_nalloc)
-    {
-        f_thread->block_nalloc = over_alloc_large(nblock);
-        srenew(f_thread->mask, f_thread->block_nalloc);
-        srenew(f_thread->block_index, f_thread->block_nalloc);
-        // NOTE: It seems f_thread->f does not need to be aligned
-        sfree_aligned(f_thread->f);
-        snew_aligned(f_thread->f, f_thread->block_nalloc * reduction_block_size, 128);
-    }
-
-    gmx_bitmask_t* mask = f_thread->mask;
+    f_thread->mask.resize(nblock);
+    f_thread->block_index.resize(nblock);
+    // NOTE: It seems f_thread->f does not need to be aligned
+    f_thread->fBuffer.resize(nblock * reduction_block_size * sizeof(rvec4) / sizeof(real));
+    f_thread->f = reinterpret_cast<rvec4*>(f_thread->fBuffer.data());
 
-    for (int b = 0; b < nblock; b++)
+    for (gmx_bitmask_t& mask : f_thread->mask)
     {
-        bitmask_clear(&mask[b]);
+        bitmask_clear(&mask);
     }
 
+    gmx::ArrayRef<gmx_bitmask_t> mask = f_thread->mask;
+
     for (int ftype = 0; ftype < F_NRE; ftype++)
     {
         if (ftype_is_bonded_potential(ftype))
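The hunk above replaces the manual srenew/snew_aligned bookkeeping with std::vector resizing, keeping f_thread->f as an rvec4 view over a flat buffer of real. A minimal standalone sketch of that pattern, with real, rvec4, and the block constants redefined locally for illustration (they are not the GROMACS definitions):

```cpp
// Vector-backed rvec4 view, as in calc_bonded_reduction_mask above.
// All types and constants here are local stand-ins for illustration.
#include <cstdio>
#include <vector>

using real  = float;
using rvec4 = real[4]; // bonded forces accumulate into 4-wide vectors

constexpr int reduction_block_bits = 5; // assumed: blocks of 2^5 atoms
constexpr int reduction_block_size = 1 << reduction_block_bits;

int main()
{
    const int natoms = 1000;
    // Ceil-divide the atom range into reduction blocks: (1000 + 31) >> 5 = 32.
    const int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    // One flat real buffer owns the storage; resize()/the sized constructor
    // replaces the srenew/snew_aligned calls and value-initializes elements.
    std::vector<real> fBuffer(nblock * reduction_block_size * sizeof(rvec4) / sizeof(real));

    // Reinterpret the storage as an array of rvec4 for the bonded kernels.
    rvec4* f = reinterpret_cast<rvec4*>(fBuffer.data());
    f[0][0] = 1.0F;

    std::printf("nblock=%d, buffer holds %zu reals\n", nblock, fBuffer.size());
}
```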
@@ -404,7 +399,7 @@ static void calc_bonded_reduction_mask(int                           natoms,
 }
 
 void setup_bonded_threading(bonded_threading_t*           bt,
-                            int                           numAtoms,
+                            int                           numAtomsForce,
                             bool                          useGpuForBondeds,
                             const InteractionDefinitions& idef)
 {
@@ -412,6 +407,8 @@ void setup_bonded_threading(bonded_threading_t*           bt,
 
     assert(bt->nthreads >= 1);
 
+    bt->numAtomsForce = numAtomsForce;
+
     /* Divide the bonded interaction over the threads */
     divide_bondeds_over_threads(bt, useGpuForBondeds, idef);
 
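divide_bondeds_over_threads partitions the listed interactions across the OpenMP threads. As a rough illustration of the indexing idea only (GROMACS's actual division also weighs interaction types and the uniform-versus-localized heuristic controlled by max_nthread_uniform later in this diff), an even split of work boundaries looks like:

```cpp
// Illustrative even split of nInteractions items over nthreads; this is not
// GROMACS's real division, which accounts for types and locality.
#include <cstdio>

int main()
{
    const int nInteractions = 103;
    const int nthreads      = 4;
    for (int t = 0; t <= nthreads; t++)
    {
        // Boundary index for thread t; thread t works on [bound(t), bound(t+1)).
        const int bound = (nInteractions * t) / nthreads;
        std::printf("bound[%d] = %d\n", t, bound);
    }
}
```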
@@ -429,7 +426,7 @@ void setup_bonded_threading(bonded_threading_t*           bt,
     {
         try
         {
-            calc_bonded_reduction_mask(numAtoms, bt->f_t[t].get(), idef, t, *bt);
+            calc_bonded_reduction_mask(numAtomsForce, bt->f_t[t].get(), idef, t, *bt);
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
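The GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR guard in this hunk exists because C++ exceptions must not propagate out of an OpenMP parallel region. A plain-C++ sketch of that guard pattern, with a hypothetical perThreadWork standing in for the per-thread call:

```cpp
// Generic form of the per-thread catch-all guard; the GROMACS macro wraps
// the same try/catch-and-abort idea. perThreadWork is hypothetical.
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <stdexcept>

void perThreadWork(int t)
{
    if (t < 0)
    {
        throw std::runtime_error("bad thread index");
    }
}

int main()
{
#pragma omp parallel for
    for (int t = 0; t < 4; t++)
    {
        try
        {
            perThreadWork(t);
        }
        catch (const std::exception& ex)
        {
            // An exception may not escape the OpenMP region, so abort here.
            std::fprintf(stderr, "fatal: %s\n", ex.what());
            std::exit(1);
        }
    }
    std::printf("all threads finished\n");
}
```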
@@ -437,7 +434,7 @@ void setup_bonded_threading(bonded_threading_t*           bt,
     /* Reduce the masks over the threads and determine which blocks
      * we need to reduce over.
      */
-    int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;
+    int nblock_tot = (numAtomsForce + reduction_block_size - 1) >> reduction_block_bits;
     /* Ensure we have sufficient space for all blocks */
     if (static_cast<size_t>(nblock_tot) > bt->block_index.size())
     {
@@ -485,36 +482,29 @@ void setup_bonded_threading(bonded_threading_t*           bt,
     {
         fprintf(debug, "Number of %d atom blocks to reduce: %d\n", reduction_block_size, bt->nblock_used);
         fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
-                ctot * reduction_block_size / static_cast<double>(numAtoms),
+                ctot * reduction_block_size / static_cast<double>(numAtomsForce),
                 ctot / static_cast<double>(bt->nblock_used));
     }
 }
 
-void tear_down_bonded_threading(bonded_threading_t* bt)
-{
-    delete bt;
-}
-
-f_thread_t::f_thread_t(int numEnergyGroups) : grpp(numEnergyGroups)
-{
-    snew(fshift, SHIFTS);
-}
-
-f_thread_t::~f_thread_t()
-{
-    sfree(mask);
-    sfree(fshift);
-    sfree(block_index);
-    sfree_aligned(f);
-}
+f_thread_t::f_thread_t(int numEnergyGroups) : fshift(SHIFTS), grpp(numEnergyGroups) {}
 
-bonded_threading_t::bonded_threading_t(const int numThreads, const int numEnergyGroups) :
+bonded_threading_t::bonded_threading_t(const int numThreads, const int numEnergyGroups, FILE* fplog) :
     nthreads(numThreads),
     nblock_used(0),
     haveBondeds(false),
     workDivision(nthreads),
     foreignLambdaWorkDivision(1)
 {
+    /* These thread-local data structures are used for bondeds only.
+     *
+     * Note that we also use these structures when running single-threaded.
+     * This is because the bonded force buffer uses type rvec4, whereas
+     * the normal force buffer uses type rvec. This leads to a little
+     * reduction overhead, but the speed gain in the bonded calculations
+     * from doing transposeScatterIncr/DecrU with alignment 4 instead of 3
+     * is much larger than the reduction overhead.
+     */
     f_t.resize(numThreads);
 #pragma omp parallel for num_threads(nthreads) schedule(static)
     for (int t = 0; t < nthreads; t++)
@@ -528,41 +518,25 @@ bonded_threading_t::bonded_threading_t(const int numThreads, const int numEnergy
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
-}
-
-bonded_threading_t* init_bonded_threading(FILE* fplog, const int nenergrp)
-{
-    /* These thread local data structures are used for bondeds only.
-     *
-     * Note that we also use there structures when running single-threaded.
-     * This is because the bonded force buffer uses type rvec4, whereas
-     * the normal force buffer is uses type rvec. This leads to a little
-     * reduction overhead, but the speed gain in the bonded calculations
-     * of doing transposeScatterIncr/DecrU with aligment 4 instead of 3
-     * is much larger than the reduction overhead.
-     */
-    bonded_threading_t* bt = new bonded_threading_t(gmx_omp_nthreads_get(emntBonded), nenergrp);
 
     /* The optimal value after which to switch from uniform to localized
      * bonded interaction distribution is 3, 4 or 5 depending on the system
      * and hardware.
      */
-    const int max_nthread_uniform = 4;
+    const int max_nthread_uniform_default = 4;
     char*     ptr;
 
     if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
     {
-        sscanf(ptr, "%d", &bt->max_nthread_uniform);
+        sscanf(ptr, "%d", &max_nthread_uniform);
         if (fplog != nullptr)
         {
             fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
-                    bt->max_nthread_uniform);
+                    max_nthread_uniform);
         }
     }
     else
     {
-        bt->max_nthread_uniform = max_nthread_uniform;
+        max_nthread_uniform = max_nthread_uniform_default;
     }
-
-    return bt;
 }
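For reference, the override logic this hunk moves into the bonded_threading_t constructor follows a common default-plus-getenv pattern; a self-contained sketch (the member variable becomes a local here, and the default keeps its compiled-in value if parsing fails):

```cpp
// Environment-override pattern as in the constructor above: start from a
// compiled-in default, let GMX_BONDED_NTHREAD_UNIFORM override it.
#include <cstdio>
#include <cstdlib>

int main()
{
    const int maxNthreadUniformDefault = 4;
    int       maxNthreadUniform        = maxNthreadUniformDefault;

    if (const char* ptr = std::getenv("GMX_BONDED_NTHREAD_UNIFORM"))
    {
        // sscanf returns the number of converted fields; keep the
        // default if the variable does not parse as an integer.
        if (std::sscanf(ptr, "%d", &maxNthreadUniform) != 1)
        {
            maxNthreadUniform = maxNthreadUniformDefault;
        }
    }
    std::printf("max threads for uniform distribution: %d\n", maxNthreadUniform);
}
```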