From 49630d37fd2076449f8539add2f3c820b74d00ba Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Fri, 27 Feb 2015 10:19:31 +0100
Subject: [PATCH] Separated bonded and Ewald correction threading

The bonded and Ewald correction threading code used the same data
structures. These have now been separated and the bonded threading
data is opaque and defined in listed-internal.h. This change is only
code refactoring, except for the removal of the thread-local Ewald
correction force array and reduction, which were never needed.

Change-Id: I162255107aa9e153a4c8ff41d786867d6fbed444
---
 src/gromacs/legacyheaders/types/forcerec.h   | 21 ++---
 src/gromacs/listed-forces/listed-forces.cpp  | 46 +++++-----
 src/gromacs/listed-forces/listed-internal.h  | 36 +++++++-
 .../listed-forces/manage-threading.cpp       | 91 +++++++++++--------
 src/gromacs/listed-forces/manage-threading.h |  9 +-
 src/gromacs/mdlib/force.cpp                  | 76 ++++++----------
 src/gromacs/mdlib/forcerec-threading.h       | 12 +--
 src/gromacs/mdlib/forcerec.cpp               | 11 ++-
 8 files changed, 166 insertions(+), 136 deletions(-)

diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h
index 33c44a6442..c0d8aa9919 100644
--- a/src/gromacs/legacyheaders/types/forcerec.h
+++ b/src/gromacs/legacyheaders/types/forcerec.h
@@ -60,6 +60,7 @@ extern "C" {
 /* Abstract type for PME that is defined only in the routine that use them. */
 struct gmx_pme_t;
 struct nonbonded_verlet_t;
+struct bonded_threading_t;
 
 /* Structure describing the data in a single table */
 typedef struct
@@ -185,7 +186,7 @@ typedef struct {
 /* Forward declaration of type for managing Ewald tables */
 struct gmx_ewald_tab_t;
 
-typedef struct f_thread_t f_thread_t;
+typedef struct ewald_corr_thread_t ewald_corr_thread_t;
 
 typedef struct {
     interaction_const_t *ic;
@@ -468,18 +469,14 @@ typedef struct {
     real userreal3;
     real userreal4;
 
-    /* Thread local force and energy data */
-    /* FIXME move to bonded_thread_data_t */
-    int         nthreads;
-    int         red_ashift;
-    int         red_nblock;
-    f_thread_t *f_t;
+    /* Pointer to struct for managing threading of bonded force calculation */
+    struct bonded_threading_t *bonded_threading;
 
-    /* Maximum thread count for uniform distribution of bondeds over threads */
-    int bonded_max_nthread_uniform;
-
-    /* Exclusion load distribution over the threads */
-    int *excl_load;
+    /* Ewald correction thread local virial and energy data */
+    int                  nthread_ewc;
+    ewald_corr_thread_t *ewc_t;
+    /* Ewald charge correction load distribution over the threads */
+    int                 *excl_load;
 } t_forcerec;
 
 /* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have
diff --git a/src/gromacs/listed-forces/listed-forces.cpp b/src/gromacs/listed-forces/listed-forces.cpp
index 3f68624add..1fa6afccd2 100644
--- a/src/gromacs/listed-forces/listed-forces.cpp
+++ b/src/gromacs/listed-forces/listed-forces.cpp
@@ -60,13 +60,13 @@
 #include "gromacs/listed-forces/bonded.h"
 #include "gromacs/listed-forces/position-restraints.h"
 #include "gromacs/math/vec.h"
-#include "gromacs/mdlib/forcerec-threading.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/simd/simd.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/smalloc.h"
 
+#include "listed-internal.h"
 #include "pairs.h"
 
 namespace
@@ -405,14 +405,18 @@ void calc_listed(const gmx_multisim_t *ms,
                  t_fcdata *fcd, int *global_atom_index,
                  int force_flags)
{
-    gmx_bool      bCalcEnerVir;
-    int           i;
-    real          dvdl[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
-                                   of lambda, which will be thrown away in the end*/
-    const t_pbc  *pbc_null;
-    int           thread;
+    struct bonded_threading_t *bt;
+    gmx_bool                   bCalcEnerVir;
+    int                        i;
+    /* The dummy array is to have a place to store the dhdl at other values
+       of lambda, which will be thrown away in the end */
+    real                       dvdl[efptNR];
+    const t_pbc               *pbc_null;
+    int                        thread;
+
+    bt = fr->bonded_threading;
 
-    assert(fr->nthreads == idef->nthreads);
+    assert(bt->nthreads == idef->nthreads);
 
     bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
 
@@ -485,8 +489,8 @@ void calc_listed(const gmx_multisim_t *ms,
         }
         wallcycle_sub_start(wcycle, ewcsLISTED);
-#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
-        for (thread = 0; thread < fr->nthreads; thread++)
+#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
+        for (thread = 0; thread < bt->nthreads; thread++)
         {
             int                ftype;
             real              *epot, v;
@@ -505,14 +509,14 @@ void calc_listed(const gmx_multisim_t *ms,
             }
             else
             {
-                zero_thread_forces(&fr->f_t[thread], fr->natoms_force,
-                                   fr->red_nblock, 1<<fr->red_ashift);
-
-                ft     = fr->f_t[thread].f;
-                fshift = fr->f_t[thread].fshift;
-                epot   = fr->f_t[thread].ener;
-                grpp   = &fr->f_t[thread].grpp;
-                dvdlt  = fr->f_t[thread].dvdl;
+                zero_thread_forces(&bt->f_t[thread], fr->natoms_force,
+                                   bt->red_nblock, 1<<bt->red_ashift);
+
+                ft     = bt->f_t[thread].f;
+                fshift = bt->f_t[thread].fshift;
+                epot   = bt->f_t[thread].ener;
+                grpp   = &bt->f_t[thread].grpp;
+                dvdlt  = bt->f_t[thread].dvdl;
             }
             /* Loop over all bonded force types to calculate the bonded forces */
             for (ftype = 0; (ftype < F_NRE); ftype++)
             {
@@ -530,13 +534,13 @@
         }
         wallcycle_sub_stop(wcycle, ewcsLISTED);
 
-        if (fr->nthreads > 1)
+        if (bt->nthreads > 1)
         {
             wallcycle_sub_start(wcycle, ewcsLISTED_BUF_OPS);
             reduce_thread_forces(fr->natoms_force, f, fr->fshift,
                                  enerd->term, &enerd->grpp, dvdl,
-                                 fr->nthreads, fr->f_t,
-                                 fr->red_nblock, 1<<fr->red_ashift,
+                                 bt->nthreads, bt->f_t,
+                                 bt->red_nblock, 1<<bt->red_ashift,
                                  bCalcEnerVir,
                                  force_flags & GMX_FORCE_DHDL);
             wallcycle_sub_stop(wcycle, ewcsLISTED_BUF_OPS);
diff --git a/src/gromacs/listed-forces/listed-internal.h b/src/gromacs/listed-forces/listed-internal.h
index 7e2982bc1d..a6fa29c174 100644
--- a/src/gromacs/listed-forces/listed-internal.h
+++ b/src/gromacs/listed-forces/listed-internal.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -43,6 +43,40 @@
 #ifndef GMX_LISTED_FORCES_LISTED_INTERNAL_H
 #define GMX_LISTED_FORCES_LISTED_INTERNAL_H
 
+#include "gromacs/legacyheaders/types/forcerec.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/utility/bitmask.h"
+
+/*! \internal \brief struct with output for bonded forces, used per thread */
+typedef struct
+{
+    rvec             *f;            /**< Force array */
+    int               f_nalloc;     /**< Allocation size of f */
+    gmx_bitmask_t     red_mask;     /**< Mask for marking which parts of f are filled */
+    rvec             *fshift;       /**< Shift force array, size SHIFTS */
+    real              ener[F_NRE];  /**< Energy array */
+    gmx_grppairener_t grpp;         /**< Group pair energy data for pairs */
+    real              dvdl[efptNR]; /**< Free-energy dV/dl output */
+}
+f_thread_t;
+
+/*! \internal \brief struct containing all data for bonded force threading */
+struct bonded_threading_t
+{
+    /* Thread local force and energy data */
+    int         nthreads;   /**< Number of threads to be used for bondeds */
+    int         red_ashift; /**< Size of force reduction blocks in bits */
+    int         red_nblock; /**< The number of force blocks to reduce */
+    f_thread_t *f_t;        /**< Force/energy data per thread, size nthreads */
+
+    /* There are two different ways to distribute the bonded force calculation
+     * over the threads. We decide which to use based on the number of threads.
+     */
+    int bonded_max_nthread_uniform; /**< Maximum thread count for uniform distribution of bondeds over threads */
+};
+
+
 /*! \brief Returns the global topology atom number belonging to local
  * atom index i.
  *
diff --git a/src/gromacs/listed-forces/manage-threading.cpp b/src/gromacs/listed-forces/manage-threading.cpp
index b703d3d8ce..183b819bef 100644
--- a/src/gromacs/listed-forces/manage-threading.cpp
+++ b/src/gromacs/listed-forces/manage-threading.cpp
@@ -53,12 +53,13 @@
 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 #include "gromacs/listed-forces/listed-forces.h"
-#include "gromacs/mdlib/forcerec-threading.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/stringutil.h"
 
+#include "listed-internal.h"
+
 /*! \brief struct for passing all data required for a function type */
 typedef struct {
     int      ftype; /**< the function type index */
@@ -343,65 +344,68 @@ const int maxBlockBits = BITMASK_SIZE;
 
 void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
 {
-    int t;
-    int ctot, c, b;
+    bonded_threading_t *bt;
+    int                 t;
+    int                 ctot, c, b;
+
+    bt = fr->bonded_threading;
 
-    assert(fr->nthreads >= 1);
+    assert(bt->nthreads >= 1);
 
     /* Divide the bonded interaction over the threads */
     divide_bondeds_over_threads(idef,
-                                fr->nthreads,
-                                fr->bonded_max_nthread_uniform);
+                                bt->nthreads,
+                                bt->bonded_max_nthread_uniform);
 
-    if (fr->nthreads == 1)
+    if (bt->nthreads == 1)
     {
-        fr->red_nblock = 0;
+        bt->red_nblock = 0;
 
         return;
     }
 
-    fr->red_ashift = 6;
-    while (fr->natoms_force > (int)(maxBlockBits*(1U<<fr->red_ashift)))
+    bt->red_ashift = 6;
+    while (fr->natoms_force > (int)(maxBlockBits*(1U<<bt->red_ashift)))
     {
-        fr->red_ashift++;
+        bt->red_ashift++;
     }
     if (debug)
     {
         fprintf(debug, "bonded force buffer block atom shift %d bits\n",
-                fr->red_ashift);
+                bt->red_ashift);
     }
 
    /* Determine to which blocks each thread's bonded force calculation
    * contributes. Store this is a mask for each thread.
    */
-#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
-    for (t = 1; t < fr->nthreads; t++)
+#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
+    for (t = 1; t < bt->nthreads; t++)
     {
-        calc_bonded_reduction_mask(&fr->f_t[t].red_mask,
-                                   idef, fr->red_ashift, t, fr->nthreads);
+        calc_bonded_reduction_mask(&bt->f_t[t].red_mask,
+                                   idef, bt->red_ashift, t, bt->nthreads);
     }
 
     /* Determine the maximum number of blocks we need to reduce over */
-    fr->red_nblock = 0;
+    bt->red_nblock = 0;
     ctot = 0;
-    for (t = 0; t < fr->nthreads; t++)
+    for (t = 0; t < bt->nthreads; t++)
    {
         c = 0;
         for (b = 0; b < maxBlockBits; b++)
         {
-            if (bitmask_is_set(fr->f_t[t].red_mask, b))
+            if (bitmask_is_set(bt->f_t[t].red_mask, b))
             {
-                fr->red_nblock = std::max(fr->red_nblock, b+1);
+                bt->red_nblock = std::max(bt->red_nblock, b+1);
                 c++;
             }
         }
         if (debug)
         {
 #if BITMASK_SIZE <= 64 //move into bitmask when it is C++
-            std::string flags = gmx::formatString("%x", fr->f_t[t].red_mask);
+            std::string flags = gmx::formatString("%x", bt->f_t[t].red_mask);
 #else
-            std::string flags = gmx::formatAndJoin(fr->f_t[t].red_mask,
-                                                   fr->f_t[t].red_mask+BITMASK_ALEN,
+            std::string flags = gmx::formatAndJoin(bt->f_t[t].red_mask,
+                                                   bt->f_t[t].red_mask+BITMASK_ALEN,
                                                    "", gmx::StringFormatter("%x"));
 #endif
             fprintf(debug, "thread %d flags %s count %d\n",
@@ -412,38 +416,43 @@ void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
     if (debug)
     {
         fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
-                fr->red_nblock, 1<<fr->red_ashift);
+                bt->red_nblock, 1<<bt->red_ashift);
         fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
-                ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
-                ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
+                ctot*(1<<bt->red_ashift)/(double)fr->natoms_force,
+                ctot*(1<<bt->red_ashift)/(double)(fr->natoms_force*bt->nthreads));
     }
 }
 
-void init_bonded_threading(FILE *fplog, t_forcerec *fr, int nenergrp)
+void init_bonded_threading(FILE *fplog, int nenergrp,
+                           struct bonded_threading_t **bt_ptr)
 {
+    bonded_threading_t *bt;
+
+    snew(bt, 1);
+
     /* These thread local data structures are used for bondeds only */
-    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+    bt->nthreads = gmx_omp_nthreads_get(emntBonded);
 
-    if (fr->nthreads > 1)
+    if (bt->nthreads > 1)
     {
         int t;
 
-        snew(fr->f_t, fr->nthreads);
-#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
-        for (t = 0; t < fr->nthreads; t++)
+        snew(bt->f_t, bt->nthreads);
+#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
+        for (t = 0; t < bt->nthreads; t++)
         {
             /* Thread 0 uses the global force and energy arrays */
             if (t > 0)
             {
                 int i;
 
-                fr->f_t[t].f        = NULL;
-                fr->f_t[t].f_nalloc = 0;
-                snew(fr->f_t[t].fshift, SHIFTS);
-                fr->f_t[t].grpp.nener = nenergrp*nenergrp;
+                bt->f_t[t].f        = NULL;
+                bt->f_t[t].f_nalloc = 0;
+                snew(bt->f_t[t].fshift, SHIFTS);
+                bt->f_t[t].grpp.nener = nenergrp*nenergrp;
                 for (i = 0; i < egNR; i++)
                 {
-                    snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
+                    snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
                 }
             }
         }
     }
@@ -457,16 +466,18 @@
         if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
         {
-            sscanf(ptr, "%d", &fr->bonded_max_nthread_uniform);
+            sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
             if (fplog != NULL)
             {
                 fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
-                        fr->bonded_max_nthread_uniform);
+                        bt->bonded_max_nthread_uniform);
             }
         }
         else
         {
-            fr->bonded_max_nthread_uniform = max_nthread_uniform;
+            bt->bonded_max_nthread_uniform = max_nthread_uniform;
         }
     }
+
+    *bt_ptr = bt;
 }
diff --git a/src/gromacs/listed-forces/manage-threading.h b/src/gromacs/listed-forces/manage-threading.h
index 2096ec63ae..c8718da6eb 100644
--- a/src/gromacs/listed-forces/manage-threading.h
+++ b/src/gromacs/listed-forces/manage-threading.h
@@ -60,8 +60,13 @@ extern "C" {
  */
 void setup_bonded_threading(t_forcerec *fr, t_idef *idef);
 
-/*! \brief Initialize the bonded threading data structures */
-void init_bonded_threading(FILE *fplog, t_forcerec *fr, int nenergrp);
+/*! \brief Initialize the bonded threading data structures
+ *
+ * Allocates and initializes a bonded threading data structure.
+ * A pointer to this struct is returned as \p *bt_ptr.
+ */
+void init_bonded_threading(FILE *fplog, int nenergrp,
+                           struct bonded_threading_t **bt_ptr);
 
 #ifdef __cplusplus
 }
diff --git a/src/gromacs/mdlib/force.cpp b/src/gromacs/mdlib/force.cpp
index 20ecfe36bd..e27e347587 100644
--- a/src/gromacs/mdlib/force.cpp
+++ b/src/gromacs/mdlib/force.cpp
@@ -113,34 +113,22 @@ void ns(FILE *fp,
     }
 }
 
-static void reduce_thread_forces(int n, rvec *f,
-                                 tensor vir_q, tensor vir_lj,
-                                 real *Vcorr_q, real *Vcorr_lj,
-                                 real *dvdl_q, real *dvdl_lj,
-                                 int nthreads, f_thread_t *f_t)
+static void reduce_thread_energies(tensor vir_q, tensor vir_lj,
+                                   real *Vcorr_q, real *Vcorr_lj,
+                                   real *dvdl_q, real *dvdl_lj,
+                                   int nthreads,
+                                   ewald_corr_thread_t *ewc_t)
 {
-    int t, i;
-    int nthreads_loop gmx_unused;
+    int t;
 
-    // cppcheck-suppress unreadVariable
-    nthreads_loop = gmx_omp_nthreads_get(emntBonded);
-    /* This reduction can run over any number of threads */
-#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
-    for (i = 0; i < n; i++)
-    {
-        for (t = 1; t < nthreads; t++)
-        {
-            rvec_inc(f[i], f_t[t].f[i]);
-        }
-    }
     for (t = 1; t < nthreads; t++)
     {
-        *Vcorr_q  += f_t[t].Vcorr_q;
-        *Vcorr_lj += f_t[t].Vcorr_lj;
-        *dvdl_q   += f_t[t].dvdl[efptCOUL];
-        *dvdl_lj  += f_t[t].dvdl[efptVDW];
-        m_add(vir_q, f_t[t].vir_q, vir_q);
-        m_add(vir_lj, f_t[t].vir_lj, vir_lj);
+        *Vcorr_q  += ewc_t[t].Vcorr_q;
+        *Vcorr_lj += ewc_t[t].Vcorr_lj;
+        *dvdl_q   += ewc_t[t].dvdl[efptCOUL];
+        *dvdl_lj  += ewc_t[t].dvdl[efptVDW];
+        m_add(vir_q, ewc_t[t].vir_q, vir_q);
+        m_add(vir_lj, ewc_t[t].vir_lj, vir_lj);
     }
 }
 
@@ -440,17 +428,14 @@ void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir,
                     gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
                 }
 
-                nthreads = gmx_omp_nthreads_get(emntBonded);
+                nthreads = fr->nthread_ewc;
 #pragma omp parallel for num_threads(nthreads) schedule(static)
                 for (t = 0; t < nthreads; t++)
                 {
-                    int     i;
-                    rvec   *fnv;
                     tensor *vir_q, *vir_lj;
                     real   *Vcorrt_q, *Vcorrt_lj, *dvdlt_q, *dvdlt_lj;
                     if (t == 0)
                     {
-                        fnv       = fr->f_novirsum;
                         vir_q     = &fr->vir_el_recip;
                         vir_lj    = &fr->vir_lj_recip;
                         Vcorrt_q  = &Vcorr_q;
@@ -460,23 +445,23 @@ void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir,
                     }
                     else
                     {
-                        fnv       = fr->f_t[t].f;
-                        vir_q     = &fr->f_t[t].vir_q;
-                        vir_lj    = &fr->f_t[t].vir_lj;
-                        Vcorrt_q  = &fr->f_t[t].Vcorr_q;
-                        Vcorrt_lj = &fr->f_t[t].Vcorr_lj;
-                        dvdlt_q   = &fr->f_t[t].dvdl[efptCOUL];
-                        dvdlt_lj  = &fr->f_t[t].dvdl[efptVDW];
-                        for (i = 0; i < fr->natoms_force; i++)
-                        {
-                            clear_rvec(fnv[i]);
-                        }
+                        vir_q     = &fr->ewc_t[t].vir_q;
+                        vir_lj    = &fr->ewc_t[t].vir_lj;
+                        Vcorrt_q  = &fr->ewc_t[t].Vcorr_q;
+                        Vcorrt_lj = &fr->ewc_t[t].Vcorr_lj;
+                        dvdlt_q   = &fr->ewc_t[t].dvdl[efptCOUL];
+                        dvdlt_lj  = &fr->ewc_t[t].dvdl[efptVDW];
                         clear_mat(*vir_q);
                         clear_mat(*vir_lj);
                     }
                     *dvdlt_q  = 0;
                     *dvdlt_lj = 0;
 
+                    /* Threading is only supported with the Verlet cut-off
+                     * scheme and then only single particle forces (no
+                     * exclusion forces) are calculated, so we can store
+                     * the forces in the normal, single fr->f_novirsum array.
+                     */
                     ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1],
                                        cr, t, fr,
                                        md->chargeA, md->chargeB,
@@ -488,19 +473,18 @@ void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir,
                                        excl, x, bSB ? boxs : box, mu_tot,
                                        ir->ewald_geometry,
                                        ir->epsilon_surface,
-                                       fnv, *vir_q, *vir_lj,
+                                       fr->f_novirsum, *vir_q, *vir_lj,
                                        Vcorrt_q, Vcorrt_lj,
                                        lambda[efptCOUL], lambda[efptVDW],
                                        dvdlt_q, dvdlt_lj);
                 }
                 if (nthreads > 1)
                 {
-                    reduce_thread_forces(fr->natoms_force, fr->f_novirsum,
-                                         fr->vir_el_recip, fr->vir_lj_recip,
-                                         &Vcorr_q, &Vcorr_lj,
-                                         &dvdl_long_range_correction_q,
-                                         &dvdl_long_range_correction_lj,
-                                         nthreads, fr->f_t);
+                    reduce_thread_energies(fr->vir_el_recip, fr->vir_lj_recip,
+                                           &Vcorr_q, &Vcorr_lj,
+                                           &dvdl_long_range_correction_q,
+                                           &dvdl_long_range_correction_lj,
+                                           nthreads, fr->ewc_t);
                 }
                 wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
             }
diff --git a/src/gromacs/mdlib/forcerec-threading.h b/src/gromacs/mdlib/forcerec-threading.h
index b40a18ba32..070b476ec5 100644
--- a/src/gromacs/mdlib/forcerec-threading.h
+++ b/src/gromacs/mdlib/forcerec-threading.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -36,19 +36,11 @@
 #ifndef GMX_MDLIB_FORCEREC_THREADING_H
 #define GMX_MDLIB_FORCEREC_THREADING_H
 
-#include "gromacs/utility/bitmask.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct f_thread_t {
-    rvec              *f;
-    int                f_nalloc;
-    gmx_bitmask_t      red_mask; /* Mask for marking which parts of f are filled */
-    rvec              *fshift;
-    real               ener[F_NRE];
-    gmx_grppairener_t  grpp;
+struct ewald_corr_thread_t {
     real               Vcorr_q;
     real               Vcorr_lj;
     real               dvdl[efptNR];
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index 7de43327b5..b04b09a16b 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -3212,9 +3212,12 @@ void init_forcerec(FILE *fp,
     }
 
     /* Initialize the thread working data for bonded interactions */
-    init_bonded_threading(fp, fr, mtop->groups.grps[egcENER].nr);
+    init_bonded_threading(fp, mtop->groups.grps[egcENER].nr,
+                          &fr->bonded_threading);
 
-    snew(fr->excl_load, fr->nthreads+1);
+    fr->nthread_ewc = gmx_omp_nthreads_get(emntBonded);
+    snew(fr->ewc_t, fr->nthread_ewc);
+    snew(fr->excl_load, fr->nthread_ewc + 1);
 
     /* fr->ic is used both by verlet and group kernels (to some extent) now */
     init_interaction_const(fp, &fr->ic, fr);
@@ -3285,9 +3288,9 @@ void forcerec_set_excl_load(t_forcerec *fr,
     fr->excl_load[0] = 0;
     n                = 0;
     i                = 0;
-    for (t = 1; t <= fr->nthreads; t++)
+    for (t = 1; t <= fr->nthread_ewc; t++)
     {
-        ntarget = (ntot*t)/fr->nthreads;
+        ntarget = (ntot*t)/fr->nthread_ewc;
         while (i < top->excls.nr && n < ntarget)
         {
             for (j = ind[i]; j < ind[i+1]; j++)
-- 
2.22.0
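
Note (not part of the patch): the core technique behind bonded_threading_t is a block-based thread-force reduction. Each thread accumulates into its own force buffer, marks which blocks of 2^red_ashift atoms it touched in a bitmask (red_mask), and afterwards only the marked blocks, at most red_nblock of them, are summed into the master array. The standalone C sketch below illustrates that scheme under simplifying assumptions: the names (thread_forces, add_force, reduce_forces, BLOCK_SHIFT) are hypothetical stand-ins rather than the GROMACS API, forces are scalars instead of rvecs, and the mask is a plain uint64_t instead of gmx_bitmask_t.

#include <stdint.h>
#include <stdio.h>

#define NATOMS      512
#define NTHREADS    4
#define BLOCK_SHIFT 6  /* blocks of 2^6 = 64 atoms; plays the role of red_ashift */
#define NBLOCKS     ((NATOMS + (1 << BLOCK_SHIFT) - 1) >> BLOCK_SHIFT)

typedef struct {
    double   f[NATOMS];  /* per-thread force accumulation buffer */
    uint64_t red_mask;   /* bit b set when block b of f was written; cf. red_mask */
} thread_forces;

/* Accumulate a force contribution and mark the block it falls in. */
static void add_force(thread_forces *tf, int atom, double value)
{
    tf->f[atom]  += value;
    tf->red_mask |= (uint64_t)1 << (atom >> BLOCK_SHIFT);
}

/* Sum the per-thread buffers into f, visiting only blocks a thread touched. */
static void reduce_forces(double *f, const thread_forces *tf, int nthreads)
{
    for (int b = 0; b < NBLOCKS; b++)
    {
        int start = b << BLOCK_SHIFT;
        int end   = start + (1 << BLOCK_SHIFT);
        if (end > NATOMS)
        {
            end = NATOMS;
        }
        for (int t = 0; t < nthreads; t++)
        {
            if (tf[t].red_mask & ((uint64_t)1 << b))
            {
                for (int i = start; i < end; i++)
                {
                    f[i] += tf[t].f[i];
                }
            }
        }
    }
}

int main(void)
{
    static thread_forces tf[NTHREADS];  /* static, hence zero-initialized */
    static double        f[NATOMS];

    /* Pretend each "thread" computed forces on a couple of atoms. */
    for (int t = 0; t < NTHREADS; t++)
    {
        add_force(&tf[t], 100*t,     1.0);
        add_force(&tf[t], 100*t + 1, 0.5);
    }
    reduce_forces(f, tf, NTHREADS);
    printf("f[0] = %g  f[300] = %g\n", f[0], f[300]);
    return 0;
}

Because untouched blocks are skipped entirely, the reduction cost scales with the number of blocks each thread actually wrote rather than with the full force-array size, which is why the patch tracks a per-thread red_mask instead of always reducing all natoms_force entries.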
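Note (not part of the patch): the Ewald charge-correction work kept in forcerec.cpp is split differently, with the excl_load array: thread t receives exclusion entries until the cumulative count reaches ntot*t/nthreads, so contiguous ranges carry roughly equal weight. A minimal sketch of that prefix-sum split, assuming hypothetical names (divide_load, weight, bounds) rather than the GROMACS ones:

#include <stdio.h>

#define NTHREADS 3

/* Divide nitems weighted items into NTHREADS contiguous ranges of roughly
 * equal total weight; thread t gets items [bounds[t], bounds[t+1]). */
static void divide_load(const int *weight, int nitems, int *bounds)
{
    int ntot = 0;
    for (int i = 0; i < nitems; i++)
    {
        ntot += weight[i];
    }
    bounds[0] = 0;
    int n = 0, i = 0;
    for (int t = 1; t <= NTHREADS; t++)
    {
        int ntarget = (ntot*t)/NTHREADS;  /* same target rule as excl_load */
        while (i < nitems && n < ntarget)
        {
            n += weight[i++];
        }
        bounds[t] = i;
    }
}

int main(void)
{
    /* e.g. number of excluded pairs per atom */
    int weight[] = { 4, 1, 1, 6, 2, 2, 3, 5 };
    int bounds[NTHREADS + 1];

    divide_load(weight, 8, bounds);
    for (int t = 0; t < NTHREADS; t++)
    {
        printf("thread %d: items [%d, %d)\n", t, bounds[t], bounds[t + 1]);
    }
    return 0;
}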