Merge branch release-4-6
authorMark Abraham <mark.j.abraham@gmail.com>
Wed, 25 Sep 2013 09:16:18 +0000 (11:16 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Thu, 26 Sep 2013 15:48:11 +0000 (17:48 +0200)
Conflicts:
src/gromacs/gmxlib/bondfree.c
src/gromacs/legacyheaders/bondf.h
src/gromacs/legacyheaders/pme.h
src/gromacs/mdlib/pme.c
src/programs/mdrun/runner.c

Conflicts all resolved in favour of release-4-6 branch,
except where the master-branch removal of unused parameters
was correct.

Change-Id: Ib32a52b97b5443b86b4f0e4d575767990dc6c47e

12 files changed:
src/gromacs/gmxana/gmx_tune_pme.c
src/gromacs/gmxlib/bondfree.c
src/gromacs/legacyheaders/bondf.h
src/gromacs/legacyheaders/gmx_simd_macros.h
src/gromacs/legacyheaders/gmx_simd_math_single.h
src/gromacs/legacyheaders/pme.h
src/gromacs/legacyheaders/types/idef.h
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/minimize.c
src/gromacs/mdlib/pme.c
src/programs/mdrun/md.c
src/programs/mdrun/runner.c

index 9231655c1ef4f5e85c0933008b59ca009fa01ec7..d611bca21e028091c991459837a64fc5b936eab7 100644 (file)
@@ -73,6 +73,7 @@ enum {
     eParselogNoDDGrid,
     eParselogTPXVersion,
     eParselogNotParallel,
+    eParselogLargePrimeFactor,
     eParselogFatal,
     eParselogNr
 };
@@ -285,6 +286,11 @@ static int parse_logfile(const char *logfile, const char *errfile,
                     fclose(fp);
                     return eParselogNoDDGrid;
                 }
+                else if (str_starts(line, "The number of nodes you selected"))
+                {
+                    fclose(fp);
+                    return eParselogLargePrimeFactor;
+                }
                 else if (str_starts(line, "reading tpx file"))
                 {
                     fclose(fp);
@@ -1380,6 +1386,7 @@ static void do_the_tests(
         "No DD grid found for these settings.",
         "TPX version conflict!",
         "mdrun was not started in parallel!",
+        "Number of PP nodes has a prime factor that is too large.",
         "An error occured."
     };
     char        str_PME_f_load[13];
index f03f8738f3a6beb0a17df84d9c2ba8f671102406..ae984652fa819bf7f541c09f1a1f01cb6c2dcc19 100644 (file)
@@ -38,6 +38,7 @@
 #endif
 
 #include <math.h>
+#include <assert.h>
 #include "physics.h"
 #include "vec.h"
 #include "maths.h"
@@ -3757,6 +3758,73 @@ real tab_dihs(int nbonds,
     return vtot;
 }
 
+/* Return if this is a potential calculated in bondfree.c,
+ * i.e. an interaction that actually calculates a potential and
+ * works on multiple atoms (not e.g. a connection or a position restraint).
+ */
+static gmx_inline gmx_bool ftype_is_bonded_potential(int ftype)
+{
+    return
+        (interaction_function[ftype].flags & IF_BOND) &&
+        !(ftype == F_CONNBONDS || ftype == F_POSRES) &&
+        (ftype < F_GB12 || ftype > F_GB14);
+}
+
+static void divide_bondeds_over_threads(t_idef *idef, int nthreads)
+{
+    int ftype;
+    int nat1;
+    int t;
+    int il_nr_thread;
+
+    idef->nthreads = nthreads;
+
+    if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc)
+    {
+        idef->il_thread_division_nalloc = F_NRE*(nthreads+1);
+        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
+    }
+
+    for (ftype = 0; ftype < F_NRE; ftype++)
+    {
+        if (ftype_is_bonded_potential(ftype))
+        {
+            nat1 = interaction_function[ftype].nratoms + 1;
+
+            for (t = 0; t <= nthreads; t++)
+            {
+                /* Divide the interactions equally over the threads.
+                 * When the different types of bonded interactions
+                 * are distributed roughly equally over the threads,
+                 * this should lead to well localized output into
+                 * the force buffer on each thread.
+                 * If this is not the case, a more advanced scheme
+                 * (not implemented yet) will do better.
+                 */
+                il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1;
+
+                /* Ensure that distance restraint pairs with the same label
+                 * end up on the same thread.
+                 * This is slighlty tricky code, since the next for iteration
+                 * may have an initial il_nr_thread lower than the final value
+                 * in the previous iteration, but this will anyhow be increased
+                 * to the approriate value again by this while loop.
+                 */
+                while (ftype == F_DISRES &&
+                       il_nr_thread > 0 &&
+                       il_nr_thread < idef->il[ftype].nr &&
+                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label ==
+                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label)
+                {
+                    il_nr_thread += nat1;
+                }
+
+                idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread;
+            }
+        }
+    }
+}
+
 static unsigned
 calc_bonded_reduction_mask(const t_idef *idef,
                            int shift,
@@ -3769,9 +3837,7 @@ calc_bonded_reduction_mask(const t_idef *idef,
 
     for (ftype = 0; ftype < F_NRE; ftype++)
     {
-        if (interaction_function[ftype].flags & IF_BOND &&
-            !(ftype == F_CONNBONDS || ftype == F_POSRES) &&
-            (ftype<F_GB12 || ftype>F_GB14))
+        if (ftype_is_bonded_potential(ftype))
         {
             nb = idef->il[ftype].nr;
             if (nb > 0)
@@ -3781,8 +3847,8 @@ calc_bonded_reduction_mask(const t_idef *idef,
                 /* Divide this interaction equally over the threads.
                  * This is not stored: should match division in calc_bonds.
                  */
-                nb0 = (((nb/nat1)* t   )/nt)*nat1;
-                nb1 = (((nb/nat1)*(t+1))/nt)*nat1;
+                nb0 = idef->il_thread_division[ftype*(nt+1)+t];
+                nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
 
                 for (i = nb0; i < nb1; i += nat1)
                 {
@@ -3798,14 +3864,18 @@ calc_bonded_reduction_mask(const t_idef *idef,
     return mask;
 }
 
-void init_bonded_thread_force_reduction(t_forcerec   *fr,
-                                        const t_idef *idef)
+void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
 {
 #define MAX_BLOCK_BITS 32
     int t;
     int ctot, c, b;
 
-    if (fr->nthreads <= 1)
+    assert(fr->nthreads >= 1);
+
+    /* Divide the bonded interaction over the threads */
+    divide_bondeds_over_threads(idef, fr->nthreads);
+
+    if (fr->nthreads == 1)
     {
         fr->red_nblock = 0;
 
@@ -4029,14 +4099,14 @@ static real calc_one_bond(FILE *fplog, int thread,
                           rvec x[], rvec f[], rvec fshift[],
                           t_forcerec *fr,
                           const t_pbc *pbc, const t_graph *g,
-                          gmx_enerdata_t gmx_unused *enerd, gmx_grppairener_t *grpp,
+                          gmx_grppairener_t *grpp,
                           t_nrnb *nrnb,
                           real *lambda, real *dvdl,
                           const t_mdatoms *md, t_fcdata *fcd,
                           gmx_bool bCalcEnerVir,
                           int *global_atom_index, gmx_bool bPrintSepPot)
 {
-    int      ind, nat1, nbonds, efptFTYPE;
+    int      nat1, nbonds, efptFTYPE;
     real     v = 0;
     t_iatom *iatoms;
     int      nb0, nbn;
@@ -4050,170 +4120,88 @@ static real calc_one_bond(FILE *fplog, int thread,
         efptFTYPE = efptBONDED;
     }
 
-    if (interaction_function[ftype].flags & IF_BOND &&
-        !(ftype == F_CONNBONDS || ftype == F_POSRES))
-    {
-        ind       = interaction_function[ftype].nrnb_ind;
-        nat1      = interaction_function[ftype].nratoms + 1;
-        nbonds    = idef->il[ftype].nr/nat1;
-        iatoms    = idef->il[ftype].iatoms;
+    nat1      = interaction_function[ftype].nratoms + 1;
+    nbonds    = idef->il[ftype].nr/nat1;
+    iatoms    = idef->il[ftype].iatoms;
 
-        nb0 = ((nbonds* thread   )/(fr->nthreads))*nat1;
-        nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0;
+    nb0 = idef->il_thread_division[ftype*(idef->nthreads+1)+thread];
+    nbn = idef->il_thread_division[ftype*(idef->nthreads+1)+thread+1] - nb0;
 
-        if (!IS_LISTED_LJ_C(ftype))
+    if (!IS_LISTED_LJ_C(ftype))
+    {
+        if (ftype == F_CMAP)
         {
-            if (ftype == F_CMAP)
-            {
-                v = cmap_dihs(nbn, iatoms+nb0,
-                              idef->iparams, &idef->cmap_grid,
-                              (const rvec*)x, f, fshift,
-                              pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
-                              md, fcd, global_atom_index);
-            }
+            v = cmap_dihs(nbn, iatoms+nb0,
+                          idef->iparams, &idef->cmap_grid,
+                          (const rvec*)x, f, fshift,
+                          pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+                          md, fcd, global_atom_index);
+        }
 #ifdef SIMD_BONDEDS
-            else if (ftype == F_ANGLES &&
-                     !bCalcEnerVir && fr->efep == efepNO)
-            {
-                /* No energies, shift forces, dvdl */
-                angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
-                                   idef->iparams,
-                                   (const rvec*)x, f,
-                                   pbc, g, lambda[efptFTYPE], md, fcd,
-                                   global_atom_index);
-                v = 0;
-            }
+        else if (ftype == F_ANGLES &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
+            angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                               idef->iparams,
+                               (const rvec*)x, f,
+                               pbc, g, lambda[efptFTYPE], md, fcd,
+                               global_atom_index);
+            v = 0;
+        }
 #endif
-            else if (ftype == F_PDIHS &&
-                     !bCalcEnerVir && fr->efep == efepNO)
-            {
-                /* No energies, shift forces, dvdl */
+        else if (ftype == F_PDIHS &&
+                 !bCalcEnerVir && fr->efep == efepNO)
+        {
+            /* No energies, shift forces, dvdl */
 #ifndef SIMD_BONDEDS
-                pdihs_noener
+            pdihs_noener
 #else
-                pdihs_noener_simd
+            pdihs_noener_simd
 #endif
-                    (nbn, idef->il[ftype].iatoms+nb0,
-                    idef->iparams,
-                    (const rvec*)x, f,
-                    pbc, g, lambda[efptFTYPE], md, fcd,
-                    global_atom_index);
-                v = 0;
-            }
-            else
-            {
-                v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
-                                                      idef->iparams,
-                                                      (const rvec*)x, f, fshift,
-                                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
-                                                      md, fcd, global_atom_index);
-            }
-            if (bPrintSepPot)
-            {
-                fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        nbonds/nat1, v, lambda[efptFTYPE]);
-            }
+                (nbn, idef->il[ftype].iatoms+nb0,
+                 idef->iparams,
+                 (const rvec*)x, f,
+                 pbc, g, lambda[efptFTYPE], md, fcd,
+                 global_atom_index);
+            v = 0;
         }
         else
         {
-            v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
-                                    pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
-
-            if (bPrintSepPot)
-            {
-                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        interaction_function[F_LJ14].longname, nbonds/nat1, dvdl[efptVDW]);
-                fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
-                        interaction_function[ftype].longname,
-                        interaction_function[F_COUL14].longname, nbonds/nat1, dvdl[efptCOUL]);
-            }
+            v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
+                                                  idef->iparams,
+                                                  (const rvec*)x, f, fshift,
+                                                  pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
+                                                  md, fcd, global_atom_index);
         }
-        if (ind != -1 && thread == 0)
+        if (bPrintSepPot)
         {
-            inc_nrnb(nrnb, ind, nbonds);
+            fprintf(fplog, "  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    nbonds, v, lambda[efptFTYPE]);
         }
     }
-
-    return v;
-}
-
-/* WARNING!  THIS FUNCTION MUST EXACTLY TRACK THE calc
-   function, or horrible things will happen when doing free energy
-   calculations!  In a good coding world, this would not be a
-   different function, but for speed reasons, it needs to be made a
-   separate function.  TODO for 5.0 - figure out a way to reorganize
-   to reduce duplication.
- */
-
-static real calc_one_bond_foreign(FILE gmx_unused *fplog, int ftype, const t_idef *idef,
-                                  rvec x[], rvec f[], t_forcerec *fr,
-                                  const t_pbc *pbc, const t_graph *g,
-                                  gmx_grppairener_t *grpp, t_nrnb *nrnb,
-                                  real *lambda, real *dvdl,
-                                  const t_mdatoms *md, t_fcdata *fcd,
-                                  int *global_atom_index, gmx_bool gmx_unused bPrintSepPot)
-{
-    int      ind, nat1, nbonds, efptFTYPE, nbonds_np;
-    real     v = 0;
-    t_iatom *iatoms;
-
-    if (IS_RESTRAINT_TYPE(ftype))
-    {
-        efptFTYPE = efptRESTRAINT;
-    }
     else
     {
-        efptFTYPE = efptBONDED;
+        v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift,
+                                pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index);
+        
+        if (bPrintSepPot)
+        {
+            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    interaction_function[F_LJ14].longname, nbonds, dvdl[efptVDW]);
+            fprintf(fplog, "  %-5s + %-15s #%4d                  dVdl %12.5e\n",
+                    interaction_function[ftype].longname,
+                    interaction_function[F_COUL14].longname, nbonds, dvdl[efptCOUL]);
+        }
     }
 
-    if (ftype < F_GB12 || ftype > F_GB14)
+    if (thread == 0)
     {
-        if (interaction_function[ftype].flags & IF_BOND &&
-            !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES))
-        {
-            ind       = interaction_function[ftype].nrnb_ind;
-            nat1      = interaction_function[ftype].nratoms+1;
-            nbonds_np = idef->il[ftype].nr_nonperturbed;
-            nbonds    = idef->il[ftype].nr - nbonds_np;
-            iatoms    = idef->il[ftype].iatoms + nbonds_np;
-            if (nbonds > 0)
-            {
-                if (!IS_LISTED_LJ_C(ftype))
-                {
-                    if (ftype == F_CMAP)
-                    {
-                        v = cmap_dihs(nbonds, iatoms,
-                                      idef->iparams, &idef->cmap_grid,
-                                      (const rvec*)x, f, fr->fshift,
-                                      pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), md, fcd,
-                                      global_atom_index);
-                    }
-                    else
-                    {
-                        v =     interaction_function[ftype].ifunc(nbonds, iatoms,
-                                                                  idef->iparams,
-                                                                  (const rvec*)x, f, fr->fshift,
-                                                                  pbc, g, lambda[efptFTYPE], &dvdl[efptFTYPE],
-                                                                  md, fcd, global_atom_index);
-                    }
-                }
-                else
-                {
-                    v = do_nonbonded_listed(ftype, nbonds, iatoms,
-                                            idef->iparams,
-                                            (const rvec*)x, f, fr->fshift,
-                                            pbc, g, lambda, dvdl,
-                                            md, fr, grpp, global_atom_index);
-                }
-                if (ind != -1)
-                {
-                    inc_nrnb(nrnb, ind, nbonds/nat1);
-                }
-            }
-        }
+        inc_nrnb(nrnb, interaction_function[ftype].nrnb_ind, nbonds);
     }
+
     return v;
 }
 
@@ -4238,6 +4226,8 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
     char          buf[22];
     int           thread;
 
+    assert(fr->nthreads == idef->nthreads);
+
     bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
 
     for (i = 0; i < efptNR; i++)
@@ -4285,13 +4275,12 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
 #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
     for (thread = 0; thread < fr->nthreads; thread++)
     {
-        int                ftype, nbonds, ind, nat1;
+        int                ftype;
         real              *epot, v;
         /* thread stuff */
         rvec              *ft, *fshift;
         real              *dvdlt;
         gmx_grppairener_t *grpp;
-        int                nb0, nbn;
 
         if (thread == 0)
         {
@@ -4315,17 +4304,14 @@ void calc_bonds(FILE *fplog, const gmx_multisim_t *ms,
         /* Loop over all bonded force types to calculate the bonded forces */
         for (ftype = 0; (ftype < F_NRE); ftype++)
         {
-            if (idef->il[ftype].nr > 0 &&
-                (interaction_function[ftype].flags & IF_BOND) &&
-                (ftype < F_GB12 || ftype > F_GB14) &&
-                !(ftype == F_CONNBONDS || ftype == F_POSRES))
+            if (idef->il[ftype].nr > 0 && ftype_is_bonded_potential(ftype))
             {
                 v = calc_one_bond(fplog, thread, ftype, idef, x,
-                                  ft, fshift, fr, pbc_null, g, enerd, grpp,
+                                  ft, fshift, fr, pbc_null, g, grpp,
                                   nrnb, lambda, dvdlt,
                                   md, fcd, bCalcEnerVir,
                                   global_atom_index, bPrintSepPot);
-                epot[ftype]        += v;
+                epot[ftype] += v;
             }
         }
     }
@@ -4365,12 +4351,12 @@ void calc_bonds_lambda(FILE *fplog,
                        t_fcdata *fcd,
                        int *global_atom_index)
 {
-    int           i, ftype, nbonds_np, nbonds, ind, nat;
-    real          v, dr, dr2;
+    int           i, ftype, nr_nonperturbed, nr;
+    real          v;
     real          dvdl_dum[efptNR];
-    rvec         *f, *fshift_orig;
+    rvec         *f, *fshift;
     const  t_pbc *pbc_null;
-    t_iatom      *iatom_fe;
+    t_idef       idef_fe;
 
     if (fr->bMolPBC)
     {
@@ -4381,21 +4367,43 @@ void calc_bonds_lambda(FILE *fplog,
         pbc_null = NULL;
     }
 
+    /* Copy the whole idef, so we can modify the contents locally */
+    idef_fe          = *idef;
+    idef_fe.nthreads = 1;
+    snew(idef_fe.il_thread_division, F_NRE*(idef_fe.nthreads+1));
+
+    /* We already have the forces, so we use temp buffers here */
     snew(f, fr->natoms_force);
-    /* We want to preserve the fshift array in forcerec */
-    fshift_orig = fr->fshift;
-    snew(fr->fshift, SHIFTS);
+    snew(fshift, SHIFTS);
 
-    /* Loop over all bonded force types to calculate the bonded forces */
+    /* Loop over all bonded force types to calculate the bonded energies */
     for (ftype = 0; (ftype < F_NRE); ftype++)
     {
-        v = calc_one_bond_foreign(fplog, ftype, idef, x,
-                                  f, fr, pbc_null, g, grpp, nrnb, lambda, dvdl_dum,
-                                  md, fcd, global_atom_index, FALSE);
-        epot[ftype] += v;
+        if (ftype_is_bonded_potential(ftype))
+        {
+            /* Set the work range of thread 0 to the perturbed bondeds only */
+            nr_nonperturbed                       = idef->il[ftype].nr_nonperturbed;
+            nr                                    = idef->il[ftype].nr;
+            idef_fe.il_thread_division[ftype*2+0] = nr_nonperturbed;
+            idef_fe.il_thread_division[ftype*2+1] = nr;
+
+            /* This is only to get the flop count correct */
+            idef_fe.il[ftype].nr = nr - nr_nonperturbed;
+
+            if (nr - nr_nonperturbed > 0)
+            {
+                v = calc_one_bond(fplog, 0, ftype, &idef_fe,
+                                  x, f, fshift, fr, pbc_null, g,
+                                  grpp, nrnb, lambda, dvdl_dum,
+                                  md, fcd, TRUE,
+                                  global_atom_index, FALSE);
+                epot[ftype] += v;
+            }
+        }
     }
 
-    sfree(fr->fshift);
-    fr->fshift = fshift_orig;
+    sfree(fshift);
     sfree(f);
+
+    sfree(idef_fe.il_thread_division);
 }
index d70d6c150d847d1ec0f8cde36835240dee946606..6893ccbc83c71bfc0077bf641d1ce929eca60cbc 100644 (file)
@@ -154,12 +154,12 @@ t_ifunc tab_bonds, tab_angles, tab_dihs;
 t_ifunc polarize, anharm_polarize, water_pol, thole_pol, angres, angresz, dihres, unimplemented;
 
 
-/* Initialize the setup for the bonded force buffer reduction
- * over threads. This should be called each time the bonded setup
- * changes; i.e. at start-up without domain decomposition and at DD.
+/* Divided the bonded interactions over the threads, count=fr->nthreads
+ * and set up the bonded thread-force buffer reduction.
+ * This should be called each time the bonded setup changes;
+ * i.e. at start-up without domain decomposition and at DD.
  */
-void init_bonded_thread_force_reduction(t_forcerec   *fr,
-                                        const t_idef *idef);
+void setup_bonded_threading(t_forcerec *fr, t_idef *idef);
 
 #ifdef __cplusplus
 }
index 37e94880ef1bcaa1fd1850ca8ded0870dc5fd3be..f403f13963aa07944be6f8faba7e6c4029b6942e 100644 (file)
 #define gmx_sub_pr        _mm_sub_ps
 #define gmx_mul_pr        _mm_mul_ps
 #ifdef GMX_X86_AVX_128_FMA
+#define GMX_SIMD_HAVE_FMA
 #define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
 #else
@@ -318,6 +319,7 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_
 #define gmx_sub_pr        _mm_sub_pd
 #define gmx_mul_pr        _mm_mul_pd
 #ifdef GMX_X86_AVX_128_FMA
+#define GMX_SIMD_HAVE_FMA
 #define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
 #else
index 28feeaa07519c1a4fdc468cb421344d32b530751..9309b8d438d828e8f33d34e10349511150a9c37b 100644 (file)
 static gmx_inline gmx_mm_pr
 gmx_invsqrt_pr(gmx_mm_pr x)
 {
+    /* This is one of the few cases where FMA adds a FLOP, but ends up with
+     * less instructions in total when FMA is available in hardware.
+     * Usually we would not optimize this far, but invsqrt is used often.
+     */
+#ifdef GMX_SIMD_HAVE_FMA
     const gmx_mm_pr half  = gmx_set1_pr(0.5);
     const gmx_mm_pr one   = gmx_set1_pr(1.0);
 
     gmx_mm_pr       lu = gmx_rsqrt_pr(x);
 
     return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu);
+#else
+    const gmx_mm_pr half  = gmx_set1_pr(0.5);
+    const gmx_mm_pr three = gmx_set1_pr(3.0);
+
+    gmx_mm_pr       lu = gmx_rsqrt_pr(x);
+    
+    return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu));
+#endif
 }
 
 
index 42b8428dd16d2eb10e93ad3f5edb1a468849290f..e405926981bfe5bc8f886cfc1edb62fe01471d12 100644 (file)
@@ -40,6 +40,7 @@
 #include "typedefs.h"
 #include "gmxcomplex.h"
 #include "gmx_wallcycle.h"
+#include "sim_util.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -97,6 +98,7 @@ int gmx_pme_do(gmx_pme_t pme,
 int gmx_pmeonly(gmx_pme_t pme,
                 t_commrec *cr,     t_nrnb *mynrnb,
                 gmx_wallcycle_t wcycle,
+                gmx_runtime_t *runtime,
                 real ewaldcoeff,
                 t_inputrec *ir);
 /* Called on the nodes that do PME exclusively (as slaves)
index e5fb5c73dab3ba24bae76e35288f80da3b389b80..c8011a3be57861d732f52e73510de24da4bfdde1 100644 (file)
@@ -351,6 +351,9 @@ typedef struct
 
     t_ilist     il[F_NRE];
     int         ilsort;
+    int         nthreads;
+    int        *il_thread_division;
+    int         il_thread_division_nalloc;
 } t_idef;
 
 /*
@@ -383,6 +386,17 @@ typedef struct
  *   t_ilist il[F_NRE]
  *      The list of interactions for each type. Note that some,
  *      such as LJ and COUL will have 0 entries.
+ *   int ilsort
+ *      The state of the sorting of il, values are provided above.
+ *   int nthreads
+ *      The number of threads used to set il_thread_division.
+ *   int *il_thread_division
+ *      The division of the normal bonded interactions of threads.
+ *      il_thread_division[ftype*(nthreads+1)+t] contains an index
+ *      into il[ftype].iatoms; thread th operates on t=th to t=th+1.
+ *   int il_thread_division_nalloc
+ *      The allocated size of il_thread_division,
+ *      should be at least F_NRE*(nthreads+1).
  */
 
 typedef struct {
index e2329ddc150d8927886329e2ce2064bd38670dd9..846d85e11447d5a3b1e90e1f6a5ead868d52a20c 100644 (file)
@@ -9652,7 +9652,7 @@ void dd_partition_system(FILE                *fplog,
         make_local_gb(cr, fr->born, ir->gb_algorithm);
     }
 
-    init_bonded_thread_force_reduction(fr, &top_local->idef);
+    setup_bonded_threading(fr, &top_local->idef);
 
     if (!(cr->duty & DUTY_PME))
     {
index 5fa7e693cd828f2fe96f805c7d3193d67ae840ce..c15fe93bd59bc3691385bcaa5b83ae57b3baff61 100644 (file)
@@ -385,7 +385,7 @@ void init_em(FILE *fplog, const char *title,
 
         forcerec_set_excl_load(fr, *top, cr);
 
-        init_bonded_thread_force_reduction(fr, &(*top)->idef);
+        setup_bonded_threading(fr, &(*top)->idef);
 
         if (ir->ePBC != epbcNONE && !fr->bMolPBC)
         {
index 0f804b37db7df7fc5161b2e22302feee42505901..503e1b449e45ca34add3696e0037b58e9d9aa91d 100644 (file)
@@ -4110,6 +4110,7 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
 
 
 static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+                                   gmx_runtime_t *runtime,
                                    t_nrnb *nrnb, t_inputrec *ir,
                                    gmx_large_int_t step)
 {
@@ -4124,6 +4125,7 @@ static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
     }
     ir->init_step = step;
     wallcycle_start(wcycle, ewcRUN);
+    runtime_start(runtime);
 }
 
 
@@ -4164,6 +4166,7 @@ static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
 int gmx_pmeonly(gmx_pme_t pme,
                 t_commrec *cr,    t_nrnb *nrnb,
                 gmx_wallcycle_t wcycle,
+                gmx_runtime_t *runtime,
                 real ewaldcoeff,
                 t_inputrec *ir)
 {
@@ -4219,7 +4222,7 @@ int gmx_pmeonly(gmx_pme_t pme,
             if (ret == pmerecvqxRESETCOUNTERS)
             {
                 /* Reset the cycle and flop counters */
-                reset_pmeonly_counters(wcycle, nrnb, ir, step);
+                reset_pmeonly_counters(wcycle, runtime, nrnb, ir, step);
             }
         }
         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
@@ -4235,6 +4238,7 @@ int gmx_pmeonly(gmx_pme_t pme,
         if (count == 0)
         {
             wallcycle_start(wcycle, ewcRUN);
+            runtime_start(runtime);
         }
 
         wallcycle_start(wcycle, ewcPMEMESH);
@@ -4256,6 +4260,8 @@ int gmx_pmeonly(gmx_pme_t pme,
     } /***** end of quasi-loop, we stop with the break above */
     while (TRUE);
 
+    runtime_end(runtime);
+
     return 0;
 }
 
index a5e558431c48968d301c3f0d43cb5f939f9aff7f..4c48b7266bd828dc97b5d65a55fc6ff6fff14a55 100644 (file)
@@ -422,7 +422,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             make_local_shells(cr, mdatoms, shellfc);
         }
 
-        init_bonded_thread_force_reduction(fr, &top->idef);
+        setup_bonded_threading(fr, &top->idef);
 
         if (ir->pull && PAR(cr))
         {
index 1936e1b37c4d5277a1d74d7d06c6a9c8c2dc3610..53117dcd1f45f0d8554ca52bdccf3e04a817dec8 100644 (file)
@@ -1592,7 +1592,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     else
     {
         /* do PME only */
-        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, ewaldcoeff, inputrec);
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, &runtime, ewaldcoeff, inputrec);
     }
 
     if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))