Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / pme.c
index cd59ff79893816d5665f3a643489c04123bf2500..feeec50fb164d64966f1ac4cd0cae147c0e58a13 100644 (file)
@@ -263,7 +263,8 @@ typedef struct gmx_pme {
     MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 #endif
 
-    int        nthread;       /* The number of threads doing PME */
+    gmx_bool   bUseThreads;   /* Do any of the PME ranks have nthread>1? */
+    int        nthread;       /* The number of threads doing PME on our rank */
 
     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
     gmx_bool   bFEP;          /* Compute Free energy contribution */
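
The new bUseThreads flag is global across the PME communicator, while nthread stays rank-local. The distinction matters because some code paths below contain collective MPI calls; a hedged sketch (not part of the patch) of how the two flags are intended to be used:

    /* pme->bUseThreads is identical on every PME rank (set via an
     * MPI_Allreduce in gmx_pme_init, see below), so it may guard
     * collective operations. pme->nthread may differ between ranks
     * and should only guard purely rank-local work.
     */
    if (pme->bUseThreads)
    {
        /* thread-aware path, including the collective grid-overlap
         * communication: all PME ranks take this branch together
         */
    }
    if (pme->nthread > 1)
    {
        /* rank-local threaded work only */
    }
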
@@ -1703,6 +1704,7 @@ static void make_subgrid_division(const ivec n, int ovl, int nthread,
 static void pmegrids_init(pmegrids_t *grids,
                           int nx, int ny, int nz, int nz_base,
                           int pme_order,
+                          gmx_bool bUseThreads,
                           int nthread,
                           int overlap_x,
                           int overlap_y)
@@ -1725,7 +1727,7 @@ static void pmegrids_init(pmegrids_t *grids,
 
     make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
 
-    if (grids->nthread > 1)
+    if (bUseThreads)
     {
         ivec nst;
         int gridsize;
@@ -1775,6 +1777,10 @@ static void pmegrids_init(pmegrids_t *grids,
             }
         }
     }
+    else
+    {
+        grids->grid_th = NULL;
+    }
 
     snew(grids->g2t, DIM);
     tfac = 1;
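
With bUseThreads FALSE, grid_th is now explicitly set to NULL instead of being left unset, so later code can test the pointer rather than a thread count (see the reuse_pmegrids hunk further down). A minimal sketch of that consumer-side pattern, assuming grid_th is the only per-thread allocation:

    /* NULL acts as the sentinel for "no per-thread subgrids allocated" */
    if (grids->grid_th != NULL)
    {
        /* per-thread subgrids exist and may be reused or freed */
    }
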
@@ -3018,29 +3024,59 @@ static pme_spline_work_t *make_pme_spline_work(int order)
     return work;
 }
 
-static void
-gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
+void gmx_pme_check_restrictions(int pme_order,
+                                int nkx, int nky, int nkz,
+                                int nnodes_major,
+                                int nnodes_minor,
+                                gmx_bool bUseThreads,
+                                gmx_bool bFatal,
+                                gmx_bool *bValidSettings)
 {
-    int nk_new;
-
-    if (*nk % nnodes != 0)
+    if (pme_order > PME_ORDER_MAX)
     {
-        nk_new = nnodes*(*nk/nnodes + 1);
+        if (!bFatal)
+        {
+            *bValidSettings = FALSE;
+            return;
+        }
+        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+                  pme_order, PME_ORDER_MAX);
+    }
 
-        if (2*nk_new >= 3*(*nk))
+    if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
+        nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
+        nkz <= pme_order)
+    {
+        if (!bFatal)
         {
-            gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
-                      dim, *nk, dim, nnodes, dim);
+            *bValidSettings = FALSE;
+            return;
         }
+        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
+                  pme_order);
+    }
 
-        if (fplog != NULL)
+    /* Check for a limitation of the (current) sum_fftgrid_dd code.
+     * We only allow multiple communication pulses in dim 1, not in dim 0.
+     */
+    if (bUseThreads && (nkx < nnodes_major*pme_order &&
+                        nkx != nnodes_major*(pme_order - 1)))
+    {
+        if (!bFatal)
         {
-            fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
-                    dim, *nk, dim, nnodes, dim, nk_new, dim);
+            *bValidSettings = FALSE;
+            return;
         }
+        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+                  nkx/(double)nnodes_major, pme_order);
+    }
 
-        *nk = nk_new;
+    if (bValidSettings != NULL)
+    {
+        *bValidSettings = TRUE;
     }
+
+    return;
 }
 
 int gmx_pme_init(gmx_pme_t *         pmedata,
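
Exporting the checks as gmx_pme_check_restrictions lets callers either abort on a violation (bFatal = TRUE, as gmx_pme_init does below) or merely probe whether a candidate setup is usable. A hedged usage sketch of the non-fatal mode; the local variable names are illustrative only:

    gmx_bool bValid;

    /* Probe a candidate grid without terminating the run */
    gmx_pme_check_restrictions(pme_order,
                               nkx, nky, nkz,
                               nnodes_major, nnodes_minor,
                               bUseThreads,
                               FALSE,     /* bFatal */
                               &bValid);
    if (!bValid)
    {
        /* reject this grid and try a different one */
    }
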
@@ -3055,7 +3091,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
 {
     gmx_pme_t pme = NULL;
 
-    pme_atomcomm_t *atc;
+    int  use_threads, sum_use_threads;
     ivec ndata;
 
     if (debug)
@@ -3155,6 +3191,21 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
 
     pme->nthread = nthread;
 
+    /* Check if any of the PME MPI ranks uses threads */
+    use_threads = (pme->nthread > 1 ? 1 : 0);
+#ifdef GMX_MPI
+    if (pme->nnodes > 1)
+    {
+        MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
+                      MPI_SUM, pme->mpi_comm);
+    }
+    else
+#endif
+    {
+        sum_use_threads = use_threads;
+    }
+    pme->bUseThreads = (sum_use_threads > 0);
+
     if (ir->ePBC == epbcSCREW)
     {
         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
@@ -3168,37 +3219,14 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     pme->pme_order   = ir->pme_order;
     pme->epsilon_r   = ir->epsilon_r;
 
-    if (pme->pme_order > PME_ORDER_MAX)
-    {
-        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
-                  pme->pme_order, PME_ORDER_MAX);
-    }
-
-    /* Currently pme.c supports only the fft5d FFT code.
-     * Therefore the grid always needs to be divisible by nnodes.
-     * When the old 1D code is also supported again, change this check.
-     *
-     * This check should be done before calling gmx_pme_init
-     * and fplog should be passed iso stderr.
-     *
-       if (pme->ndecompdim >= 2)
-     */
-    if (pme->ndecompdim >= 1)
-    {
-        /*
-           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
-                                        'x',nnodes_major,&pme->nkx);
-           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
-                                        'y',nnodes_minor,&pme->nky);
-         */
-    }
-
-    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
-        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
-        pme->nkz <= pme->pme_order)
-    {
-        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
-    }
+    /* If we violate restrictions, generate a fatal error here */
+    gmx_pme_check_restrictions(pme->pme_order,
+                               pme->nkx, pme->nky, pme->nkz,
+                               pme->nnodes_major,
+                               pme->nnodes_minor,
+                               pme->bUseThreads,
+                               TRUE,
+                               NULL);
 
     if (pme->nnodes > 1)
     {
@@ -3256,14 +3284,12 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                       pme->nky,
                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
 
-    /* Check for a limitation of the (current) sum_fftgrid_dd code.
-     * We only allow multiple communication pulses in dim 1, not in dim 0.
+    /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
+     * Note that gmx_pme_check_restrictions checked for this already.
      */
-    if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
-                             pme->nkx < pme->nnodes_major*pme->pme_order))
+    if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
     {
-        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
-                  pme->nkx/(double)pme->nnodes_major, pme->pme_order);
+        gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
     }
 
     snew(pme->bsp_mod[XX], pme->nkx);
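
The replaced check can be read as: with threads in use, each rank along the major dimension must own at least pme_order grid lines, or exactly pme_order-1, so that the x overlap fits in a single communication pulse. A worked example with assumed values pme_order = 4 and nnodes_major = 8:

    /* Accepted:  nkx >= 8*4 = 32, or nkx == 8*(4-1) = 24.
     * Rejected:  nkx = 25..31 and nkx <= 23, because more than one
     *            communication pulse along x would then be required,
     *            which gmx_pme_check_restrictions forbids when
     *            bUseThreads is TRUE.
     */
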
@@ -3302,6 +3328,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                   pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
                   pme->pmegrid_nz_base,
                   pme->pme_order,
+                  pme->bUseThreads,
                   pme->nthread,
                   pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
                   pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
@@ -3325,6 +3352,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                       pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
                       pme->pmegrid_nz_base,
                       pme->pme_order,
+                      pme->bUseThreads,
                       pme->nthread,
                       pme->nkx % pme->nnodes_major != 0,
                       pme->nky % pme->nnodes_minor != 0);
@@ -3398,7 +3426,7 @@ static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
     sfree_aligned(new->grid.grid);
     new->grid.grid = old->grid.grid;
 
-    if (new->nthread > 1 && new->nthread == old->nthread)
+    if (new->grid_th != NULL && new->nthread == old->nthread)
     {
         sfree_aligned(new->grid_all);
         for (t = 0; t < new->nthread; t++)
@@ -3929,22 +3957,34 @@ static void spread_on_grid(gmx_pme_t pme,
     for (thread = 0; thread < nthread; thread++)
     {
         splinedata_t *spline;
-        pmegrid_t *grid;
+        pmegrid_t *grid = NULL;
 
         /* make local bsplines  */
-        if (grids == NULL || grids->nthread == 1)
+        if (grids == NULL || !pme->bUseThreads)
         {
             spline = &atc->spline[0];
 
             spline->n = atc->n;
 
-            grid = &grids->grid;
+            if (bSpread)
+            {
+                grid = &grids->grid;
+            }
         }
         else
         {
             spline = &atc->spline[thread];
 
-            make_thread_local_ind(atc, thread, spline);
+            if (grids->nthread == 1)
+            {
+                /* One thread, we operate on all charges */
+                spline->n = atc->n;
+            }
+            else
+            {
+                /* Get the indices our thread should operate on */
+                make_thread_local_ind(atc, thread, spline);
+            }
 
             grid = &grids->grid_th[thread];
         }
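
After this change the per-thread loop distinguishes three cases instead of two; a condensed sketch of the selection logic (a paraphrase, not a verbatim copy of the code above):

    /* 1) grids == NULL or !pme->bUseThreads:
     *      spline[0] covers all charges; the grid pointer is only set
     *      when we actually spread (bSpread).
     * 2) pme->bUseThreads and grids->nthread == 1:
     *      this rank runs a single thread, but threading is active
     *      elsewhere; spline[thread] still covers all charges and
     *      spreading goes to grid_th[thread].
     * 3) pme->bUseThreads and grids->nthread > 1:
     *      make_thread_local_ind() gives each thread its own charge
     *      range and its own thread-local grid grid_th[thread].
     */
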
@@ -3963,7 +4003,7 @@ static void spread_on_grid(gmx_pme_t pme,
 #endif
             spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
 
-            if (grids->nthread > 1)
+            if (pme->bUseThreads)
             {
                 copy_local_grid(pme, grids, thread, fftgrid);
             }
@@ -3978,7 +4018,7 @@ static void spread_on_grid(gmx_pme_t pme,
     cs2 += (double)c2;
 #endif
 
-    if (bSpread && grids->nthread > 1)
+    if (bSpread && pme->bUseThreads)
     {
 #ifdef PME_TIME_THREADS
         c3 = omp_cyc_start();
@@ -3998,7 +4038,10 @@ static void spread_on_grid(gmx_pme_t pme,
 
         if (pme->nnodes > 1)
         {
-            /* Communicate the overlapping part of the fftgrid */
+            /* Communicate the overlapping part of the fftgrid.
+             * For this communication call we need to check pme->bUseThreads
+             * to have all ranks communicate here, regardless of pme->nthread.
+             */
             sum_fftgrid_dd(pme, fftgrid);
         }
     }
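
The comment added here states the central reason for the bUseThreads flag: sum_fftgrid_dd is collective over the PME communicator, so the decision to call it must be identical on every rank. A minimal illustration of the failure mode it prevents, with hypothetical thread counts:

    /* Suppose rank 0 runs nthread = 2 and rank 1 runs nthread = 1.
     * With the old rank-local test
     *
     *     if (bSpread && grids->nthread > 1) { ... sum_fftgrid_dd(pme, fftgrid); }
     *
     * rank 0 would enter the collective call while rank 1 skips it,
     * and both would block. Testing pme->bUseThreads, made identical
     * on all ranks by the MPI_Allreduce in gmx_pme_init, keeps the
     * ranks in agreement.
     */
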
@@ -4093,6 +4136,7 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
     /* We only use the A-charges grid */
     grid = &pme->pmegridA;
 
+    /* Only calculate the spline coefficients, don't actually spread */
     spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
 
     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
@@ -4408,7 +4452,7 @@ int gmx_pme_do(gmx_pme_t pme,
             inc_nrnb(nrnb, eNR_SPREADQBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 
-            if (pme->nthread == 1)
+            if (!pme->bUseThreads)
             {
                 wrap_periodic_pmegrid(pme, grid);