Sort all includes in src/gromacs
[alexxy/gromacs.git] / src/gromacs/mdlib/pme.c
index 924420ce0f3ad61516c82b2b172f58b9612330f6..67a12ab58db8d0a8f5b4dc4a2f53f75214da1568 100644
  * /Erik 001109
  */
 
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/pme.h"
+
 #include "config.h"
 
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "typedefs.h"
-#include "txtdump.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/utility/smalloc.h"
-#include "coulomb.h"
-#include "gromacs/utility/fatalerror.h"
-#include "pme.h"
-#include "network.h"
-#include "gromacs/math/units.h"
-#include "nrnb.h"
-#include "macros.h"
-
-#include "gromacs/legacyheaders/types/commrec.h"
 #include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/utility/futil.h"
 #include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
 #include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
 #include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
 
 /* Include the SIMD macro file and then check for support */
 #include "gromacs/simd/simd.h"
@@ -1434,7 +1436,7 @@ static void spread_coefficients_bsplines_thread(pmegrid_t                    *pm
 #define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(4);
 #endif
@@ -1443,7 +1445,7 @@ static void spread_coefficients_bsplines_thread(pmegrid_t                    *pm
 #ifdef PME_SIMD4_SPREAD_GATHER
 #define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(5);
 #endif
@@ -2597,7 +2599,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 #define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(4);
 #endif
@@ -2606,7 +2608,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 #ifdef PME_SIMD4_SPREAD_GATHER
 #define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(5);
 #endif
@@ -3131,8 +3133,9 @@ init_overlap_comm(pme_overlap_t *  ol,
     {
         /* s2g0 the local interpolation grid start.
          * s2g1 the local interpolation grid end.
-         * Because grid overlap communication only goes forward,
-         * the grid the slabs for fft's should be rounded down.
+         * Since in calc_pidx we divide particles, and not grid lines,
+         * spatially uniformly along dimension x or y, we need to round
+         * s2g0 down and s2g1 up.
          */
         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
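To make the rounding concrete, here is a small self-contained sketch in C. The values ndata = 25, nnodes = 4 and norder = 4 are hypothetical, chosen only to show how the integer arithmetic rounds s2g0 down and s2g1 up; the two assignment expressions are taken verbatim from the hunk above.

#include <stdio.h>

/* Hypothetical illustration of the s2g0/s2g1 rounding used in
 * init_overlap_comm(): integer division rounds s2g0 down and s2g1 up,
 * and s2g1 is extended by norder-1 lines for the spreading stencil.
 */
int main(void)
{
    const int ndata = 25, nnodes = 4, norder = 4; /* example values only */
    int       i;

    for (i = 0; i < nnodes; i++)
    {
        int s2g0 = ( i   *ndata + 0       )/nnodes;
        int s2g1 = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;

        /* e.g. i = 1: s2g0 = 25/4 = 6 (rounded down),
         *             s2g1 = 53/4 + 3 = 16 (rounded up, plus stencil)
         */
        printf("slab %d: s2g0 %2d  s2g1 %2d\n", i, s2g0, s2g1);
    }

    return 0;
}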
@@ -3834,7 +3837,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     int  fft_my, fft_mz;
     int  buf_my = -1;
     int  nsx, nsy, nsz;
-    ivec ne;
+    ivec localcopy_end;
     int  offx, offy, offz, x, y, z, i0, i0t;
     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
@@ -3870,8 +3873,14 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 
     for (d = 0; d < DIM; d++)
     {
-        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
-                    local_fft_ndata[d]);
+        /* Determine up to where our thread needs to copy from the
+         * thread-local charge spreading grid to the rank-local FFT grid.
+         * This is up to our spreading grid end minus order-1 and
+         * not beyond the local FFT grid.
+         */
+        localcopy_end[d] =
+            min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+                local_fft_ndata[d]);
     }
 
     offx = pmegrid->offset[XX];
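A purely hypothetical worked example of the min() above: a thread grid with offset[d] = 10, n[d] = 12 and order = 4 could spread up to line 10 + 12 - 3 = 19, but if the rank-local FFT grid has local_fft_ndata[d] = 16 lines, the copy stops at localcopy_end[d] = min(19, 16) = 16; a thread grid with offset 0 and the same n instead stops at min(9, 16) = 9, i.e. order-1 lines before its own end.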
@@ -3903,7 +3912,16 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
         ox       += pmegrid_g->offset[XX];
         /* Determine the end of our part of the source grid */
-        tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+        if (!bCommX)
+        {
+            /* Use our thread local source grid and target grid part */
+            tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
+        }
+        else
+        {
+            /* Use our thread local source grid and the spreading range */
+            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
+        }
 
         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
         {
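A hedged, purely numerical illustration of the two branches above (the y dimension below is handled the same way): with pme->pme_order = 4 and a wrapped source grid whose adjusted ox is -4 and whose n[XX] is 10, the local case gives tx1 = min(-4 + 10, localcopy_end[XX]), whereas with bCommX set tx1 = min(6, 4) = 4, i.e. tx1 is capped at the spreading range pme_order instead of at localcopy_end[XX]. The numbers are invented only to show the effect of the cap.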
@@ -3919,7 +3937,16 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
             oy       += pmegrid_g->offset[YY];
             /* Determine the end of our part of the source grid */
-            ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+            if (!bCommY)
+            {
+                /* Use our thread local source grid and target grid part */
+                ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
+            }
+            else
+            {
+                /* Use our thread local source grid and the spreading range */
+                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
+            }
 
             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
             {
@@ -3932,7 +3959,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                 }
                 pmegrid_g = &pmegrids->grid_th[fz];
                 oz       += pmegrid_g->offset[ZZ];
-                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+                tz1       = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
 
                 if (sx == 0 && sy == 0 && sz == 0)
                 {
@@ -3988,7 +4015,13 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                     if (bCommY)
                     {
                         commbuf = commbuf_y;
-                        buf_my  = ty1 - offy;
+                        /* The y-size of the communication buffer is set by
+                         * the overlap of the grid part of our local slab
+                         * with the part starting at the next slab.
+                         */
+                        buf_my  =
+                            pme->overlap[1].s2g1[pme->nodeid_minor] -
+                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                         if (bCommX)
                         {
                             /* We index commbuf modulo the local grid size */
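To connect this to the s2g0/s2g1 formulas earlier in this diff, a hypothetical example: with the illustrative values ndata = 25, nnodes = 4 and norder = 4 from the sketch above, slab 0 ends at s2g1[0] = 28/4 + 3 = 10 and slab 1 starts at s2g0[1] = 25/4 = 6, so for nodeid_minor = 0 the buffer would span buf_my = 10 - 6 = 4 lines of y-overlap, a size that no longer depends on how far ty1 reaches for the particular thread doing the copy.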
@@ -4101,6 +4134,12 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
             recvptr = overlap->recvbuf;
 
+            if (debug != NULL)
+            {
+                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+            }
+
 #ifdef GMX_MPI
             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                          send_id, ipulse,
@@ -4167,7 +4206,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
 
         if (debug != NULL)
         {
-            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+            fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
         }