src/gromacs/ewald/pme_spread.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pme_spread.h"
  40
  41 #include "config.h"
  42
  43 #include <cassert>
  44
  45 #include <algorithm>
  46
  47 #include "gromacs/ewald/pme.h"
  48 #include "gromacs/fft/parallel_3dfft.h"
  49 #include "gromacs/simd/simd.h"
  50 #include "gromacs/utility/basedefinitions.h"
  51 #include "gromacs/utility/exceptions.h"
  52 #include "gromacs/utility/fatalerror.h"
  53 #include "gromacs/utility/smalloc.h"
  54
  55 #include "pme_grid.h"
  56 #include "pme_internal.h"
  57 #include "pme_simd.h"
  58 #include "pme_spline_work.h"
  59
  60 /* TODO consider split of pme-spline from this file */
  61
  62 static void calc_interpolation_idx(const gmx_pme_t *pme, PmeAtomComm *atc,
  63                                    int start, int grid_index, int end, int thread)
  64 {
  65     int             i;
  66     int            *idxptr, tix, tiy, tiz;
  67     const real     *xptr;
  68     real           *fptr, tx, ty, tz;
  69     real            rxx, ryx, ryy, rzx, rzy, rzz;
  70     int             nx, ny, nz;
  71     int            *g2tx, *g2ty, *g2tz;
  72     gmx_bool        bThreads;
  73     int            *thread_idx = nullptr;
  74     int            *tpl_n      = nullptr;
  75     int             thread_i;
  76
  77     nx  = pme->nkx;
  78     ny  = pme->nky;
  79     nz  = pme->nkz;
  80
  81     rxx = pme->recipbox[XX][XX];
  82     ryx = pme->recipbox[YY][XX];
  83     ryy = pme->recipbox[YY][YY];
  84     rzx = pme->recipbox[ZZ][XX];
  85     rzy = pme->recipbox[ZZ][YY];
  86     rzz = pme->recipbox[ZZ][ZZ];
  87
  88     g2tx = pme->pmegrid[grid_index].g2t[XX];
  89     g2ty = pme->pmegrid[grid_index].g2t[YY];
  90     g2tz = pme->pmegrid[grid_index].g2t[ZZ];
  91
  92     bThreads = (atc->nthread > 1);
  93     if (bThreads)
  94     {
  95         thread_idx = atc->thread_idx.data();
  96
  97         tpl_n = atc->threadMap[thread].n;
  98         for (i = 0; i < atc->nthread; i++)
  99         {
 100             tpl_n[i] = 0;
 101         }
 102     }
 103
 104     const real shift = c_pmeMaxUnitcellShift;
 105
 106     for (i = start; i < end; i++)
 107     {
 108         xptr   = atc->x[i];
 109         idxptr = atc->idx[i];
 110         fptr   = atc->fractx[i];
 111
 112         /* Fractional coordinates along box vectors, add a positive shift to ensure tx/ty/tz are positive for triclinic boxes */
 113         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + shift );
 114         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + shift );
 115         tz = nz * (                                   xptr[ZZ] * rzz + shift );
 116
 117         tix = static_cast<int>(tx);
 118         tiy = static_cast<int>(ty);
 119         tiz = static_cast<int>(tz);
 120
 121 #ifdef DEBUG
 122         range_check(tix, 0, c_pmeNeighborUnitcellCount * nx);
 123         range_check(tiy, 0, c_pmeNeighborUnitcellCount * ny);
 124         range_check(tiz, 0, c_pmeNeighborUnitcellCount * nz);
 125 #endif
 126         /* Because decomposition only occurs in x and y,
 127          * we never have a fraction correction in z.
 128          */
 129         fptr[XX] = tx - tix + pme->fshx[tix];
 130         fptr[YY] = ty - tiy + pme->fshy[tiy];
 131         fptr[ZZ] = tz - tiz;
 132
 133         idxptr[XX] = pme->nnx[tix];
 134         idxptr[YY] = pme->nny[tiy];
 135         idxptr[ZZ] = pme->nnz[tiz];
 136
 137 #ifdef DEBUG
 138         range_check(idxptr[XX], 0, pme->pmegrid_nx);
 139         range_check(idxptr[YY], 0, pme->pmegrid_ny);
 140         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
 141 #endif
 142
 143         if (bThreads)
 144         {
 145             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 146             thread_idx[i] = thread_i;
 147             tpl_n[thread_i]++;
 148         }
 149     }
 150
 151     if (bThreads)
 152     {
 153         /* Make a list of particle indices sorted on thread */
 154
 155         /* Get the cumulative count */
 156         for (i = 1; i < atc->nthread; i++)
 157         {
 158             tpl_n[i] += tpl_n[i-1];
 159         }
 160         /* The current implementation distributes particles equally
 161          * over the threads, so we could actually allocate for that
 162          * in pme_realloc_atomcomm_things.
 163          */
 164         AtomToThreadMap &threadMap = atc->threadMap[thread];
 165         threadMap.i.resize(tpl_n[atc->nthread - 1]);
 166         /* Set tpl_n to the cumulative start */
 167         for (i = atc->nthread-1; i >= 1; i--)
 168         {
 169             tpl_n[i] = tpl_n[i-1];
 170         }
 171         tpl_n[0] = 0;
 172
 173         /* Fill our thread local array with indices sorted on thread */
 174         for (i = start; i < end; i++)
 175         {
 176             threadMap.i[tpl_n[atc->thread_idx[i]]++] = i;
 177         }
 178         /* Now tpl_n contains the cummulative count again */
 179     }
 180 }
 181
 182 static void make_thread_local_ind(const PmeAtomComm *atc,
 183                                   int thread, splinedata_t *spline)
 184 {
 185     int             n, t, i, start, end;
 186
 187     /* Combine the indices made by each thread into one index */
 188
 189     n     = 0;
 190     start = 0;
 191     for (t = 0; t < atc->nthread; t++)
 192     {
 193         const AtomToThreadMap &threadMap = atc->threadMap[t];
 194         /* Copy our part (start - end) from the list of thread t */
 195         if (thread > 0)
 196         {
 197             start = threadMap.n[thread-1];
 198         }
 199         end = threadMap.n[thread];
 200         for (i = start; i < end; i++)
 201         {
 202             spline->ind[n++] = threadMap.i[i];
 203         }
 204     }
 205
 206     spline->n = n;
 207 }
 208
 209 // At run time, the values of order used and asserted upon mean that
 210 // indexing out of bounds does not occur. However compilers don't
 211 // always understand that, so we suppress this warning for this code
 212 // region.
 213 #pragma GCC diagnostic push
 214 #pragma GCC diagnostic ignored "-Warray-bounds"
 215
 216 /* Macro to force loop unrolling by fixing order.
 217  * This gives a significant performance gain.
 218  */
 219 #define CALC_SPLINE(order)                     \
 220     {                                              \
 221         for (int j = 0; (j < DIM); j++)            \
 222         {                                          \
 223             real dr, div;                          \
 224             real data[PME_ORDER_MAX];              \
 225                                                    \
 226             dr  = xptr[j];                         \
 227                                                \
 228             /* dr is relative offset from lower cell limit */ \
 229             data[(order)-1] = 0;                     \
 230             data[1]         = dr;                          \
 231             data[0]         = 1 - dr;                      \
 232                                                \
 233             for (int k = 3; (k < (order)); k++)      \
 234             {                                      \
 235                 div       = 1.0/(k - 1.0);               \
 236                 data[k-1] = div*dr*data[k-2];      \
 237                 for (int l = 1; (l < (k-1)); l++)  \
 238                 {                                  \
 239                     data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
 240                                        data[k-l-1]);                \
 241                 }                                  \
 242                 data[0] = div*(1-dr)*data[0];      \
 243             }                                      \
 244             /* differentiate */                    \
 245             dtheta[j][i*(order)+0] = -data[0];       \
 246             for (int k = 1; (k < (order)); k++)      \
 247             {                                      \
 248                 dtheta[j][i*(order)+k] = data[k-1] - data[k]; \
 249             }                                      \
 250                                                \
 251             div             = 1.0/((order) - 1);                 \
 252             data[(order)-1] = div*dr*data[(order)-2];  \
 253             for (int l = 1; (l < ((order)-1)); l++)  \
 254             {                                      \
 255                 data[(order)-l-1] = div*((dr+l)*data[(order)-l-2]+    \
 256                                          ((order)-l-dr)*data[(order)-l-1]); \
 257             }                                      \
 258             data[0] = div*(1 - dr)*data[0];        \
 259                                                \
 260             for (int k = 0; k < (order); k++)        \
 261             {                                      \
 262                 theta[j][i*(order)+k]  = data[k];    \
 263             }                                      \
 264         }                                          \
 265     }
 266
 267 static void make_bsplines(splinevec theta, splinevec dtheta, int order,
 268                           rvec fractx[], int nr, const int ind[], const real coefficient[],
 269                           gmx_bool bDoSplines)
 270 {
 271     /* construct splines for local atoms */
 272     int   i, ii;
 273     real *xptr;
 274
 275     for (i = 0; i < nr; i++)
 276     {
 277         /* With free energy we do not use the coefficient check.
 278          * In most cases this will be more efficient than calling make_bsplines
 279          * twice, since usually more than half the particles have non-zero coefficients.
 280          */
 281         ii = ind[i];
 282         if (bDoSplines || coefficient[ii] != 0.0)
 283         {
 284             xptr = fractx[ii];
 285             assert(order >= 3 && order <= PME_ORDER_MAX);
 286             switch (order)
 287             {
 288                 case 4:  CALC_SPLINE(4);     break;
 289                 case 5:  CALC_SPLINE(5);     break;
 290                 default: CALC_SPLINE(order); break;
 291             }
 292         }
 293     }
 294 }
 295
 296 #pragma GCC diagnostic pop
 297
 298 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 299 #define DO_BSPLINE(order)                            \
 300     for (ithx = 0; (ithx < (order)); ithx++)                    \
 301     {                                                    \
 302         index_x = (i0+ithx)*pny*pnz;                     \
 303         valx    = coefficient*thx[ithx];                          \
 304                                                      \
 305         for (ithy = 0; (ithy < (order)); ithy++)                \
 306         {                                                \
 307             valxy    = valx*thy[ithy];                   \
 308             index_xy = index_x+(j0+ithy)*pnz;            \
 309                                                      \
 310             for (ithz = 0; (ithz < (order)); ithz++)            \
 311             {                                            \
 312                 index_xyz        = index_xy+(k0+ithz);   \
 313                 grid[index_xyz] += valxy*thz[ithz];      \
 314             }                                            \
 315         }                                                \
 316     }
 317
 318
 319 static void spread_coefficients_bsplines_thread(const pmegrid_t                   *pmegrid,
 320                                                 const PmeAtomComm                 *atc,
 321                                                 splinedata_t                      *spline,
 322                                                 struct pme_spline_work gmx_unused *work)
 323 {
 324
 325     /* spread coefficients from home atoms to local grid */
 326     real          *grid;
 327     int            i, nn, n, ithx, ithy, ithz, i0, j0, k0;
 328     const int     *idxptr;
 329     int            order, norder, index_x, index_xy, index_xyz;
 330     real           valx, valxy, coefficient;
 331     real          *thx, *thy, *thz;
 332     int            pnx, pny, pnz, ndatatot;
 333     int            offx, offy, offz;
 334
 335 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
 336     alignas(GMX_SIMD_ALIGNMENT) real  thz_aligned[GMX_SIMD4_WIDTH*2];
 337 #endif
 338
 339     pnx = pmegrid->s[XX];
 340     pny = pmegrid->s[YY];
 341     pnz = pmegrid->s[ZZ];
 342
 343     offx = pmegrid->offset[XX];
 344     offy = pmegrid->offset[YY];
 345     offz = pmegrid->offset[ZZ];
 346
 347     ndatatot = pnx*pny*pnz;
 348     grid     = pmegrid->grid;
 349     for (i = 0; i < ndatatot; i++)
 350     {
 351         grid[i] = 0;
 352     }
 353
 354     order = pmegrid->order;
 355
 356     for (nn = 0; nn < spline->n; nn++)
 357     {
 358         n           = spline->ind[nn];
 359         coefficient = atc->coefficient[n];
 360
 361         if (coefficient != 0)
 362         {
 363             idxptr = atc->idx[n];
 364             norder = nn*order;
 365
 366             i0   = idxptr[XX] - offx;
 367             j0   = idxptr[YY] - offy;
 368             k0   = idxptr[ZZ] - offz;
 369
 370             thx = spline->theta.coefficients[XX] + norder;
 371             thy = spline->theta.coefficients[YY] + norder;
 372             thz = spline->theta.coefficients[ZZ] + norder;
 373
 374             switch (order)
 375             {
 376                 case 4:
 377 #ifdef PME_SIMD4_SPREAD_GATHER
 378 #ifdef PME_SIMD4_UNALIGNED
 379 #define PME_SPREAD_SIMD4_ORDER4
 380 #else
 381 #define PME_SPREAD_SIMD4_ALIGNED
 382 #define PME_ORDER 4
 383 #endif
 384 #include "pme_simd4.h"
 385 #else
 386                     DO_BSPLINE(4);
 387 #endif
 388                     break;
 389                 case 5:
 390 #ifdef PME_SIMD4_SPREAD_GATHER
 391 #define PME_SPREAD_SIMD4_ALIGNED
 392 #define PME_ORDER 5
 393 #include "pme_simd4.h"
 394 #else
 395                     DO_BSPLINE(5);
 396 #endif
 397                     break;
 398                 default:
 399                     DO_BSPLINE(order);
 400                     break;
 401             }
 402         }
 403     }
 404 }
 405
 406 static void copy_local_grid(const gmx_pme_t *pme, const pmegrids_t *pmegrids,
 407                             int grid_index, int thread, real *fftgrid)
 408 {
 409     ivec local_fft_ndata, local_fft_offset, local_fft_size;
 410     int  fft_my, fft_mz;
 411     int  nsy, nsz;
 412     ivec nf;
 413     int  offx, offy, offz, x, y, z, i0, i0t;
 414     int  d;
 415     real *grid_th;
 416
 417     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 418                                    local_fft_ndata,
 419                                    local_fft_offset,
 420                                    local_fft_size);
 421     fft_my = local_fft_size[YY];
 422     fft_mz = local_fft_size[ZZ];
 423
 424     const pmegrid_t *pmegrid = &pmegrids->grid_th[thread];
 425
 426     nsy = pmegrid->s[YY];
 427     nsz = pmegrid->s[ZZ];
 428
 429     for (d = 0; d < DIM; d++)
 430     {
 431         nf[d] = std::min(pmegrid->n[d] - (pmegrid->order - 1),
 432                          local_fft_ndata[d] - pmegrid->offset[d]);
 433     }
 434
 435     offx = pmegrid->offset[XX];
 436     offy = pmegrid->offset[YY];
 437     offz = pmegrid->offset[ZZ];
 438
 439     /* Directly copy the non-overlapping parts of the local grids.
 440      * This also initializes the full grid.
 441      */
 442     grid_th = pmegrid->grid;
 443     for (x = 0; x < nf[XX]; x++)
 444     {
 445         for (y = 0; y < nf[YY]; y++)
 446         {
 447             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
 448             i0t = (x*nsy + y)*nsz;
 449             for (z = 0; z < nf[ZZ]; z++)
 450             {
 451                 fftgrid[i0+z] = grid_th[i0t+z];
 452             }
 453         }
 454     }
 455 }
 456
 457 static void
 458 reduce_threadgrid_overlap(const gmx_pme_t *pme,
 459                           const pmegrids_t *pmegrids, int thread,
 460                           real *fftgrid, real *commbuf_x, real *commbuf_y,
 461                           int grid_index)
 462 {
 463     ivec local_fft_ndata, local_fft_offset, local_fft_size;
 464     int  fft_nx, fft_ny, fft_nz;
 465     int  fft_my, fft_mz;
 466     int  buf_my = -1;
 467     int  nsy, nsz;
 468     ivec localcopy_end, commcopy_end;
 469     int  offx, offy, offz, x, y, z, i0, i0t;
 470     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
 471     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
 472     gmx_bool bCommX, bCommY;
 473     int  d;
 474     int  thread_f;
 475     const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
 476     const real *grid_th;
 477     real *commbuf = nullptr;
 478
 479     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 480                                    local_fft_ndata,
 481                                    local_fft_offset,
 482                                    local_fft_size);
 483     fft_nx = local_fft_ndata[XX];
 484     fft_ny = local_fft_ndata[YY];
 485     fft_nz = local_fft_ndata[ZZ];
 486
 487     fft_my = local_fft_size[YY];
 488     fft_mz = local_fft_size[ZZ];
 489
 490     /* This routine is called when all thread have finished spreading.
 491      * Here each thread sums grid contributions calculated by other threads
 492      * to the thread local grid volume.
 493      * To minimize the number of grid copying operations,
 494      * this routines sums immediately from the pmegrid to the fftgrid.
 495      */
 496
 497     /* Determine which part of the full node grid we should operate on,
 498      * this is our thread local part of the full grid.
 499      */
 500     pmegrid = &pmegrids->grid_th[thread];
 501
 502     for (d = 0; d < DIM; d++)
 503     {
 504         /* Determine up to where our thread needs to copy from the
 505          * thread-local charge spreading grid to the rank-local FFT grid.
 506          * This is up to our spreading grid end minus order-1 and
 507          * not beyond the local FFT grid.
 508          */
 509         localcopy_end[d] =
 510             std::min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
 511                      local_fft_ndata[d]);
 512
 513         /* Determine up to where our thread needs to copy from the
 514          * thread-local charge spreading grid to the communication buffer.
 515          * Note: only relevant with communication, ignored otherwise.
 516          */
 517         commcopy_end[d]  = localcopy_end[d];
 518         if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
 519         {
 520             /* The last thread should copy up to the last pme grid line.
 521              * When the rank-local FFT grid is narrower than pme-order,
 522              * we need the max below to ensure copying of all data.
 523              */
 524             commcopy_end[d] = std::max(commcopy_end[d], pme->pme_order);
 525         }
 526     }
 527
 528     offx = pmegrid->offset[XX];
 529     offy = pmegrid->offset[YY];
 530     offz = pmegrid->offset[ZZ];
 531
 532
 533     bClearBufX  = TRUE;
 534     bClearBufY  = TRUE;
 535     bClearBufXY = TRUE;
 536
 537     /* Now loop over all the thread data blocks that contribute
 538      * to the grid region we (our thread) are operating on.
 539      */
 540     /* Note that fft_nx/y is equal to the number of grid points
 541      * between the first point of our node grid and the one of the next node.
 542      */
 543     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
 544     {
 545         fx     = pmegrid->ci[XX] + sx;
 546         ox     = 0;
 547         bCommX = FALSE;
 548         if (fx < 0)
 549         {
 550             fx    += pmegrids->nc[XX];
 551             ox    -= fft_nx;
 552             bCommX = (pme->nnodes_major > 1);
 553         }
 554         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
 555         ox       += pmegrid_g->offset[XX];
 556         /* Determine the end of our part of the source grid.
 557          * Use our thread local source grid and target grid part
 558          */
 559         tx1 = std::min(ox + pmegrid_g->n[XX],
 560                        !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
 561
 562         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
 563         {
 564             fy     = pmegrid->ci[YY] + sy;
 565             oy     = 0;
 566             bCommY = FALSE;
 567             if (fy < 0)
 568             {
 569                 fy    += pmegrids->nc[YY];
 570                 oy    -= fft_ny;
 571                 bCommY = (pme->nnodes_minor > 1);
 572             }
 573             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
 574             oy       += pmegrid_g->offset[YY];
 575             /* Determine the end of our part of the source grid.
 576              * Use our thread local source grid and target grid part
 577              */
 578             ty1 = std::min(oy + pmegrid_g->n[YY],
 579                            !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
 580
 581             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
 582             {
 583                 fz = pmegrid->ci[ZZ] + sz;
 584                 oz = 0;
 585                 if (fz < 0)
 586                 {
 587                     fz += pmegrids->nc[ZZ];
 588                     oz -= fft_nz;
 589                 }
 590                 pmegrid_g = &pmegrids->grid_th[fz];
 591                 oz       += pmegrid_g->offset[ZZ];
 592                 tz1       = std::min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
 593
 594                 if (sx == 0 && sy == 0 && sz == 0)
 595                 {
 596                     /* We have already added our local contribution
 597                      * before calling this routine, so skip it here.
 598                      */
 599                     continue;
 600                 }
 601
 602                 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
 603
 604                 pmegrid_f = &pmegrids->grid_th[thread_f];
 605
 606                 grid_th = pmegrid_f->grid;
 607
 608                 nsy = pmegrid_f->s[YY];
 609                 nsz = pmegrid_f->s[ZZ];
 610
 611 #ifdef DEBUG_PME_REDUCE
 612                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
 613                        pme->nodeid, thread, thread_f,
 614                        pme->pmegrid_start_ix,
 615                        pme->pmegrid_start_iy,
 616                        pme->pmegrid_start_iz,
 617                        sx, sy, sz,
 618                        offx-ox, tx1-ox, offx, tx1,
 619                        offy-oy, ty1-oy, offy, ty1,
 620                        offz-oz, tz1-oz, offz, tz1);
 621 #endif
 622
 623                 if (!(bCommX || bCommY))
 624                 {
 625                     /* Copy from the thread local grid to the node grid */
 626                     for (x = offx; x < tx1; x++)
 627                     {
 628                         for (y = offy; y < ty1; y++)
 629                         {
 630                             i0  = (x*fft_my + y)*fft_mz;
 631                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 632                             for (z = offz; z < tz1; z++)
 633                             {
 634                                 fftgrid[i0+z] += grid_th[i0t+z];
 635                             }
 636                         }
 637                     }
 638                 }
 639                 else
 640                 {
 641                     /* The order of this conditional decides
 642                      * where the corner volume gets stored with x+y decomp.
 643                      */
 644                     if (bCommY)
 645                     {
 646                         commbuf = commbuf_y;
 647                         /* The y-size of the communication buffer is set by
 648                          * the overlap of the grid part of our local slab
 649                          * with the part starting at the next slab.
 650                          */
 651                         buf_my  =
 652                             pme->overlap[1].s2g1[pme->nodeid_minor] -
 653                             pme->overlap[1].s2g0[pme->nodeid_minor+1];
 654                         if (bCommX)
 655                         {
 656                             /* We index commbuf modulo the local grid size */
 657                             commbuf += buf_my*fft_nx*fft_nz;
 658
 659                             bClearBuf   = bClearBufXY;
 660                             bClearBufXY = FALSE;
 661                         }
 662                         else
 663                         {
 664                             bClearBuf  = bClearBufY;
 665                             bClearBufY = FALSE;
 666                         }
 667                     }
 668                     else
 669                     {
 670                         commbuf    = commbuf_x;
 671                         buf_my     = fft_ny;
 672                         bClearBuf  = bClearBufX;
 673                         bClearBufX = FALSE;
 674                     }
 675
 676                     /* Copy to the communication buffer */
 677                     for (x = offx; x < tx1; x++)
 678                     {
 679                         for (y = offy; y < ty1; y++)
 680                         {
 681                             i0  = (x*buf_my + y)*fft_nz;
 682                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 683
 684                             if (bClearBuf)
 685                             {
 686                                 /* First access of commbuf, initialize it */
 687                                 for (z = offz; z < tz1; z++)
 688                                 {
 689                                     commbuf[i0+z]  = grid_th[i0t+z];
 690                                 }
 691                             }
 692                             else
 693                             {
 694                                 for (z = offz; z < tz1; z++)
 695                                 {
 696                                     commbuf[i0+z] += grid_th[i0t+z];
 697                                 }
 698                             }
 699                         }
 700                     }
 701                 }
 702             }
 703         }
 704     }
 705 }
 706
 707
 708 static void sum_fftgrid_dd(const gmx_pme_t *pme, real *fftgrid, int grid_index)
 709 {
 710     ivec local_fft_ndata, local_fft_offset, local_fft_size;
 711     int  send_index0, send_nindex;
 712     int  recv_nindex;
 713 #if GMX_MPI
 714     MPI_Status stat;
 715 #endif
 716     int  recv_size_y;
 717     int  size_yx;
 718     int  x, y, z, indg, indb;
 719
 720     /* Note that this routine is only used for forward communication.
 721      * Since the force gathering, unlike the coefficient spreading,
 722      * can be trivially parallelized over the particles,
 723      * the backwards process is much simpler and can use the "old"
 724      * communication setup.
 725      */
 726
 727     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 728                                    local_fft_ndata,
 729                                    local_fft_offset,
 730                                    local_fft_size);
 731
 732     if (pme->nnodes_minor > 1)
 733     {
 734         /* Major dimension */
 735         const pme_overlap_t *overlap = &pme->overlap[1];
 736
 737         if (pme->nnodes_major > 1)
 738         {
 739             size_yx = pme->overlap[0].comm_data[0].send_nindex;
 740         }
 741         else
 742         {
 743             size_yx = 0;
 744         }
 745 #if GMX_MPI
 746         int datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
 747
 748         int send_size_y = overlap->send_size;
 749 #endif
 750
 751         for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
 752         {
 753             send_index0   =
 754                 overlap->comm_data[ipulse].send_index0 -
 755                 overlap->comm_data[0].send_index0;
 756             send_nindex   = overlap->comm_data[ipulse].send_nindex;
 757             /* We don't use recv_index0, as we always receive starting at 0 */
 758             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 759             recv_size_y   = overlap->comm_data[ipulse].recv_size;
 760
 761             auto *sendptr = const_cast<real *>(overlap->sendbuf.data()) + send_index0 * local_fft_ndata[ZZ];
 762             auto *recvptr = const_cast<real *>(overlap->recvbuf.data());
 763
 764             if (debug != nullptr)
 765             {
 766                 fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
 767                         local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
 768             }
 769
 770 #if GMX_MPI
 771             int send_id = overlap->comm_data[ipulse].send_id;
 772             int recv_id = overlap->comm_data[ipulse].recv_id;
 773             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
 774                          send_id, ipulse,
 775                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
 776                          recv_id, ipulse,
 777                          overlap->mpi_comm, &stat);
 778 #endif
 779
 780             for (x = 0; x < local_fft_ndata[XX]; x++)
 781             {
 782                 for (y = 0; y < recv_nindex; y++)
 783                 {
 784                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
 785                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
 786                     for (z = 0; z < local_fft_ndata[ZZ]; z++)
 787                     {
 788                         fftgrid[indg+z] += recvptr[indb+z];
 789                     }
 790                 }
 791             }
 792
 793             if (pme->nnodes_major > 1)
 794             {
 795                 /* Copy from the received buffer to the send buffer for dim 0 */
 796                 sendptr = const_cast<real *>(pme->overlap[0].sendbuf.data());
 797                 for (x = 0; x < size_yx; x++)
 798                 {
 799                     for (y = 0; y < recv_nindex; y++)
 800                     {
 801                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 802                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
 803                         for (z = 0; z < local_fft_ndata[ZZ]; z++)
 804                         {
 805                             sendptr[indg+z] += recvptr[indb+z];
 806                         }
 807                     }
 808                 }
 809             }
 810         }
 811     }
 812
 813     /* We only support a single pulse here.
 814      * This is not a severe limitation, as this code is only used
 815      * with OpenMP and with OpenMP the (PME) domains can be larger.
 816      */
 817     if (pme->nnodes_major > 1)
 818     {
 819         /* Major dimension */
 820         const pme_overlap_t *overlap = &pme->overlap[0];
 821
 822         size_t ipulse = 0;
 823
 824         send_nindex   = overlap->comm_data[ipulse].send_nindex;
 825         /* We don't use recv_index0, as we always receive starting at 0 */
 826         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 827
 828         if (debug != nullptr)
 829         {
 830             fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
 831                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
 832         }
 833
 834 #if GMX_MPI
 835         int datasize  = local_fft_ndata[YY]*local_fft_ndata[ZZ];
 836         int send_id   = overlap->comm_data[ipulse].send_id;
 837         int recv_id   = overlap->comm_data[ipulse].recv_id;
 838         auto *sendptr = const_cast<real *>(overlap->sendbuf.data());
 839         auto *recvptr = const_cast<real *>(overlap->recvbuf.data());
 840         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
 841                      send_id, ipulse,
 842                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
 843                      recv_id, ipulse,
 844                      overlap->mpi_comm, &stat);
 845 #endif
 846
 847         for (x = 0; x < recv_nindex; x++)
 848         {
 849             for (y = 0; y < local_fft_ndata[YY]; y++)
 850             {
 851                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
 852                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 853                 for (z = 0; z < local_fft_ndata[ZZ]; z++)
 854                 {
 855                     fftgrid[indg + z] += overlap->recvbuf[indb + z];
 856                 }
 857             }
 858         }
 859     }
 860 }
 861
 862 void spread_on_grid(const gmx_pme_t *pme,
 863                     PmeAtomComm *atc, const pmegrids_t *grids,
 864                     gmx_bool bCalcSplines, gmx_bool bSpread,
 865                     real *fftgrid, gmx_bool bDoSplines, int grid_index)
 866 {
 867     int nthread, thread;
 868 #ifdef PME_TIME_THREADS
 869     gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
 870     static double cs1     = 0, cs2 = 0, cs3 = 0;
 871     static double cs1a[6] = {0, 0, 0, 0, 0, 0};
 872     static int cnt        = 0;
 873 #endif
 874
 875     nthread = pme->nthread;
 876     assert(nthread > 0);
 877
 878 #ifdef PME_TIME_THREADS
 879     c1 = omp_cyc_start();
 880 #endif
 881     if (bCalcSplines)
 882     {
 883 #pragma omp parallel for num_threads(nthread) schedule(static)
 884         for (thread = 0; thread < nthread; thread++)
 885         {
 886             try
 887             {
 888                 int start, end;
 889
 890                 start = atc->numAtoms()* thread   /nthread;
 891                 end   = atc->numAtoms()*(thread+1)/nthread;
 892
 893                 /* Compute fftgrid index for all atoms,
 894                  * with help of some extra variables.
 895                  */
 896                 calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
 897             }
 898             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 899         }
 900     }
 901 #ifdef PME_TIME_THREADS
 902     c1   = omp_cyc_end(c1);
 903     cs1 += (double)c1;
 904 #endif
 905
 906 #ifdef PME_TIME_THREADS
 907     c2 = omp_cyc_start();
 908 #endif
 909 #pragma omp parallel for num_threads(nthread) schedule(static)
 910     for (thread = 0; thread < nthread; thread++)
 911     {
 912         try
 913         {
 914             splinedata_t *spline;
 915
 916             /* make local bsplines  */
 917             if (grids == nullptr || !pme->bUseThreads)
 918             {
 919                 spline = &atc->spline[0];
 920
 921                 spline->n = atc->numAtoms();
 922             }
 923             else
 924             {
 925                 spline = &atc->spline[thread];
 926
 927                 if (grids->nthread == 1)
 928                 {
 929                     /* One thread, we operate on all coefficients */
 930                     spline->n = atc->numAtoms();
 931                 }
 932                 else
 933                 {
 934                     /* Get the indices our thread should operate on */
 935                     make_thread_local_ind(atc, thread, spline);
 936                 }
 937             }
 938
 939             if (bCalcSplines)
 940             {
 941                 make_bsplines(spline->theta.coefficients,
 942                               spline->dtheta.coefficients,
 943                               pme->pme_order,
 944                               as_rvec_array(atc->fractx.data()), spline->n, spline->ind.data(),
 945                               atc->coefficient.data(), bDoSplines);
 946             }
 947
 948             if (bSpread)
 949             {
 950                 /* put local atoms on grid. */
 951                 const pmegrid_t *grid = pme->bUseThreads ? &grids->grid_th[thread] : &grids->grid;
 952
 953 #ifdef PME_TIME_SPREAD
 954                 ct1a = omp_cyc_start();
 955 #endif
 956                 spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
 957
 958                 if (pme->bUseThreads)
 959                 {
 960                     copy_local_grid(pme, grids, grid_index, thread, fftgrid);
 961                 }
 962 #ifdef PME_TIME_SPREAD
 963                 ct1a          = omp_cyc_end(ct1a);
 964                 cs1a[thread] += (double)ct1a;
 965 #endif
 966             }
 967         }
 968         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 969     }
 970 #ifdef PME_TIME_THREADS
 971     c2   = omp_cyc_end(c2);
 972     cs2 += (double)c2;
 973 #endif
 974
 975     if (bSpread && pme->bUseThreads)
 976     {
 977 #ifdef PME_TIME_THREADS
 978         c3 = omp_cyc_start();
 979 #endif
 980 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
 981         for (thread = 0; thread < grids->nthread; thread++)
 982         {
 983             try
 984             {
 985                 reduce_threadgrid_overlap(pme, grids, thread,
 986                                           fftgrid,
 987                                           const_cast<real *>(pme->overlap[0].sendbuf.data()),
 988                                           const_cast<real *>(pme->overlap[1].sendbuf.data()),
 989                                           grid_index);
 990             }
 991             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 992         }
 993 #ifdef PME_TIME_THREADS
 994         c3   = omp_cyc_end(c3);
 995         cs3 += (double)c3;
 996 #endif
 997
 998         if (pme->nnodes > 1)
 999         {
1000             /* Communicate the overlapping part of the fftgrid.
1001              * For this communication call we need to check pme->bUseThreads
1002              * to have all ranks communicate here, regardless of pme->nthread.
1003              */
1004             sum_fftgrid_dd(pme, fftgrid, grid_index);
1005         }
1006     }
1007
1008 #ifdef PME_TIME_THREADS
1009     cnt++;
1010     if (cnt % 20 == 0)
1011     {
1012         printf("idx %.2f spread %.2f red %.2f",
1013                cs1*1e-9, cs2*1e-9, cs3*1e-9);
1014 #ifdef PME_TIME_SPREAD
1015         for (thread = 0; thread < nthread; thread++)
1016         {
1017             printf(" %.2f", cs1a[thread]*1e-9);
1018         }
1019 #endif
1020         printf("\n");
1021     }
1022 #endif
1023 }