C++ conversion for PME spread/gather
[alexxy/gromacs.git] / src / gromacs / ewald / pme-spread.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "pme-spread.h"
40
41 #include "config.h"
42
43 #include <assert.h>
44
45 #include <algorithm>
46
47 #include "gromacs/ewald/pme-internal.h"
48 #include "gromacs/ewald/pme-simd.h"
49 #include "gromacs/ewald/pme-spline-work.h"
50 #include "gromacs/legacyheaders/macros.h"
51 #include "gromacs/utility/smalloc.h"
52
53 /* TODO: consider splitting the pme-spline code out of this file */
54
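/* Compute, for each atom with index in [start,end), the grid indices
 * (atc->idx) and fractional offsets (atc->fractx) along the three box
 * vectors for the grid selected by grid_index.
 * With threaded spreading (atc->nthread > 1) this also determines which
 * spreading thread each atom belongs to and counts the atoms per thread
 * in atc->thread_plist[thread].
 */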
55 static void calc_interpolation_idx(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
56                                    int start, int grid_index, int end, int thread)
57 {
58     int             i;
59     int            *idxptr, tix, tiy, tiz;
60     real           *xptr, *fptr, tx, ty, tz;
61     real            rxx, ryx, ryy, rzx, rzy, rzz;
62     int             nx, ny, nz;
63     int            *g2tx, *g2ty, *g2tz;
64     gmx_bool        bThreads;
65     int            *thread_idx = NULL;
66     thread_plist_t *tpl        = NULL;
67     int            *tpl_n      = NULL;
68     int             thread_i;
69
70     nx  = pme->nkx;
71     ny  = pme->nky;
72     nz  = pme->nkz;
73
74     rxx = pme->recipbox[XX][XX];
75     ryx = pme->recipbox[YY][XX];
76     ryy = pme->recipbox[YY][YY];
77     rzx = pme->recipbox[ZZ][XX];
78     rzy = pme->recipbox[ZZ][YY];
79     rzz = pme->recipbox[ZZ][ZZ];
80
81     g2tx = pme->pmegrid[grid_index].g2t[XX];
82     g2ty = pme->pmegrid[grid_index].g2t[YY];
83     g2tz = pme->pmegrid[grid_index].g2t[ZZ];
84
85     bThreads = (atc->nthread > 1);
86     if (bThreads)
87     {
88         thread_idx = atc->thread_idx;
89
90         tpl   = &atc->thread_plist[thread];
91         tpl_n = tpl->n;
92         for (i = 0; i < atc->nthread; i++)
93         {
94             tpl_n[i] = 0;
95         }
96     }
97
98     for (i = start; i < end; i++)
99     {
100         xptr   = atc->x[i];
101         idxptr = atc->idx[i];
102         fptr   = atc->fractx[i];
103
104         /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
105         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
106         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
107         tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
108
109         tix = (int)(tx);
110         tiy = (int)(ty);
111         tiz = (int)(tz);
112
113         /* Because decomposition only occurs in x and y,
114          * we never have a fraction correction in z.
115          */
116         fptr[XX] = tx - tix + pme->fshx[tix];
117         fptr[YY] = ty - tiy + pme->fshy[tiy];
118         fptr[ZZ] = tz - tiz;
119
120         idxptr[XX] = pme->nnx[tix];
121         idxptr[YY] = pme->nny[tiy];
122         idxptr[ZZ] = pme->nnz[tiz];
123
124 #ifdef DEBUG
125         range_check(idxptr[XX], 0, pme->pmegrid_nx);
126         range_check(idxptr[YY], 0, pme->pmegrid_ny);
127         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
128 #endif
129
130         if (bThreads)
131         {
132             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
133             thread_idx[i] = thread_i;
134             tpl_n[thread_i]++;
135         }
136     }
137
138     if (bThreads)
139     {
140         /* Make a list of particle indices sorted on thread */
141
142         /* Get the cumulative count */
143         for (i = 1; i < atc->nthread; i++)
144         {
145             tpl_n[i] += tpl_n[i-1];
146         }
147         /* The current implementation distributes particles equally
148          * over the threads, so we could actually allocate for that
149          * in pme_realloc_atomcomm_things.
150          */
151         if (tpl_n[atc->nthread-1] > tpl->nalloc)
152         {
153             tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
154             srenew(tpl->i, tpl->nalloc);
155         }
156         /* Set tpl_n to the cumulative start */
157         for (i = atc->nthread-1; i >= 1; i--)
158         {
159             tpl_n[i] = tpl_n[i-1];
160         }
161         tpl_n[0] = 0;
162
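        /* Example: with 3 threads and per-thread counts {4,2,3}, the
         * cumulative count above is {4,6,9}; after the shift tpl_n is
         * {0,4,6}, the start offset of each thread's block, and the fill
         * below increments it back to the cumulative {4,6,9}.
         */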
163         /* Fill our thread local array with indices sorted on thread */
164         for (i = start; i < end; i++)
165         {
166             tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
167         }
168         /* Now tpl_n contains the cumulative count again */
169     }
170 }
171
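/* Collect, from every source thread's thread_plist, the block of atom
 * indices assigned to this spreading thread and concatenate them into
 * spline->ind; spline->n is set to the total count.
 */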
172 static void make_thread_local_ind(pme_atomcomm_t *atc,
173                                   int thread, splinedata_t *spline)
174 {
175     int             n, t, i, start, end;
176     thread_plist_t *tpl;
177
178     /* Combine the index lists made by each thread into one list */
179
180     n     = 0;
181     start = 0;
182     for (t = 0; t < atc->nthread; t++)
183     {
184         tpl = &atc->thread_plist[t];
185         /* Copy our part (start - end) from the list of thread t */
186         if (thread > 0)
187         {
188             start = tpl->n[thread-1];
189         }
190         end = tpl->n[thread];
191         for (i = start; i < end; i++)
192         {
193             spline->ind[n++] = tpl->i[i];
194         }
195     }
196
197     spline->n = n;
198 }
199
200 /* Macro to force loop unrolling by making the interpolation order a
201  * compile-time constant. This gives a significant performance gain.
202  */
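/* CALC_SPLINE computes, along each dimension, the order B-spline
 * interpolation weights of the fractional offset dr with the standard
 * recursion
 *   M_n(u) = u/(n-1) * M_{n-1}(u) + (n-u)/(n-1) * M_{n-1}(u-1).
 * The derivatives are taken from the (order-1)-splines before the last
 * recursion step, using dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1), so on
 * exit theta holds the weights and dtheta their derivatives.
 */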
203 #define CALC_SPLINE(order)                     \
204     {                                              \
205         int  j, k, l;                                 \
206         real dr, div;                               \
207         real data[PME_ORDER_MAX];                  \
208         real ddata[PME_ORDER_MAX];                 \
209                                                \
210         for (j = 0; (j < DIM); j++)                     \
211         {                                          \
212             dr  = xptr[j];                         \
213                                                \
214             /* dr is relative offset from lower cell limit */ \
215             data[order-1] = 0;                     \
216             data[1]       = dr;                          \
217             data[0]       = 1 - dr;                      \
218                                                \
219             for (k = 3; (k < order); k++)               \
220             {                                      \
221                 div       = 1.0/(k - 1.0);               \
222                 data[k-1] = div*dr*data[k-2];      \
223                 for (l = 1; (l < (k-1)); l++)           \
224                 {                                  \
225                     data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
226                                        data[k-l-1]);                \
227                 }                                  \
228                 data[0] = div*(1-dr)*data[0];      \
229             }                                      \
230             /* differentiate */                    \
231             ddata[0] = -data[0];                   \
232             for (k = 1; (k < order); k++)               \
233             {                                      \
234                 ddata[k] = data[k-1] - data[k];    \
235             }                                      \
236                                                \
237             div           = 1.0/(order - 1);                 \
238             data[order-1] = div*dr*data[order-2];  \
239             for (l = 1; (l < (order-1)); l++)           \
240             {                                      \
241                 data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
242                                        (order-l-dr)*data[order-l-1]); \
243             }                                      \
244             data[0] = div*(1 - dr)*data[0];        \
245                                                \
246             for (k = 0; k < order; k++)                 \
247             {                                      \
248                 theta[j][i*order+k]  = data[k];    \
249                 dtheta[j][i*order+k] = ddata[k];   \
250             }                                      \
251         }                                          \
252     }
253
254 static void make_bsplines(splinevec theta, splinevec dtheta, int order,
255                           rvec fractx[], int nr, int ind[], real coefficient[],
256                           gmx_bool bDoSplines)
257 {
258     /* construct splines for local atoms */
259     int   i, ii;
260     real *xptr;
261
262     for (i = 0; i < nr; i++)
263     {
264         /* With free energy we do not use the coefficient check.
265          * In most cases this will be more efficient than calling make_bsplines
266          * twice, since usually more than half the particles have non-zero coefficients.
267          */
268         ii = ind[i];
269         if (bDoSplines || coefficient[ii] != 0.0)
270         {
271             xptr = fractx[ii];
272             switch (order)
273             {
274                 case 4:  CALC_SPLINE(4);     break;
275                 case 5:  CALC_SPLINE(5);     break;
276                 default: CALC_SPLINE(order); break;
277             }
278         }
279     }
280 }
281
282 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
283 #define DO_BSPLINE(order)                            \
284     for (ithx = 0; (ithx < order); ithx++)                    \
285     {                                                    \
286         index_x = (i0+ithx)*pny*pnz;                     \
287         valx    = coefficient*thx[ithx];                          \
288                                                      \
289         for (ithy = 0; (ithy < order); ithy++)                \
290         {                                                \
291             valxy    = valx*thy[ithy];                   \
292             index_xy = index_x+(j0+ithy)*pnz;            \
293                                                      \
294             for (ithz = 0; (ithz < order); ithz++)            \
295             {                                            \
296                 index_xyz        = index_xy+(k0+ithz);   \
297                 grid[index_xyz] += valxy*thz[ithz];      \
298             }                                            \
299         }                                                \
300     }
301
302
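/* Spread the coefficients of the atoms assigned to this thread onto the
 * thread-local grid, which is cleared first: each atom with a non-zero
 * coefficient adds coefficient*theta_x*theta_y*theta_z to the order^3
 * grid points around its interpolation index.
 */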
303 static void spread_coefficients_bsplines_thread(pmegrid_t                         *pmegrid,
304                                                 pme_atomcomm_t                    *atc,
305                                                 splinedata_t                      *spline,
306                                                 struct pme_spline_work gmx_unused *work)
307 {
308
309     /* spread coefficients from home atoms to local grid */
310     real          *grid;
311     int            i, nn, n, ithx, ithy, ithz, i0, j0, k0;
312     int       *    idxptr;
313     int            order, norder, index_x, index_xy, index_xyz;
314     real           valx, valxy, coefficient;
315     real          *thx, *thy, *thz;
316     int            pnx, pny, pnz, ndatatot;
317     int            offx, offy, offz;
318
319 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
320     real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
321
322     thz_aligned = gmx_simd4_align_r(thz_buffer);
323 #endif
324
325     pnx = pmegrid->s[XX];
326     pny = pmegrid->s[YY];
327     pnz = pmegrid->s[ZZ];
328
329     offx = pmegrid->offset[XX];
330     offy = pmegrid->offset[YY];
331     offz = pmegrid->offset[ZZ];
332
333     ndatatot = pnx*pny*pnz;
334     grid     = pmegrid->grid;
335     for (i = 0; i < ndatatot; i++)
336     {
337         grid[i] = 0;
338     }
339
340     order = pmegrid->order;
341
342     for (nn = 0; nn < spline->n; nn++)
343     {
344         n           = spline->ind[nn];
345         coefficient = atc->coefficient[n];
346
347         if (coefficient != 0)
348         {
349             idxptr = atc->idx[n];
350             norder = nn*order;
351
352             i0   = idxptr[XX] - offx;
353             j0   = idxptr[YY] - offy;
354             k0   = idxptr[ZZ] - offz;
355
356             thx = spline->theta[XX] + norder;
357             thy = spline->theta[YY] + norder;
358             thz = spline->theta[ZZ] + norder;
359
360             switch (order)
361             {
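                /* For order 4 and 5, pme-simd4.h provides SIMD4 versions
                 * of the DO_BSPLINE loop; the PME_SPREAD_SIMD4_* and
                 * PME_ORDER defines select which version gets included.
                 */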
362                 case 4:
363 #ifdef PME_SIMD4_SPREAD_GATHER
364 #ifdef PME_SIMD4_UNALIGNED
365 #define PME_SPREAD_SIMD4_ORDER4
366 #else
367 #define PME_SPREAD_SIMD4_ALIGNED
368 #define PME_ORDER 4
369 #endif
370 #include "gromacs/ewald/pme-simd4.h"
371 #else
372                     DO_BSPLINE(4);
373 #endif
374                     break;
375                 case 5:
376 #ifdef PME_SIMD4_SPREAD_GATHER
377 #define PME_SPREAD_SIMD4_ALIGNED
378 #define PME_ORDER 5
379 #include "gromacs/ewald/pme-simd4.h"
380 #else
381                     DO_BSPLINE(5);
382 #endif
383                     break;
384                 default:
385                     DO_BSPLINE(order);
386                     break;
387             }
388         }
389     }
390 }
391
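/* Copy the non-overlapping interior of this thread's spreading grid
 * directly into the rank-local FFT grid; this also initializes the
 * corresponding part of the FFT grid.
 */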
392 static void copy_local_grid(struct gmx_pme_t *pme, pmegrids_t *pmegrids,
393                             int grid_index, int thread, real *fftgrid)
394 {
395     ivec local_fft_ndata, local_fft_offset, local_fft_size;
396     int  fft_my, fft_mz;
397     int  nsy, nsz;
398     ivec nf;
399     int  offx, offy, offz, x, y, z, i0, i0t;
400     int  d;
401     pmegrid_t *pmegrid;
402     real *grid_th;
403
404     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
405                                    local_fft_ndata,
406                                    local_fft_offset,
407                                    local_fft_size);
408     fft_my = local_fft_size[YY];
409     fft_mz = local_fft_size[ZZ];
410
411     pmegrid = &pmegrids->grid_th[thread];
412
413     nsy = pmegrid->s[YY];
414     nsz = pmegrid->s[ZZ];
415
416     for (d = 0; d < DIM; d++)
417     {
418         nf[d] = std::min(pmegrid->n[d] - (pmegrid->order - 1),
419                          local_fft_ndata[d] - pmegrid->offset[d]);
420     }
421
422     offx = pmegrid->offset[XX];
423     offy = pmegrid->offset[YY];
424     offz = pmegrid->offset[ZZ];
425
426     /* Directly copy the non-overlapping parts of the local grids.
427      * This also initializes the full grid.
428      */
429     grid_th = pmegrid->grid;
430     for (x = 0; x < nf[XX]; x++)
431     {
432         for (y = 0; y < nf[YY]; y++)
433         {
434             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
435             i0t = (x*nsy + y)*nsz;
436             for (z = 0; z < nf[ZZ]; z++)
437             {
438                 fftgrid[i0+z] = grid_th[i0t+z];
439             }
440         }
441     }
442 }
443
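/* Sum the overlap regions of the other threads' spreading grids into the
 * part of the FFT grid this thread operates on. Contributions that belong
 * to neighboring PME ranks are accumulated into commbuf_x/commbuf_y
 * instead, to be communicated by sum_fftgrid_dd afterwards.
 */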
444 static void
445 reduce_threadgrid_overlap(struct gmx_pme_t *pme,
446                           const pmegrids_t *pmegrids, int thread,
447                           real *fftgrid, real *commbuf_x, real *commbuf_y,
448                           int grid_index)
449 {
450     ivec local_fft_ndata, local_fft_offset, local_fft_size;
451     int  fft_nx, fft_ny, fft_nz;
452     int  fft_my, fft_mz;
453     int  buf_my = -1;
454     int  nsy, nsz;
455     ivec localcopy_end, commcopy_end;
456     int  offx, offy, offz, x, y, z, i0, i0t;
457     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
458     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
459     gmx_bool bCommX, bCommY;
460     int  d;
461     int  thread_f;
462     const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
463     const real *grid_th;
464     real *commbuf = NULL;
465
466     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
467                                    local_fft_ndata,
468                                    local_fft_offset,
469                                    local_fft_size);
470     fft_nx = local_fft_ndata[XX];
471     fft_ny = local_fft_ndata[YY];
472     fft_nz = local_fft_ndata[ZZ];
473
474     fft_my = local_fft_size[YY];
475     fft_mz = local_fft_size[ZZ];
476
477     /* This routine is called when all threads have finished spreading.
478      * Here each thread sums grid contributions calculated by other threads
479      * into the thread-local grid volume.
480      * To minimize the number of grid copying operations,
481      * this routine sums directly from the pmegrid into the fftgrid.
482      */
483
484     /* Determine which part of the full node grid we should operate on:
485      * this is our thread-local part of the full grid.
486      */
487     pmegrid = &pmegrids->grid_th[thread];
488
489     for (d = 0; d < DIM; d++)
490     {
491         /* Determine up to where our thread needs to copy from the
492          * thread-local charge spreading grid to the rank-local FFT grid.
493          * This is up to our spreading grid end minus order-1 and
494          * not beyond the local FFT grid.
495          */
496         localcopy_end[d] =
497             std::min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
498                      local_fft_ndata[d]);
499
500         /* Determine up to where our thread needs to copy from the
501          * thread-local charge spreading grid to the communication buffer.
502          * Note: only relevant with communication, ignored otherwise.
503          */
504         commcopy_end[d]  = localcopy_end[d];
505         if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
506         {
507             /* The last thread should copy up to the last pme grid line.
508              * When the rank-local FFT grid is narrower than pme-order,
509              * we need the max below to ensure copying of all data.
510              */
511             commcopy_end[d] = std::max(commcopy_end[d], pme->pme_order);
512         }
513     }
514
515     offx = pmegrid->offset[XX];
516     offy = pmegrid->offset[YY];
517     offz = pmegrid->offset[ZZ];
518
519
520     bClearBufX  = TRUE;
521     bClearBufY  = TRUE;
522     bClearBufXY = TRUE;
523
524     /* Now loop over all the thread data blocks that contribute
525      * to the grid region we (our thread) are operating on.
526      */
527     /* Note that fft_nx/fft_ny equal the number of grid points
528      * between the first point of our node grid and that of the next node.
529      */
530     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
531     {
532         fx     = pmegrid->ci[XX] + sx;
533         ox     = 0;
534         bCommX = FALSE;
535         if (fx < 0)
536         {
537             fx    += pmegrids->nc[XX];
538             ox    -= fft_nx;
539             bCommX = (pme->nnodes_major > 1);
540         }
541         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
542         ox       += pmegrid_g->offset[XX];
543         /* Determine the end of our part of the source grid.
544          * Use our thread local source grid and target grid part
545          */
546         tx1 = std::min(ox + pmegrid_g->n[XX],
547                        !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
548
549         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
550         {
551             fy     = pmegrid->ci[YY] + sy;
552             oy     = 0;
553             bCommY = FALSE;
554             if (fy < 0)
555             {
556                 fy    += pmegrids->nc[YY];
557                 oy    -= fft_ny;
558                 bCommY = (pme->nnodes_minor > 1);
559             }
560             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
561             oy       += pmegrid_g->offset[YY];
562             /* Determine the end of our part of the source grid.
563              * Use our thread local source grid and target grid part
564              */
565             ty1 = std::min(oy + pmegrid_g->n[YY],
566                            !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
567
568             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
569             {
570                 fz = pmegrid->ci[ZZ] + sz;
571                 oz = 0;
572                 if (fz < 0)
573                 {
574                     fz += pmegrids->nc[ZZ];
575                     oz -= fft_nz;
576                 }
577                 pmegrid_g = &pmegrids->grid_th[fz];
578                 oz       += pmegrid_g->offset[ZZ];
579                 tz1       = std::min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
580
581                 if (sx == 0 && sy == 0 && sz == 0)
582                 {
583                     /* We have already added our local contribution
584                      * before calling this routine, so skip it here.
585                      */
586                     continue;
587                 }
588
589                 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
590
591                 pmegrid_f = &pmegrids->grid_th[thread_f];
592
593                 grid_th = pmegrid_f->grid;
594
595                 nsy = pmegrid_f->s[YY];
596                 nsz = pmegrid_f->s[ZZ];
597
598 #ifdef DEBUG_PME_REDUCE
599                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
600                        pme->nodeid, thread, thread_f,
601                        pme->pmegrid_start_ix,
602                        pme->pmegrid_start_iy,
603                        pme->pmegrid_start_iz,
604                        sx, sy, sz,
605                        offx-ox, tx1-ox, offx, tx1,
606                        offy-oy, ty1-oy, offy, ty1,
607                        offz-oz, tz1-oz, offz, tz1);
608 #endif
609
610                 if (!(bCommX || bCommY))
611                 {
612                     /* Copy from the thread local grid to the node grid */
613                     for (x = offx; x < tx1; x++)
614                     {
615                         for (y = offy; y < ty1; y++)
616                         {
617                             i0  = (x*fft_my + y)*fft_mz;
618                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
619                             for (z = offz; z < tz1; z++)
620                             {
621                                 fftgrid[i0+z] += grid_th[i0t+z];
622                             }
623                         }
624                     }
625                 }
626                 else
627                 {
628                     /* The order of this conditional decides
629                      * where the corner volume gets stored with x+y decomp.
630                      */
631                     if (bCommY)
632                     {
633                         commbuf = commbuf_y;
634                         /* The y-size of the communication buffer is set by
635                          * the overlap of the grid part of our local slab
636                          * with the part starting at the next slab.
637                          */
638                         buf_my  =
639                             pme->overlap[1].s2g1[pme->nodeid_minor] -
640                             pme->overlap[1].s2g0[pme->nodeid_minor+1];
641                         if (bCommX)
642                         {
643                             /* We index commbuf modulo the local grid size */
644                             commbuf += buf_my*fft_nx*fft_nz;
645
646                             bClearBuf   = bClearBufXY;
647                             bClearBufXY = FALSE;
648                         }
649                         else
650                         {
651                             bClearBuf  = bClearBufY;
652                             bClearBufY = FALSE;
653                         }
654                     }
655                     else
656                     {
657                         commbuf    = commbuf_x;
658                         buf_my     = fft_ny;
659                         bClearBuf  = bClearBufX;
660                         bClearBufX = FALSE;
661                     }
662
663                     /* Copy to the communication buffer */
664                     for (x = offx; x < tx1; x++)
665                     {
666                         for (y = offy; y < ty1; y++)
667                         {
668                             i0  = (x*buf_my + y)*fft_nz;
669                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
670
671                             if (bClearBuf)
672                             {
673                                 /* First access of commbuf, initialize it */
674                                 for (z = offz; z < tz1; z++)
675                                 {
676                                     commbuf[i0+z]  = grid_th[i0t+z];
677                                 }
678                             }
679                             else
680                             {
681                                 for (z = offz; z < tz1; z++)
682                                 {
683                                     commbuf[i0+z] += grid_th[i0t+z];
684                                 }
685                             }
686                         }
687                     }
688                 }
689             }
690         }
691     }
692 }
693
694
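/* Communicate and sum the grid overlap between PME ranks: first along the
 * minor (y) dimension, then along the major (x) dimension, adding the
 * received data into the rank-local fftgrid.
 */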
695 static void sum_fftgrid_dd(struct gmx_pme_t *pme, real *fftgrid, int grid_index)
696 {
697     ivec local_fft_ndata, local_fft_offset, local_fft_size;
698     pme_overlap_t *overlap;
699     int  send_index0, send_nindex;
700     int  recv_nindex;
701 #ifdef GMX_MPI
702     MPI_Status stat;
703 #endif
704     int  recv_size_y;
705     int  ipulse, size_yx;
706     real *sendptr, *recvptr;
707     int  x, y, z, indg, indb;
708
709     /* Note that this routine is only used for forward communication.
710      * Since the force gathering, unlike the coefficient spreading,
711      * can be trivially parallelized over the particles,
712      * the backwards process is much simpler and can use the "old"
713      * communication setup.
714      */
715
716     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
717                                    local_fft_ndata,
718                                    local_fft_offset,
719                                    local_fft_size);
720
721     if (pme->nnodes_minor > 1)
722     {
723         /* Minor dimension */
724         overlap = &pme->overlap[1];
725
726         if (pme->nnodes_major > 1)
727         {
728             size_yx = pme->overlap[0].comm_data[0].send_nindex;
729         }
730         else
731         {
732             size_yx = 0;
733         }
734 #ifdef GMX_MPI
735         int datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
736
737         int send_size_y = overlap->send_size;
738 #endif
739
740         for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
741         {
742             send_index0   =
743                 overlap->comm_data[ipulse].send_index0 -
744                 overlap->comm_data[0].send_index0;
745             send_nindex   = overlap->comm_data[ipulse].send_nindex;
746             /* We don't use recv_index0, as we always receive starting at 0 */
747             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
748             recv_size_y   = overlap->comm_data[ipulse].recv_size;
749
750             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
751             recvptr = overlap->recvbuf;
752
753             if (debug != NULL)
754             {
755                 fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
756                         local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
757             }
758
759 #ifdef GMX_MPI
760             int send_id = overlap->send_id[ipulse];
761             int recv_id = overlap->recv_id[ipulse];
762             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
763                          send_id, ipulse,
764                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
765                          recv_id, ipulse,
766                          overlap->mpi_comm, &stat);
767 #endif
768
769             for (x = 0; x < local_fft_ndata[XX]; x++)
770             {
771                 for (y = 0; y < recv_nindex; y++)
772                 {
773                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
774                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
775                     for (z = 0; z < local_fft_ndata[ZZ]; z++)
776                     {
777                         fftgrid[indg+z] += recvptr[indb+z];
778                     }
779                 }
780             }
781
782             if (pme->nnodes_major > 1)
783             {
784                 /* Add the received data into the send buffer for dim 0 */
785                 sendptr = pme->overlap[0].sendbuf;
786                 for (x = 0; x < size_yx; x++)
787                 {
788                     for (y = 0; y < recv_nindex; y++)
789                     {
790                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
791                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
792                         for (z = 0; z < local_fft_ndata[ZZ]; z++)
793                         {
794                             sendptr[indg+z] += recvptr[indb+z];
795                         }
796                     }
797                 }
798             }
799         }
800     }
801
802     /* We only support a single pulse here.
803      * This is not a severe limitation, as this code is only used
804      * with OpenMP, where the (PME) domains can be larger.
805      */
806     if (pme->nnodes_major > 1)
807     {
808         /* Major dimension */
809         overlap = &pme->overlap[0];
810
811         ipulse = 0;
812
813         send_nindex   = overlap->comm_data[ipulse].send_nindex;
814         /* We don't use recv_index0, as we always receive starting at 0 */
815         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
816
817         recvptr = overlap->recvbuf;
818
819         if (debug != NULL)
820         {
821             fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
822                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
823         }
824
825 #ifdef GMX_MPI
826         int datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
827         int send_id  = overlap->send_id[ipulse];
828         int recv_id  = overlap->recv_id[ipulse];
829         sendptr      = overlap->sendbuf;
830         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
831                      send_id, ipulse,
832                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
833                      recv_id, ipulse,
834                      overlap->mpi_comm, &stat);
835 #endif
836
837         for (x = 0; x < recv_nindex; x++)
838         {
839             for (y = 0; y < local_fft_ndata[YY]; y++)
840             {
841                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
842                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
843                 for (z = 0; z < local_fft_ndata[ZZ]; z++)
844                 {
845                     fftgrid[indg+z] += recvptr[indb+z];
846                 }
847             }
848         }
849     }
850 }
851
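/* Entry point for coefficient spreading: when bCalcSplines is set, compute
 * the interpolation indices and B-spline coefficients for all atoms; when
 * bSpread is set, spread the coefficients onto the (thread-local) grids
 * and, with threaded spreading, reduce the thread grids into fftgrid and
 * communicate the overlap between ranks.
 */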
852 void spread_on_grid(struct gmx_pme_t *pme,
853                     pme_atomcomm_t *atc, pmegrids_t *grids,
854                     gmx_bool bCalcSplines, gmx_bool bSpread,
855                     real *fftgrid, gmx_bool bDoSplines, int grid_index)
856 {
857     int nthread, thread;
858 #ifdef PME_TIME_THREADS
859     gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
860     static double cs1     = 0, cs2 = 0, cs3 = 0;
861     static double cs1a[6] = {0, 0, 0, 0, 0, 0};
862     static int cnt        = 0;
863 #endif
864
865     nthread = pme->nthread;
866     assert(nthread > 0);
867
868 #ifdef PME_TIME_THREADS
869     c1 = omp_cyc_start();
870 #endif
871     if (bCalcSplines)
872     {
873 #pragma omp parallel for num_threads(nthread) schedule(static)
874         for (thread = 0; thread < nthread; thread++)
875         {
876             int start, end;
877
878             start = atc->n* thread   /nthread;
879             end   = atc->n*(thread+1)/nthread;
880
881             /* Compute the fftgrid index for all atoms,
882              * with the help of some extra variables.
883              */
884             calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
885         }
886     }
887 #ifdef PME_TIME_THREADS
888     c1   = omp_cyc_end(c1);
889     cs1 += (double)c1;
890 #endif
891
892 #ifdef PME_TIME_THREADS
893     c2 = omp_cyc_start();
894 #endif
895 #pragma omp parallel for num_threads(nthread) schedule(static)
896     for (thread = 0; thread < nthread; thread++)
897     {
898         splinedata_t *spline;
899         pmegrid_t *grid = NULL;
900
901         /* make local bsplines  */
902         if (grids == NULL || !pme->bUseThreads)
903         {
904             spline = &atc->spline[0];
905
906             spline->n = atc->n;
907
908             if (bSpread)
909             {
910                 grid = &grids->grid;
911             }
912         }
913         else
914         {
915             spline = &atc->spline[thread];
916
917             if (grids->nthread == 1)
918             {
919                 /* One thread, we operate on all coefficients */
920                 spline->n = atc->n;
921             }
922             else
923             {
924                 /* Get the indices our thread should operate on */
925                 make_thread_local_ind(atc, thread, spline);
926             }
927
928             grid = &grids->grid_th[thread];
929         }
930
931         if (bCalcSplines)
932         {
933             make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
934                           atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
935         }
936
937         if (bSpread)
938         {
939             /* put local atoms on grid. */
940 #ifdef PME_TIME_SPREAD
941             ct1a = omp_cyc_start();
942 #endif
943             spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
944
945             if (pme->bUseThreads)
946             {
947                 copy_local_grid(pme, grids, grid_index, thread, fftgrid);
948             }
949 #ifdef PME_TIME_SPREAD
950             ct1a          = omp_cyc_end(ct1a);
951             cs1a[thread] += (double)ct1a;
952 #endif
953         }
954     }
955 #ifdef PME_TIME_THREADS
956     c2   = omp_cyc_end(c2);
957     cs2 += (double)c2;
958 #endif
959
960     if (bSpread && pme->bUseThreads)
961     {
962 #ifdef PME_TIME_THREADS
963         c3 = omp_cyc_start();
964 #endif
965 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
966         for (thread = 0; thread < grids->nthread; thread++)
967         {
968             reduce_threadgrid_overlap(pme, grids, thread,
969                                       fftgrid,
970                                       pme->overlap[0].sendbuf,
971                                       pme->overlap[1].sendbuf,
972                                       grid_index);
973         }
974 #ifdef PME_TIME_THREADS
975         c3   = omp_cyc_end(c3);
976         cs3 += (double)c3;
977 #endif
978
979         if (pme->nnodes > 1)
980         {
981             /* Communicate the overlapping part of the fftgrid.
982              * For this communication call we need to check pme->bUseThreads
983              * to have all ranks communicate here, regardless of pme->nthread.
984              */
985             sum_fftgrid_dd(pme, fftgrid, grid_index);
986         }
987     }
988
989 #ifdef PME_TIME_THREADS
990     cnt++;
991     if (cnt % 20 == 0)
992     {
993         printf("idx %.2f spread %.2f red %.2f",
994                cs1*1e-9, cs2*1e-9, cs3*1e-9);
995 #ifdef PME_TIME_SPREAD
996         for (thread = 0; thread < nthread; thread++)
997         {
998             printf(" %.2f", cs1a[thread]*1e-9);
999         }
1000 #endif
1001         printf("\n");
1002     }
1003 #endif
1004 }