Manage compiler flags and some include options per file or target, not globally
[alexxy/gromacs.git] / src / gromacs / ewald / pme_spread.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "pme_spread.h"
40
41 #include "config.h"
42
43 #include <cassert>
44
45 #include <algorithm>
46
47 #include "gromacs/ewald/pme.h"
48 #include "gromacs/fft/parallel_3dfft.h"
49 #include "gromacs/simd/simd.h"
50 #include "gromacs/utility/basedefinitions.h"
51 #include "gromacs/utility/exceptions.h"
52 #include "gromacs/utility/fatalerror.h"
53 #include "gromacs/utility/smalloc.h"
54
55 #include "pme_grid.h"
56 #include "pme_internal.h"
57 #include "pme_simd.h"
58 #include "pme_spline_work.h"
59
60 /* TODO consider split of pme-spline from this file */
61
62 static void calc_interpolation_idx(const gmx_pme_t *pme, PmeAtomComm *atc,
63                                    int start, int grid_index, int end, int thread)
64 {
65     int             i;
66     int            *idxptr, tix, tiy, tiz;
67     const real     *xptr;
68     real           *fptr, tx, ty, tz;
69     real            rxx, ryx, ryy, rzx, rzy, rzz;
70     int             nx, ny, nz;
71     int            *g2tx, *g2ty, *g2tz;
72     gmx_bool        bThreads;
73     int            *thread_idx = nullptr;
74     int            *tpl_n      = nullptr;
75     int             thread_i;
76
77     nx  = pme->nkx;
78     ny  = pme->nky;
79     nz  = pme->nkz;
80
81     rxx = pme->recipbox[XX][XX];
82     ryx = pme->recipbox[YY][XX];
83     ryy = pme->recipbox[YY][YY];
84     rzx = pme->recipbox[ZZ][XX];
85     rzy = pme->recipbox[ZZ][YY];
86     rzz = pme->recipbox[ZZ][ZZ];
87
88     g2tx = pme->pmegrid[grid_index].g2t[XX];
89     g2ty = pme->pmegrid[grid_index].g2t[YY];
90     g2tz = pme->pmegrid[grid_index].g2t[ZZ];
91
92     bThreads = (atc->nthread > 1);
93     if (bThreads)
94     {
95         thread_idx = atc->thread_idx.data();
96
97         tpl_n = atc->threadMap[thread].n;
98         for (i = 0; i < atc->nthread; i++)
99         {
100             tpl_n[i] = 0;
101         }
102     }
103
104     const real shift = c_pmeMaxUnitcellShift;
105
106     for (i = start; i < end; i++)
107     {
108         xptr   = atc->x[i];
109         idxptr = atc->idx[i];
110         fptr   = atc->fractx[i];
111
112         /* Fractional coordinates along box vectors, add a positive shift to ensure tx/ty/tz are positive for triclinic boxes */
113         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + shift );
114         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + shift );
115         tz = nz * (                                   xptr[ZZ] * rzz + shift );
116
117         tix = static_cast<int>(tx);
118         tiy = static_cast<int>(ty);
119         tiz = static_cast<int>(tz);
120
121 #ifdef DEBUG
122         range_check(tix, 0, c_pmeNeighborUnitcellCount * nx);
123         range_check(tiy, 0, c_pmeNeighborUnitcellCount * ny);
124         range_check(tiz, 0, c_pmeNeighborUnitcellCount * nz);
125 #endif
126         /* Because decomposition only occurs in x and y,
127          * we never have a fraction correction in z.
128          */
129         fptr[XX] = tx - tix + pme->fshx[tix];
130         fptr[YY] = ty - tiy + pme->fshy[tiy];
131         fptr[ZZ] = tz - tiz;
132
133         idxptr[XX] = pme->nnx[tix];
134         idxptr[YY] = pme->nny[tiy];
135         idxptr[ZZ] = pme->nnz[tiz];
136
137 #ifdef DEBUG
138         range_check(idxptr[XX], 0, pme->pmegrid_nx);
139         range_check(idxptr[YY], 0, pme->pmegrid_ny);
140         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
141 #endif
142
143         if (bThreads)
144         {
145             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
146             thread_idx[i] = thread_i;
147             tpl_n[thread_i]++;
148         }
149     }
150
151     if (bThreads)
152     {
153         /* Make a list of particle indices sorted on thread */
154
155         /* Get the cumulative count */
156         for (i = 1; i < atc->nthread; i++)
157         {
158             tpl_n[i] += tpl_n[i-1];
159         }
160         /* The current implementation distributes particles equally
161          * over the threads, so we could actually allocate for that
162          * in pme_realloc_atomcomm_things.
163          */
164         AtomToThreadMap &threadMap = atc->threadMap[thread];
165         threadMap.i.resize(tpl_n[atc->nthread - 1]);
166         /* Set tpl_n to the cumulative start */
167         for (i = atc->nthread-1; i >= 1; i--)
168         {
169             tpl_n[i] = tpl_n[i-1];
170         }
171         tpl_n[0] = 0;
172
173         /* Fill our thread local array with indices sorted on thread */
174         for (i = start; i < end; i++)
175         {
176             threadMap.i[tpl_n[atc->thread_idx[i]]++] = i;
177         }
178         /* Now tpl_n contains the cummulative count again */
179     }
180 }
181
182 static void make_thread_local_ind(const PmeAtomComm *atc,
183                                   int thread, splinedata_t *spline)
184 {
185     int             n, t, i, start, end;
186
187     /* Combine the indices made by each thread into one index */
188
189     n     = 0;
190     start = 0;
191     for (t = 0; t < atc->nthread; t++)
192     {
193         const AtomToThreadMap &threadMap = atc->threadMap[t];
194         /* Copy our part (start - end) from the list of thread t */
195         if (thread > 0)
196         {
197             start = threadMap.n[thread-1];
198         }
199         end = threadMap.n[thread];
200         for (i = start; i < end; i++)
201         {
202             spline->ind[n++] = threadMap.i[i];
203         }
204     }
205
206     spline->n = n;
207 }
208
209 // At run time, the values of order used and asserted upon mean that
210 // indexing out of bounds does not occur. However compilers don't
211 // always understand that, so we suppress this warning for this code
212 // region.
213 #pragma GCC diagnostic push
214 #pragma GCC diagnostic ignored "-Warray-bounds"
215
216 /* Macro to force loop unrolling by fixing order.
217  * This gives a significant performance gain.
218  */
219 #define CALC_SPLINE(order)                     \
220     {                                              \
221         for (int j = 0; (j < DIM); j++)            \
222         {                                          \
223             real dr, div;                          \
224             real data[PME_ORDER_MAX];              \
225                                                    \
226             dr  = xptr[j];                         \
227                                                \
228             /* dr is relative offset from lower cell limit */ \
229             data[(order)-1] = 0;                     \
230             data[1]         = dr;                          \
231             data[0]         = 1 - dr;                      \
232                                                \
233             for (int k = 3; (k < (order)); k++)      \
234             {                                      \
235                 div       = 1.0/(k - 1.0);               \
236                 data[k-1] = div*dr*data[k-2];      \
237                 for (int l = 1; (l < (k-1)); l++)  \
238                 {                                  \
239                     data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
240                                        data[k-l-1]);                \
241                 }                                  \
242                 data[0] = div*(1-dr)*data[0];      \
243             }                                      \
244             /* differentiate */                    \
245             dtheta[j][i*(order)+0] = -data[0];       \
246             for (int k = 1; (k < (order)); k++)      \
247             {                                      \
248                 dtheta[j][i*(order)+k] = data[k-1] - data[k]; \
249             }                                      \
250                                                \
251             div             = 1.0/((order) - 1);                 \
252             data[(order)-1] = div*dr*data[(order)-2];  \
253             for (int l = 1; (l < ((order)-1)); l++)  \
254             {                                      \
255                 data[(order)-l-1] = div*((dr+l)*data[(order)-l-2]+    \
256                                          ((order)-l-dr)*data[(order)-l-1]); \
257             }                                      \
258             data[0] = div*(1 - dr)*data[0];        \
259                                                \
260             for (int k = 0; k < (order); k++)        \
261             {                                      \
262                 theta[j][i*(order)+k]  = data[k];    \
263             }                                      \
264         }                                          \
265     }
266
267 static void make_bsplines(splinevec theta, splinevec dtheta, int order,
268                           rvec fractx[], int nr, const int ind[], const real coefficient[],
269                           gmx_bool bDoSplines)
270 {
271     /* construct splines for local atoms */
272     int   i, ii;
273     real *xptr;
274
275     for (i = 0; i < nr; i++)
276     {
277         /* With free energy we do not use the coefficient check.
278          * In most cases this will be more efficient than calling make_bsplines
279          * twice, since usually more than half the particles have non-zero coefficients.
280          */
281         ii = ind[i];
282         if (bDoSplines || coefficient[ii] != 0.0)
283         {
284             xptr = fractx[ii];
285             assert(order >= 3 && order <= PME_ORDER_MAX);
286             switch (order)
287             {
288                 case 4:  CALC_SPLINE(4);     break;
289                 case 5:  CALC_SPLINE(5);     break;
290                 default: CALC_SPLINE(order); break;
291             }
292         }
293     }
294 }
295
296 #pragma GCC diagnostic pop
297
298 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
299 #define DO_BSPLINE(order)                            \
300     for (ithx = 0; (ithx < (order)); ithx++)                    \
301     {                                                    \
302         index_x = (i0+ithx)*pny*pnz;                     \
303         valx    = coefficient*thx[ithx];                          \
304                                                      \
305         for (ithy = 0; (ithy < (order)); ithy++)                \
306         {                                                \
307             valxy    = valx*thy[ithy];                   \
308             index_xy = index_x+(j0+ithy)*pnz;            \
309                                                      \
310             for (ithz = 0; (ithz < (order)); ithz++)            \
311             {                                            \
312                 index_xyz        = index_xy+(k0+ithz);   \
313                 grid[index_xyz] += valxy*thz[ithz];      \
314             }                                            \
315         }                                                \
316     }
317
318
319 static void spread_coefficients_bsplines_thread(const pmegrid_t                   *pmegrid,
320                                                 const PmeAtomComm                 *atc,
321                                                 splinedata_t                      *spline,
322                                                 struct pme_spline_work gmx_unused *work)
323 {
324
325     /* spread coefficients from home atoms to local grid */
326     real          *grid;
327     int            i, nn, n, ithx, ithy, ithz, i0, j0, k0;
328     const int     *idxptr;
329     int            order, norder, index_x, index_xy, index_xyz;
330     real           valx, valxy, coefficient;
331     real          *thx, *thy, *thz;
332     int            pnx, pny, pnz, ndatatot;
333     int            offx, offy, offz;
334
335 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
336     alignas(GMX_SIMD_ALIGNMENT) real  thz_aligned[GMX_SIMD4_WIDTH*2];
337 #endif
338
339     pnx = pmegrid->s[XX];
340     pny = pmegrid->s[YY];
341     pnz = pmegrid->s[ZZ];
342
343     offx = pmegrid->offset[XX];
344     offy = pmegrid->offset[YY];
345     offz = pmegrid->offset[ZZ];
346
347     ndatatot = pnx*pny*pnz;
348     grid     = pmegrid->grid;
349     for (i = 0; i < ndatatot; i++)
350     {
351         grid[i] = 0;
352     }
353
354     order = pmegrid->order;
355
356     for (nn = 0; nn < spline->n; nn++)
357     {
358         n           = spline->ind[nn];
359         coefficient = atc->coefficient[n];
360
361         if (coefficient != 0)
362         {
363             idxptr = atc->idx[n];
364             norder = nn*order;
365
366             i0   = idxptr[XX] - offx;
367             j0   = idxptr[YY] - offy;
368             k0   = idxptr[ZZ] - offz;
369
370             thx = spline->theta.coefficients[XX] + norder;
371             thy = spline->theta.coefficients[YY] + norder;
372             thz = spline->theta.coefficients[ZZ] + norder;
373
374             switch (order)
375             {
376                 case 4:
377 #ifdef PME_SIMD4_SPREAD_GATHER
378 #ifdef PME_SIMD4_UNALIGNED
379 #define PME_SPREAD_SIMD4_ORDER4
380 #else
381 #define PME_SPREAD_SIMD4_ALIGNED
382 #define PME_ORDER 4
383 #endif
384 #include "pme_simd4.h"
385 #else
386                     DO_BSPLINE(4);
387 #endif
388                     break;
389                 case 5:
390 #ifdef PME_SIMD4_SPREAD_GATHER
391 #define PME_SPREAD_SIMD4_ALIGNED
392 #define PME_ORDER 5
393 #include "pme_simd4.h"
394 #else
395                     DO_BSPLINE(5);
396 #endif
397                     break;
398                 default:
399                     DO_BSPLINE(order);
400                     break;
401             }
402         }
403     }
404 }
405
406 static void copy_local_grid(const gmx_pme_t *pme, const pmegrids_t *pmegrids,
407                             int grid_index, int thread, real *fftgrid)
408 {
409     ivec local_fft_ndata, local_fft_offset, local_fft_size;
410     int  fft_my, fft_mz;
411     int  nsy, nsz;
412     ivec nf;
413     int  offx, offy, offz, x, y, z, i0, i0t;
414     int  d;
415     real *grid_th;
416
417     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
418                                    local_fft_ndata,
419                                    local_fft_offset,
420                                    local_fft_size);
421     fft_my = local_fft_size[YY];
422     fft_mz = local_fft_size[ZZ];
423
424     const pmegrid_t *pmegrid = &pmegrids->grid_th[thread];
425
426     nsy = pmegrid->s[YY];
427     nsz = pmegrid->s[ZZ];
428
429     for (d = 0; d < DIM; d++)
430     {
431         nf[d] = std::min(pmegrid->n[d] - (pmegrid->order - 1),
432                          local_fft_ndata[d] - pmegrid->offset[d]);
433     }
434
435     offx = pmegrid->offset[XX];
436     offy = pmegrid->offset[YY];
437     offz = pmegrid->offset[ZZ];
438
439     /* Directly copy the non-overlapping parts of the local grids.
440      * This also initializes the full grid.
441      */
442     grid_th = pmegrid->grid;
443     for (x = 0; x < nf[XX]; x++)
444     {
445         for (y = 0; y < nf[YY]; y++)
446         {
447             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
448             i0t = (x*nsy + y)*nsz;
449             for (z = 0; z < nf[ZZ]; z++)
450             {
451                 fftgrid[i0+z] = grid_th[i0t+z];
452             }
453         }
454     }
455 }
456
457 static void
458 reduce_threadgrid_overlap(const gmx_pme_t *pme,
459                           const pmegrids_t *pmegrids, int thread,
460                           real *fftgrid, real *commbuf_x, real *commbuf_y,
461                           int grid_index)
462 {
463     ivec local_fft_ndata, local_fft_offset, local_fft_size;
464     int  fft_nx, fft_ny, fft_nz;
465     int  fft_my, fft_mz;
466     int  buf_my = -1;
467     int  nsy, nsz;
468     ivec localcopy_end, commcopy_end;
469     int  offx, offy, offz, x, y, z, i0, i0t;
470     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
471     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
472     gmx_bool bCommX, bCommY;
473     int  d;
474     int  thread_f;
475     const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
476     const real *grid_th;
477     real *commbuf = nullptr;
478
479     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
480                                    local_fft_ndata,
481                                    local_fft_offset,
482                                    local_fft_size);
483     fft_nx = local_fft_ndata[XX];
484     fft_ny = local_fft_ndata[YY];
485     fft_nz = local_fft_ndata[ZZ];
486
487     fft_my = local_fft_size[YY];
488     fft_mz = local_fft_size[ZZ];
489
490     /* This routine is called when all thread have finished spreading.
491      * Here each thread sums grid contributions calculated by other threads
492      * to the thread local grid volume.
493      * To minimize the number of grid copying operations,
494      * this routines sums immediately from the pmegrid to the fftgrid.
495      */
496
497     /* Determine which part of the full node grid we should operate on,
498      * this is our thread local part of the full grid.
499      */
500     pmegrid = &pmegrids->grid_th[thread];
501
502     for (d = 0; d < DIM; d++)
503     {
504         /* Determine up to where our thread needs to copy from the
505          * thread-local charge spreading grid to the rank-local FFT grid.
506          * This is up to our spreading grid end minus order-1 and
507          * not beyond the local FFT grid.
508          */
509         localcopy_end[d] =
510             std::min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
511                      local_fft_ndata[d]);
512
513         /* Determine up to where our thread needs to copy from the
514          * thread-local charge spreading grid to the communication buffer.
515          * Note: only relevant with communication, ignored otherwise.
516          */
517         commcopy_end[d]  = localcopy_end[d];
518         if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
519         {
520             /* The last thread should copy up to the last pme grid line.
521              * When the rank-local FFT grid is narrower than pme-order,
522              * we need the max below to ensure copying of all data.
523              */
524             commcopy_end[d] = std::max(commcopy_end[d], pme->pme_order);
525         }
526     }
527
528     offx = pmegrid->offset[XX];
529     offy = pmegrid->offset[YY];
530     offz = pmegrid->offset[ZZ];
531
532
533     bClearBufX  = TRUE;
534     bClearBufY  = TRUE;
535     bClearBufXY = TRUE;
536
537     /* Now loop over all the thread data blocks that contribute
538      * to the grid region we (our thread) are operating on.
539      */
540     /* Note that fft_nx/y is equal to the number of grid points
541      * between the first point of our node grid and the one of the next node.
542      */
543     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
544     {
545         fx     = pmegrid->ci[XX] + sx;
546         ox     = 0;
547         bCommX = FALSE;
548         if (fx < 0)
549         {
550             fx    += pmegrids->nc[XX];
551             ox    -= fft_nx;
552             bCommX = (pme->nnodes_major > 1);
553         }
554         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
555         ox       += pmegrid_g->offset[XX];
556         /* Determine the end of our part of the source grid.
557          * Use our thread local source grid and target grid part
558          */
559         tx1 = std::min(ox + pmegrid_g->n[XX],
560                        !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
561
562         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
563         {
564             fy     = pmegrid->ci[YY] + sy;
565             oy     = 0;
566             bCommY = FALSE;
567             if (fy < 0)
568             {
569                 fy    += pmegrids->nc[YY];
570                 oy    -= fft_ny;
571                 bCommY = (pme->nnodes_minor > 1);
572             }
573             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
574             oy       += pmegrid_g->offset[YY];
575             /* Determine the end of our part of the source grid.
576              * Use our thread local source grid and target grid part
577              */
578             ty1 = std::min(oy + pmegrid_g->n[YY],
579                            !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
580
581             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
582             {
583                 fz = pmegrid->ci[ZZ] + sz;
584                 oz = 0;
585                 if (fz < 0)
586                 {
587                     fz += pmegrids->nc[ZZ];
588                     oz -= fft_nz;
589                 }
590                 pmegrid_g = &pmegrids->grid_th[fz];
591                 oz       += pmegrid_g->offset[ZZ];
592                 tz1       = std::min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
593
594                 if (sx == 0 && sy == 0 && sz == 0)
595                 {
596                     /* We have already added our local contribution
597                      * before calling this routine, so skip it here.
598                      */
599                     continue;
600                 }
601
602                 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
603
604                 pmegrid_f = &pmegrids->grid_th[thread_f];
605
606                 grid_th = pmegrid_f->grid;
607
608                 nsy = pmegrid_f->s[YY];
609                 nsz = pmegrid_f->s[ZZ];
610
611 #ifdef DEBUG_PME_REDUCE
612                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
613                        pme->nodeid, thread, thread_f,
614                        pme->pmegrid_start_ix,
615                        pme->pmegrid_start_iy,
616                        pme->pmegrid_start_iz,
617                        sx, sy, sz,
618                        offx-ox, tx1-ox, offx, tx1,
619                        offy-oy, ty1-oy, offy, ty1,
620                        offz-oz, tz1-oz, offz, tz1);
621 #endif
622
623                 if (!(bCommX || bCommY))
624                 {
625                     /* Copy from the thread local grid to the node grid */
626                     for (x = offx; x < tx1; x++)
627                     {
628                         for (y = offy; y < ty1; y++)
629                         {
630                             i0  = (x*fft_my + y)*fft_mz;
631                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
632                             for (z = offz; z < tz1; z++)
633                             {
634                                 fftgrid[i0+z] += grid_th[i0t+z];
635                             }
636                         }
637                     }
638                 }
639                 else
640                 {
641                     /* The order of this conditional decides
642                      * where the corner volume gets stored with x+y decomp.
643                      */
644                     if (bCommY)
645                     {
646                         commbuf = commbuf_y;
647                         /* The y-size of the communication buffer is set by
648                          * the overlap of the grid part of our local slab
649                          * with the part starting at the next slab.
650                          */
651                         buf_my  =
652                             pme->overlap[1].s2g1[pme->nodeid_minor] -
653                             pme->overlap[1].s2g0[pme->nodeid_minor+1];
654                         if (bCommX)
655                         {
656                             /* We index commbuf modulo the local grid size */
657                             commbuf += buf_my*fft_nx*fft_nz;
658
659                             bClearBuf   = bClearBufXY;
660                             bClearBufXY = FALSE;
661                         }
662                         else
663                         {
664                             bClearBuf  = bClearBufY;
665                             bClearBufY = FALSE;
666                         }
667                     }
668                     else
669                     {
670                         commbuf    = commbuf_x;
671                         buf_my     = fft_ny;
672                         bClearBuf  = bClearBufX;
673                         bClearBufX = FALSE;
674                     }
675
676                     /* Copy to the communication buffer */
677                     for (x = offx; x < tx1; x++)
678                     {
679                         for (y = offy; y < ty1; y++)
680                         {
681                             i0  = (x*buf_my + y)*fft_nz;
682                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
683
684                             if (bClearBuf)
685                             {
686                                 /* First access of commbuf, initialize it */
687                                 for (z = offz; z < tz1; z++)
688                                 {
689                                     commbuf[i0+z]  = grid_th[i0t+z];
690                                 }
691                             }
692                             else
693                             {
694                                 for (z = offz; z < tz1; z++)
695                                 {
696                                     commbuf[i0+z] += grid_th[i0t+z];
697                                 }
698                             }
699                         }
700                     }
701                 }
702             }
703         }
704     }
705 }
706
707
708 static void sum_fftgrid_dd(const gmx_pme_t *pme, real *fftgrid, int grid_index)
709 {
710     ivec local_fft_ndata, local_fft_offset, local_fft_size;
711     int  send_index0, send_nindex;
712     int  recv_nindex;
713 #if GMX_MPI
714     MPI_Status stat;
715 #endif
716     int  recv_size_y;
717     int  size_yx;
718     int  x, y, z, indg, indb;
719
720     /* Note that this routine is only used for forward communication.
721      * Since the force gathering, unlike the coefficient spreading,
722      * can be trivially parallelized over the particles,
723      * the backwards process is much simpler and can use the "old"
724      * communication setup.
725      */
726
727     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
728                                    local_fft_ndata,
729                                    local_fft_offset,
730                                    local_fft_size);
731
732     if (pme->nnodes_minor > 1)
733     {
734         /* Major dimension */
735         const pme_overlap_t *overlap = &pme->overlap[1];
736
737         if (pme->nnodes_major > 1)
738         {
739             size_yx = pme->overlap[0].comm_data[0].send_nindex;
740         }
741         else
742         {
743             size_yx = 0;
744         }
745 #if GMX_MPI
746         int datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
747
748         int send_size_y = overlap->send_size;
749 #endif
750
751         for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
752         {
753             send_index0   =
754                 overlap->comm_data[ipulse].send_index0 -
755                 overlap->comm_data[0].send_index0;
756             send_nindex   = overlap->comm_data[ipulse].send_nindex;
757             /* We don't use recv_index0, as we always receive starting at 0 */
758             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
759             recv_size_y   = overlap->comm_data[ipulse].recv_size;
760
761             auto *sendptr = const_cast<real *>(overlap->sendbuf.data()) + send_index0 * local_fft_ndata[ZZ];
762             auto *recvptr = const_cast<real *>(overlap->recvbuf.data());
763
764             if (debug != nullptr)
765             {
766                 fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
767                         local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
768             }
769
770 #if GMX_MPI
771             int send_id = overlap->comm_data[ipulse].send_id;
772             int recv_id = overlap->comm_data[ipulse].recv_id;
773             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
774                          send_id, ipulse,
775                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
776                          recv_id, ipulse,
777                          overlap->mpi_comm, &stat);
778 #endif
779
780             for (x = 0; x < local_fft_ndata[XX]; x++)
781             {
782                 for (y = 0; y < recv_nindex; y++)
783                 {
784                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
785                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
786                     for (z = 0; z < local_fft_ndata[ZZ]; z++)
787                     {
788                         fftgrid[indg+z] += recvptr[indb+z];
789                     }
790                 }
791             }
792
793             if (pme->nnodes_major > 1)
794             {
795                 /* Copy from the received buffer to the send buffer for dim 0 */
796                 sendptr = const_cast<real *>(pme->overlap[0].sendbuf.data());
797                 for (x = 0; x < size_yx; x++)
798                 {
799                     for (y = 0; y < recv_nindex; y++)
800                     {
801                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
802                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
803                         for (z = 0; z < local_fft_ndata[ZZ]; z++)
804                         {
805                             sendptr[indg+z] += recvptr[indb+z];
806                         }
807                     }
808                 }
809             }
810         }
811     }
812
813     /* We only support a single pulse here.
814      * This is not a severe limitation, as this code is only used
815      * with OpenMP and with OpenMP the (PME) domains can be larger.
816      */
817     if (pme->nnodes_major > 1)
818     {
819         /* Major dimension */
820         const pme_overlap_t *overlap = &pme->overlap[0];
821
822         size_t ipulse = 0;
823
824         send_nindex   = overlap->comm_data[ipulse].send_nindex;
825         /* We don't use recv_index0, as we always receive starting at 0 */
826         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
827
828         if (debug != nullptr)
829         {
830             fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
831                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
832         }
833
834 #if GMX_MPI
835         int datasize  = local_fft_ndata[YY]*local_fft_ndata[ZZ];
836         int send_id   = overlap->comm_data[ipulse].send_id;
837         int recv_id   = overlap->comm_data[ipulse].recv_id;
838         auto *sendptr = const_cast<real *>(overlap->sendbuf.data());
839         auto *recvptr = const_cast<real *>(overlap->recvbuf.data());
840         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
841                      send_id, ipulse,
842                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
843                      recv_id, ipulse,
844                      overlap->mpi_comm, &stat);
845 #endif
846
847         for (x = 0; x < recv_nindex; x++)
848         {
849             for (y = 0; y < local_fft_ndata[YY]; y++)
850             {
851                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
852                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
853                 for (z = 0; z < local_fft_ndata[ZZ]; z++)
854                 {
855                     fftgrid[indg + z] += overlap->recvbuf[indb + z];
856                 }
857             }
858         }
859     }
860 }
861
862 void spread_on_grid(const gmx_pme_t *pme,
863                     PmeAtomComm *atc, const pmegrids_t *grids,
864                     gmx_bool bCalcSplines, gmx_bool bSpread,
865                     real *fftgrid, gmx_bool bDoSplines, int grid_index)
866 {
867     int nthread, thread;
868 #ifdef PME_TIME_THREADS
869     gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
870     static double cs1     = 0, cs2 = 0, cs3 = 0;
871     static double cs1a[6] = {0, 0, 0, 0, 0, 0};
872     static int cnt        = 0;
873 #endif
874
875     nthread = pme->nthread;
876     assert(nthread > 0);
877
878 #ifdef PME_TIME_THREADS
879     c1 = omp_cyc_start();
880 #endif
881     if (bCalcSplines)
882     {
883 #pragma omp parallel for num_threads(nthread) schedule(static)
884         for (thread = 0; thread < nthread; thread++)
885         {
886             try
887             {
888                 int start, end;
889
890                 start = atc->numAtoms()* thread   /nthread;
891                 end   = atc->numAtoms()*(thread+1)/nthread;
892
893                 /* Compute fftgrid index for all atoms,
894                  * with help of some extra variables.
895                  */
896                 calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
897             }
898             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
899         }
900     }
901 #ifdef PME_TIME_THREADS
902     c1   = omp_cyc_end(c1);
903     cs1 += (double)c1;
904 #endif
905
906 #ifdef PME_TIME_THREADS
907     c2 = omp_cyc_start();
908 #endif
909 #pragma omp parallel for num_threads(nthread) schedule(static)
910     for (thread = 0; thread < nthread; thread++)
911     {
912         try
913         {
914             splinedata_t *spline;
915
916             /* make local bsplines  */
917             if (grids == nullptr || !pme->bUseThreads)
918             {
919                 spline = &atc->spline[0];
920
921                 spline->n = atc->numAtoms();
922             }
923             else
924             {
925                 spline = &atc->spline[thread];
926
927                 if (grids->nthread == 1)
928                 {
929                     /* One thread, we operate on all coefficients */
930                     spline->n = atc->numAtoms();
931                 }
932                 else
933                 {
934                     /* Get the indices our thread should operate on */
935                     make_thread_local_ind(atc, thread, spline);
936                 }
937             }
938
939             if (bCalcSplines)
940             {
941                 make_bsplines(spline->theta.coefficients,
942                               spline->dtheta.coefficients,
943                               pme->pme_order,
944                               as_rvec_array(atc->fractx.data()), spline->n, spline->ind.data(),
945                               atc->coefficient.data(), bDoSplines);
946             }
947
948             if (bSpread)
949             {
950                 /* put local atoms on grid. */
951                 const pmegrid_t *grid = pme->bUseThreads ? &grids->grid_th[thread] : &grids->grid;
952
953 #ifdef PME_TIME_SPREAD
954                 ct1a = omp_cyc_start();
955 #endif
956                 spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
957
958                 if (pme->bUseThreads)
959                 {
960                     copy_local_grid(pme, grids, grid_index, thread, fftgrid);
961                 }
962 #ifdef PME_TIME_SPREAD
963                 ct1a          = omp_cyc_end(ct1a);
964                 cs1a[thread] += (double)ct1a;
965 #endif
966             }
967         }
968         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
969     }
970 #ifdef PME_TIME_THREADS
971     c2   = omp_cyc_end(c2);
972     cs2 += (double)c2;
973 #endif
974
975     if (bSpread && pme->bUseThreads)
976     {
977 #ifdef PME_TIME_THREADS
978         c3 = omp_cyc_start();
979 #endif
980 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
981         for (thread = 0; thread < grids->nthread; thread++)
982         {
983             try
984             {
985                 reduce_threadgrid_overlap(pme, grids, thread,
986                                           fftgrid,
987                                           const_cast<real *>(pme->overlap[0].sendbuf.data()),
988                                           const_cast<real *>(pme->overlap[1].sendbuf.data()),
989                                           grid_index);
990             }
991             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
992         }
993 #ifdef PME_TIME_THREADS
994         c3   = omp_cyc_end(c3);
995         cs3 += (double)c3;
996 #endif
997
998         if (pme->nnodes > 1)
999         {
1000             /* Communicate the overlapping part of the fftgrid.
1001              * For this communication call we need to check pme->bUseThreads
1002              * to have all ranks communicate here, regardless of pme->nthread.
1003              */
1004             sum_fftgrid_dd(pme, fftgrid, grid_index);
1005         }
1006     }
1007
1008 #ifdef PME_TIME_THREADS
1009     cnt++;
1010     if (cnt % 20 == 0)
1011     {
1012         printf("idx %.2f spread %.2f red %.2f",
1013                cs1*1e-9, cs2*1e-9, cs3*1e-9);
1014 #ifdef PME_TIME_SPREAD
1015         for (thread = 0; thread < nthread; thread++)
1016         {
1017             printf(" %.2f", cs1a[thread]*1e-9);
1018         }
1019 #endif
1020         printf("\n");
1021     }
1022 #endif
1023 }