5bb4be177ba0a9269c40cd0b98cac1ffd9f7e598
[alexxy/gromacs.git] / src / gromacs / ewald / pme-spread.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "pme-spread.h"
40
41 #include "config.h"
42
43 #include <assert.h>
44
45 #include <algorithm>
46
47 #include "gromacs/ewald/pme.h"
48 #include "gromacs/fft/parallel_3dfft.h"
49 #include "gromacs/simd/simd.h"
50 #include "gromacs/utility/basedefinitions.h"
51 #include "gromacs/utility/exceptions.h"
52 #include "gromacs/utility/fatalerror.h"
53 #include "gromacs/utility/smalloc.h"
54
55 #include "pme-grid.h"
56 #include "pme-internal.h"
57 #include "pme-simd.h"
58 #include "pme-spline-work.h"
59
60 /* TODO consider split of pme-spline from this file */
61
62 static void calc_interpolation_idx(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
63                                    int start, int grid_index, int end, int thread)
64 {
65     int             i;
66     int            *idxptr, tix, tiy, tiz;
67     real           *xptr, *fptr, tx, ty, tz;
68     real            rxx, ryx, ryy, rzx, rzy, rzz;
69     int             nx, ny, nz;
70     int            *g2tx, *g2ty, *g2tz;
71     gmx_bool        bThreads;
72     int            *thread_idx = NULL;
73     thread_plist_t *tpl        = NULL;
74     int            *tpl_n      = NULL;
75     int             thread_i;
76
77     nx  = pme->nkx;
78     ny  = pme->nky;
79     nz  = pme->nkz;
80
81     rxx = pme->recipbox[XX][XX];
82     ryx = pme->recipbox[YY][XX];
83     ryy = pme->recipbox[YY][YY];
84     rzx = pme->recipbox[ZZ][XX];
85     rzy = pme->recipbox[ZZ][YY];
86     rzz = pme->recipbox[ZZ][ZZ];
87
88     g2tx = pme->pmegrid[grid_index].g2t[XX];
89     g2ty = pme->pmegrid[grid_index].g2t[YY];
90     g2tz = pme->pmegrid[grid_index].g2t[ZZ];
91
92     bThreads = (atc->nthread > 1);
93     if (bThreads)
94     {
95         thread_idx = atc->thread_idx;
96
97         tpl   = &atc->thread_plist[thread];
98         tpl_n = tpl->n;
99         for (i = 0; i < atc->nthread; i++)
100         {
101             tpl_n[i] = 0;
102         }
103     }
104
105     const real shift = c_pmeMaxUnitcellShift;
106
107     for (i = start; i < end; i++)
108     {
109         xptr   = atc->x[i];
110         idxptr = atc->idx[i];
111         fptr   = atc->fractx[i];
112
113         /* Fractional coordinates along box vectors, add a positive shift to ensure tx/ty/tz are positive for triclinic boxes */
114         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + shift );
115         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + shift );
116         tz = nz * (                                   xptr[ZZ] * rzz + shift );
117
118         tix = static_cast<int>(tx);
119         tiy = static_cast<int>(ty);
120         tiz = static_cast<int>(tz);
121
122         /* Because decomposition only occurs in x and y,
123          * we never have a fraction correction in z.
124          */
125         fptr[XX] = tx - tix + pme->fshx[tix];
126         fptr[YY] = ty - tiy + pme->fshy[tiy];
127         fptr[ZZ] = tz - tiz;
128
129         idxptr[XX] = pme->nnx[tix];
130         idxptr[YY] = pme->nny[tiy];
131         idxptr[ZZ] = pme->nnz[tiz];
132
133 #ifdef DEBUG
134         range_check(idxptr[XX], 0, pme->pmegrid_nx);
135         range_check(idxptr[YY], 0, pme->pmegrid_ny);
136         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
137 #endif
138
139         if (bThreads)
140         {
141             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
142             thread_idx[i] = thread_i;
143             tpl_n[thread_i]++;
144         }
145     }
146
147     if (bThreads)
148     {
149         /* Make a list of particle indices sorted on thread */
150
151         /* Get the cumulative count */
152         for (i = 1; i < atc->nthread; i++)
153         {
154             tpl_n[i] += tpl_n[i-1];
155         }
156         /* The current implementation distributes particles equally
157          * over the threads, so we could actually allocate for that
158          * in pme_realloc_atomcomm_things.
159          */
160         if (tpl_n[atc->nthread-1] > tpl->nalloc)
161         {
162             tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
163             srenew(tpl->i, tpl->nalloc);
164         }
165         /* Set tpl_n to the cumulative start */
166         for (i = atc->nthread-1; i >= 1; i--)
167         {
168             tpl_n[i] = tpl_n[i-1];
169         }
170         tpl_n[0] = 0;
171
172         /* Fill our thread local array with indices sorted on thread */
173         for (i = start; i < end; i++)
174         {
175             tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
176         }
177         /* Now tpl_n contains the cummulative count again */
178     }
179 }
180
181 static void make_thread_local_ind(pme_atomcomm_t *atc,
182                                   int thread, splinedata_t *spline)
183 {
184     int             n, t, i, start, end;
185     thread_plist_t *tpl;
186
187     /* Combine the indices made by each thread into one index */
188
189     n     = 0;
190     start = 0;
191     for (t = 0; t < atc->nthread; t++)
192     {
193         tpl = &atc->thread_plist[t];
194         /* Copy our part (start - end) from the list of thread t */
195         if (thread > 0)
196         {
197             start = tpl->n[thread-1];
198         }
199         end = tpl->n[thread];
200         for (i = start; i < end; i++)
201         {
202             spline->ind[n++] = tpl->i[i];
203         }
204     }
205
206     spline->n = n;
207 }
208
209 /* Macro to force loop unrolling by fixing order.
210  * This gives a significant performance gain.
211  */
212 #define CALC_SPLINE(order)                     \
213     {                                              \
214         for (int j = 0; (j < DIM); j++)            \
215         {                                          \
216             real dr, div;                          \
217             real data[PME_ORDER_MAX];              \
218                                                    \
219             dr  = xptr[j];                         \
220                                                \
221             /* dr is relative offset from lower cell limit */ \
222             data[order-1] = 0;                     \
223             data[1]       = dr;                          \
224             data[0]       = 1 - dr;                      \
225                                                \
226             for (int k = 3; (k < order); k++)      \
227             {                                      \
228                 div       = 1.0/(k - 1.0);               \
229                 data[k-1] = div*dr*data[k-2];      \
230                 for (int l = 1; (l < (k-1)); l++)  \
231                 {                                  \
232                     data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
233                                        data[k-l-1]);                \
234                 }                                  \
235                 data[0] = div*(1-dr)*data[0];      \
236             }                                      \
237             /* differentiate */                    \
238             dtheta[j][i*order+0] = -data[0];       \
239             for (int k = 1; (k < order); k++)      \
240             {                                      \
241                 dtheta[j][i*order+k] = data[k-1] - data[k]; \
242             }                                      \
243                                                \
244             div           = 1.0/(order - 1);                 \
245             data[order-1] = div*dr*data[order-2];  \
246             for (int l = 1; (l < (order-1)); l++)  \
247             {                                      \
248                 data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
249                                        (order-l-dr)*data[order-l-1]); \
250             }                                      \
251             data[0] = div*(1 - dr)*data[0];        \
252                                                \
253             for (int k = 0; k < order; k++)        \
254             {                                      \
255                 theta[j][i*order+k]  = data[k];    \
256             }                                      \
257         }                                          \
258     }
259
260 static void make_bsplines(splinevec theta, splinevec dtheta, int order,
261                           rvec fractx[], int nr, int ind[], real coefficient[],
262                           gmx_bool bDoSplines)
263 {
264     /* construct splines for local atoms */
265     int   i, ii;
266     real *xptr;
267
268     for (i = 0; i < nr; i++)
269     {
270         /* With free energy we do not use the coefficient check.
271          * In most cases this will be more efficient than calling make_bsplines
272          * twice, since usually more than half the particles have non-zero coefficients.
273          */
274         ii = ind[i];
275         if (bDoSplines || coefficient[ii] != 0.0)
276         {
277             xptr = fractx[ii];
278             assert(order >= 4 && order <= PME_ORDER_MAX);
279             switch (order)
280             {
281                 case 4:  CALC_SPLINE(4);     break;
282                 case 5:  CALC_SPLINE(5);     break;
283                 default: CALC_SPLINE(order); break;
284             }
285         }
286     }
287 }
288
289 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
290 #define DO_BSPLINE(order)                            \
291     for (ithx = 0; (ithx < order); ithx++)                    \
292     {                                                    \
293         index_x = (i0+ithx)*pny*pnz;                     \
294         valx    = coefficient*thx[ithx];                          \
295                                                      \
296         for (ithy = 0; (ithy < order); ithy++)                \
297         {                                                \
298             valxy    = valx*thy[ithy];                   \
299             index_xy = index_x+(j0+ithy)*pnz;            \
300                                                      \
301             for (ithz = 0; (ithz < order); ithz++)            \
302             {                                            \
303                 index_xyz        = index_xy+(k0+ithz);   \
304                 grid[index_xyz] += valxy*thz[ithz];      \
305             }                                            \
306         }                                                \
307     }
308
309
310 static void spread_coefficients_bsplines_thread(pmegrid_t                         *pmegrid,
311                                                 pme_atomcomm_t                    *atc,
312                                                 splinedata_t                      *spline,
313                                                 struct pme_spline_work gmx_unused *work)
314 {
315
316     /* spread coefficients from home atoms to local grid */
317     real          *grid;
318     int            i, nn, n, ithx, ithy, ithz, i0, j0, k0;
319     int       *    idxptr;
320     int            order, norder, index_x, index_xy, index_xyz;
321     real           valx, valxy, coefficient;
322     real          *thx, *thy, *thz;
323     int            pnx, pny, pnz, ndatatot;
324     int            offx, offy, offz;
325
326 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
327     GMX_ALIGNED(real, GMX_SIMD4_WIDTH)  thz_aligned[GMX_SIMD4_WIDTH*2];
328 #endif
329
330     pnx = pmegrid->s[XX];
331     pny = pmegrid->s[YY];
332     pnz = pmegrid->s[ZZ];
333
334     offx = pmegrid->offset[XX];
335     offy = pmegrid->offset[YY];
336     offz = pmegrid->offset[ZZ];
337
338     ndatatot = pnx*pny*pnz;
339     grid     = pmegrid->grid;
340     for (i = 0; i < ndatatot; i++)
341     {
342         grid[i] = 0;
343     }
344
345     order = pmegrid->order;
346
347     for (nn = 0; nn < spline->n; nn++)
348     {
349         n           = spline->ind[nn];
350         coefficient = atc->coefficient[n];
351
352         if (coefficient != 0)
353         {
354             idxptr = atc->idx[n];
355             norder = nn*order;
356
357             i0   = idxptr[XX] - offx;
358             j0   = idxptr[YY] - offy;
359             k0   = idxptr[ZZ] - offz;
360
361             thx = spline->theta[XX] + norder;
362             thy = spline->theta[YY] + norder;
363             thz = spline->theta[ZZ] + norder;
364
365             switch (order)
366             {
367                 case 4:
368 #ifdef PME_SIMD4_SPREAD_GATHER
369 #ifdef PME_SIMD4_UNALIGNED
370 #define PME_SPREAD_SIMD4_ORDER4
371 #else
372 #define PME_SPREAD_SIMD4_ALIGNED
373 #define PME_ORDER 4
374 #endif
375 #include "pme-simd4.h"
376 #else
377                     DO_BSPLINE(4);
378 #endif
379                     break;
380                 case 5:
381 #ifdef PME_SIMD4_SPREAD_GATHER
382 #define PME_SPREAD_SIMD4_ALIGNED
383 #define PME_ORDER 5
384 #include "pme-simd4.h"
385 #else
386                     DO_BSPLINE(5);
387 #endif
388                     break;
389                 default:
390                     DO_BSPLINE(order);
391                     break;
392             }
393         }
394     }
395 }
396
397 static void copy_local_grid(struct gmx_pme_t *pme, pmegrids_t *pmegrids,
398                             int grid_index, int thread, real *fftgrid)
399 {
400     ivec local_fft_ndata, local_fft_offset, local_fft_size;
401     int  fft_my, fft_mz;
402     int  nsy, nsz;
403     ivec nf;
404     int  offx, offy, offz, x, y, z, i0, i0t;
405     int  d;
406     pmegrid_t *pmegrid;
407     real *grid_th;
408
409     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
410                                    local_fft_ndata,
411                                    local_fft_offset,
412                                    local_fft_size);
413     fft_my = local_fft_size[YY];
414     fft_mz = local_fft_size[ZZ];
415
416     pmegrid = &pmegrids->grid_th[thread];
417
418     nsy = pmegrid->s[YY];
419     nsz = pmegrid->s[ZZ];
420
421     for (d = 0; d < DIM; d++)
422     {
423         nf[d] = std::min(pmegrid->n[d] - (pmegrid->order - 1),
424                          local_fft_ndata[d] - pmegrid->offset[d]);
425     }
426
427     offx = pmegrid->offset[XX];
428     offy = pmegrid->offset[YY];
429     offz = pmegrid->offset[ZZ];
430
431     /* Directly copy the non-overlapping parts of the local grids.
432      * This also initializes the full grid.
433      */
434     grid_th = pmegrid->grid;
435     for (x = 0; x < nf[XX]; x++)
436     {
437         for (y = 0; y < nf[YY]; y++)
438         {
439             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
440             i0t = (x*nsy + y)*nsz;
441             for (z = 0; z < nf[ZZ]; z++)
442             {
443                 fftgrid[i0+z] = grid_th[i0t+z];
444             }
445         }
446     }
447 }
448
449 static void
450 reduce_threadgrid_overlap(struct gmx_pme_t *pme,
451                           const pmegrids_t *pmegrids, int thread,
452                           real *fftgrid, real *commbuf_x, real *commbuf_y,
453                           int grid_index)
454 {
455     ivec local_fft_ndata, local_fft_offset, local_fft_size;
456     int  fft_nx, fft_ny, fft_nz;
457     int  fft_my, fft_mz;
458     int  buf_my = -1;
459     int  nsy, nsz;
460     ivec localcopy_end, commcopy_end;
461     int  offx, offy, offz, x, y, z, i0, i0t;
462     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
463     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
464     gmx_bool bCommX, bCommY;
465     int  d;
466     int  thread_f;
467     const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
468     const real *grid_th;
469     real *commbuf = NULL;
470
471     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
472                                    local_fft_ndata,
473                                    local_fft_offset,
474                                    local_fft_size);
475     fft_nx = local_fft_ndata[XX];
476     fft_ny = local_fft_ndata[YY];
477     fft_nz = local_fft_ndata[ZZ];
478
479     fft_my = local_fft_size[YY];
480     fft_mz = local_fft_size[ZZ];
481
482     /* This routine is called when all thread have finished spreading.
483      * Here each thread sums grid contributions calculated by other threads
484      * to the thread local grid volume.
485      * To minimize the number of grid copying operations,
486      * this routines sums immediately from the pmegrid to the fftgrid.
487      */
488
489     /* Determine which part of the full node grid we should operate on,
490      * this is our thread local part of the full grid.
491      */
492     pmegrid = &pmegrids->grid_th[thread];
493
494     for (d = 0; d < DIM; d++)
495     {
496         /* Determine up to where our thread needs to copy from the
497          * thread-local charge spreading grid to the rank-local FFT grid.
498          * This is up to our spreading grid end minus order-1 and
499          * not beyond the local FFT grid.
500          */
501         localcopy_end[d] =
502             std::min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
503                      local_fft_ndata[d]);
504
505         /* Determine up to where our thread needs to copy from the
506          * thread-local charge spreading grid to the communication buffer.
507          * Note: only relevant with communication, ignored otherwise.
508          */
509         commcopy_end[d]  = localcopy_end[d];
510         if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
511         {
512             /* The last thread should copy up to the last pme grid line.
513              * When the rank-local FFT grid is narrower than pme-order,
514              * we need the max below to ensure copying of all data.
515              */
516             commcopy_end[d] = std::max(commcopy_end[d], pme->pme_order);
517         }
518     }
519
520     offx = pmegrid->offset[XX];
521     offy = pmegrid->offset[YY];
522     offz = pmegrid->offset[ZZ];
523
524
525     bClearBufX  = TRUE;
526     bClearBufY  = TRUE;
527     bClearBufXY = TRUE;
528
529     /* Now loop over all the thread data blocks that contribute
530      * to the grid region we (our thread) are operating on.
531      */
532     /* Note that fft_nx/y is equal to the number of grid points
533      * between the first point of our node grid and the one of the next node.
534      */
535     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
536     {
537         fx     = pmegrid->ci[XX] + sx;
538         ox     = 0;
539         bCommX = FALSE;
540         if (fx < 0)
541         {
542             fx    += pmegrids->nc[XX];
543             ox    -= fft_nx;
544             bCommX = (pme->nnodes_major > 1);
545         }
546         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
547         ox       += pmegrid_g->offset[XX];
548         /* Determine the end of our part of the source grid.
549          * Use our thread local source grid and target grid part
550          */
551         tx1 = std::min(ox + pmegrid_g->n[XX],
552                        !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
553
554         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
555         {
556             fy     = pmegrid->ci[YY] + sy;
557             oy     = 0;
558             bCommY = FALSE;
559             if (fy < 0)
560             {
561                 fy    += pmegrids->nc[YY];
562                 oy    -= fft_ny;
563                 bCommY = (pme->nnodes_minor > 1);
564             }
565             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
566             oy       += pmegrid_g->offset[YY];
567             /* Determine the end of our part of the source grid.
568              * Use our thread local source grid and target grid part
569              */
570             ty1 = std::min(oy + pmegrid_g->n[YY],
571                            !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
572
573             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
574             {
575                 fz = pmegrid->ci[ZZ] + sz;
576                 oz = 0;
577                 if (fz < 0)
578                 {
579                     fz += pmegrids->nc[ZZ];
580                     oz -= fft_nz;
581                 }
582                 pmegrid_g = &pmegrids->grid_th[fz];
583                 oz       += pmegrid_g->offset[ZZ];
584                 tz1       = std::min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
585
586                 if (sx == 0 && sy == 0 && sz == 0)
587                 {
588                     /* We have already added our local contribution
589                      * before calling this routine, so skip it here.
590                      */
591                     continue;
592                 }
593
594                 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
595
596                 pmegrid_f = &pmegrids->grid_th[thread_f];
597
598                 grid_th = pmegrid_f->grid;
599
600                 nsy = pmegrid_f->s[YY];
601                 nsz = pmegrid_f->s[ZZ];
602
603 #ifdef DEBUG_PME_REDUCE
604                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
605                        pme->nodeid, thread, thread_f,
606                        pme->pmegrid_start_ix,
607                        pme->pmegrid_start_iy,
608                        pme->pmegrid_start_iz,
609                        sx, sy, sz,
610                        offx-ox, tx1-ox, offx, tx1,
611                        offy-oy, ty1-oy, offy, ty1,
612                        offz-oz, tz1-oz, offz, tz1);
613 #endif
614
615                 if (!(bCommX || bCommY))
616                 {
617                     /* Copy from the thread local grid to the node grid */
618                     for (x = offx; x < tx1; x++)
619                     {
620                         for (y = offy; y < ty1; y++)
621                         {
622                             i0  = (x*fft_my + y)*fft_mz;
623                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
624                             for (z = offz; z < tz1; z++)
625                             {
626                                 fftgrid[i0+z] += grid_th[i0t+z];
627                             }
628                         }
629                     }
630                 }
631                 else
632                 {
633                     /* The order of this conditional decides
634                      * where the corner volume gets stored with x+y decomp.
635                      */
636                     if (bCommY)
637                     {
638                         commbuf = commbuf_y;
639                         /* The y-size of the communication buffer is set by
640                          * the overlap of the grid part of our local slab
641                          * with the part starting at the next slab.
642                          */
643                         buf_my  =
644                             pme->overlap[1].s2g1[pme->nodeid_minor] -
645                             pme->overlap[1].s2g0[pme->nodeid_minor+1];
646                         if (bCommX)
647                         {
648                             /* We index commbuf modulo the local grid size */
649                             commbuf += buf_my*fft_nx*fft_nz;
650
651                             bClearBuf   = bClearBufXY;
652                             bClearBufXY = FALSE;
653                         }
654                         else
655                         {
656                             bClearBuf  = bClearBufY;
657                             bClearBufY = FALSE;
658                         }
659                     }
660                     else
661                     {
662                         commbuf    = commbuf_x;
663                         buf_my     = fft_ny;
664                         bClearBuf  = bClearBufX;
665                         bClearBufX = FALSE;
666                     }
667
668                     /* Copy to the communication buffer */
669                     for (x = offx; x < tx1; x++)
670                     {
671                         for (y = offy; y < ty1; y++)
672                         {
673                             i0  = (x*buf_my + y)*fft_nz;
674                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
675
676                             if (bClearBuf)
677                             {
678                                 /* First access of commbuf, initialize it */
679                                 for (z = offz; z < tz1; z++)
680                                 {
681                                     commbuf[i0+z]  = grid_th[i0t+z];
682                                 }
683                             }
684                             else
685                             {
686                                 for (z = offz; z < tz1; z++)
687                                 {
688                                     commbuf[i0+z] += grid_th[i0t+z];
689                                 }
690                             }
691                         }
692                     }
693                 }
694             }
695         }
696     }
697 }
698
699
700 static void sum_fftgrid_dd(struct gmx_pme_t *pme, real *fftgrid, int grid_index)
701 {
702     ivec local_fft_ndata, local_fft_offset, local_fft_size;
703     pme_overlap_t *overlap;
704     int  send_index0, send_nindex;
705     int  recv_nindex;
706 #if GMX_MPI
707     MPI_Status stat;
708 #endif
709     int  recv_size_y;
710     int  ipulse, size_yx;
711     real *sendptr, *recvptr;
712     int  x, y, z, indg, indb;
713
714     /* Note that this routine is only used for forward communication.
715      * Since the force gathering, unlike the coefficient spreading,
716      * can be trivially parallelized over the particles,
717      * the backwards process is much simpler and can use the "old"
718      * communication setup.
719      */
720
721     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
722                                    local_fft_ndata,
723                                    local_fft_offset,
724                                    local_fft_size);
725
726     if (pme->nnodes_minor > 1)
727     {
728         /* Major dimension */
729         overlap = &pme->overlap[1];
730
731         if (pme->nnodes_major > 1)
732         {
733             size_yx = pme->overlap[0].comm_data[0].send_nindex;
734         }
735         else
736         {
737             size_yx = 0;
738         }
739 #if GMX_MPI
740         int datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
741
742         int send_size_y = overlap->send_size;
743 #endif
744
745         for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
746         {
747             send_index0   =
748                 overlap->comm_data[ipulse].send_index0 -
749                 overlap->comm_data[0].send_index0;
750             send_nindex   = overlap->comm_data[ipulse].send_nindex;
751             /* We don't use recv_index0, as we always receive starting at 0 */
752             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
753             recv_size_y   = overlap->comm_data[ipulse].recv_size;
754
755             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
756             recvptr = overlap->recvbuf;
757
758             if (debug != NULL)
759             {
760                 fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
761                         local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
762             }
763
764 #if GMX_MPI
765             int send_id = overlap->send_id[ipulse];
766             int recv_id = overlap->recv_id[ipulse];
767             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
768                          send_id, ipulse,
769                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
770                          recv_id, ipulse,
771                          overlap->mpi_comm, &stat);
772 #endif
773
774             for (x = 0; x < local_fft_ndata[XX]; x++)
775             {
776                 for (y = 0; y < recv_nindex; y++)
777                 {
778                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
779                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
780                     for (z = 0; z < local_fft_ndata[ZZ]; z++)
781                     {
782                         fftgrid[indg+z] += recvptr[indb+z];
783                     }
784                 }
785             }
786
787             if (pme->nnodes_major > 1)
788             {
789                 /* Copy from the received buffer to the send buffer for dim 0 */
790                 sendptr = pme->overlap[0].sendbuf;
791                 for (x = 0; x < size_yx; x++)
792                 {
793                     for (y = 0; y < recv_nindex; y++)
794                     {
795                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
796                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
797                         for (z = 0; z < local_fft_ndata[ZZ]; z++)
798                         {
799                             sendptr[indg+z] += recvptr[indb+z];
800                         }
801                     }
802                 }
803             }
804         }
805     }
806
807     /* We only support a single pulse here.
808      * This is not a severe limitation, as this code is only used
809      * with OpenMP and with OpenMP the (PME) domains can be larger.
810      */
811     if (pme->nnodes_major > 1)
812     {
813         /* Major dimension */
814         overlap = &pme->overlap[0];
815
816         ipulse = 0;
817
818         send_nindex   = overlap->comm_data[ipulse].send_nindex;
819         /* We don't use recv_index0, as we always receive starting at 0 */
820         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
821
822         recvptr = overlap->recvbuf;
823
824         if (debug != NULL)
825         {
826             fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
827                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
828         }
829
830 #if GMX_MPI
831         int datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
832         int send_id  = overlap->send_id[ipulse];
833         int recv_id  = overlap->recv_id[ipulse];
834         sendptr      = overlap->sendbuf;
835         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
836                      send_id, ipulse,
837                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
838                      recv_id, ipulse,
839                      overlap->mpi_comm, &stat);
840 #endif
841
842         for (x = 0; x < recv_nindex; x++)
843         {
844             for (y = 0; y < local_fft_ndata[YY]; y++)
845             {
846                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
847                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
848                 for (z = 0; z < local_fft_ndata[ZZ]; z++)
849                 {
850                     fftgrid[indg+z] += recvptr[indb+z];
851                 }
852             }
853         }
854     }
855 }
856
857 void spread_on_grid(struct gmx_pme_t *pme,
858                     pme_atomcomm_t *atc, pmegrids_t *grids,
859                     gmx_bool bCalcSplines, gmx_bool bSpread,
860                     real *fftgrid, gmx_bool bDoSplines, int grid_index)
861 {
862     int nthread, thread;
863 #ifdef PME_TIME_THREADS
864     gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
865     static double cs1     = 0, cs2 = 0, cs3 = 0;
866     static double cs1a[6] = {0, 0, 0, 0, 0, 0};
867     static int cnt        = 0;
868 #endif
869
870     nthread = pme->nthread;
871     assert(nthread > 0);
872
873 #ifdef PME_TIME_THREADS
874     c1 = omp_cyc_start();
875 #endif
876     if (bCalcSplines)
877     {
878 #pragma omp parallel for num_threads(nthread) schedule(static)
879         for (thread = 0; thread < nthread; thread++)
880         {
881             try
882             {
883                 int start, end;
884
885                 start = atc->n* thread   /nthread;
886                 end   = atc->n*(thread+1)/nthread;
887
888                 /* Compute fftgrid index for all atoms,
889                  * with help of some extra variables.
890                  */
891                 calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
892             }
893             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
894         }
895     }
896 #ifdef PME_TIME_THREADS
897     c1   = omp_cyc_end(c1);
898     cs1 += (double)c1;
899 #endif
900
901 #ifdef PME_TIME_THREADS
902     c2 = omp_cyc_start();
903 #endif
904 #pragma omp parallel for num_threads(nthread) schedule(static)
905     for (thread = 0; thread < nthread; thread++)
906     {
907         try
908         {
909             splinedata_t *spline;
910             pmegrid_t *grid = NULL;
911
912             /* make local bsplines  */
913             if (grids == NULL || !pme->bUseThreads)
914             {
915                 spline = &atc->spline[0];
916
917                 spline->n = atc->n;
918
919                 if (bSpread)
920                 {
921                     grid = &grids->grid;
922                 }
923             }
924             else
925             {
926                 spline = &atc->spline[thread];
927
928                 if (grids->nthread == 1)
929                 {
930                     /* One thread, we operate on all coefficients */
931                     spline->n = atc->n;
932                 }
933                 else
934                 {
935                     /* Get the indices our thread should operate on */
936                     make_thread_local_ind(atc, thread, spline);
937                 }
938
939                 grid = &grids->grid_th[thread];
940             }
941
942             if (bCalcSplines)
943             {
944                 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
945                               atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
946             }
947
948             if (bSpread)
949             {
950                 /* put local atoms on grid. */
951 #ifdef PME_TIME_SPREAD
952                 ct1a = omp_cyc_start();
953 #endif
954                 spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
955
956                 if (pme->bUseThreads)
957                 {
958                     copy_local_grid(pme, grids, grid_index, thread, fftgrid);
959                 }
960 #ifdef PME_TIME_SPREAD
961                 ct1a          = omp_cyc_end(ct1a);
962                 cs1a[thread] += (double)ct1a;
963 #endif
964             }
965         }
966         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
967     }
968 #ifdef PME_TIME_THREADS
969     c2   = omp_cyc_end(c2);
970     cs2 += (double)c2;
971 #endif
972
973     if (bSpread && pme->bUseThreads)
974     {
975 #ifdef PME_TIME_THREADS
976         c3 = omp_cyc_start();
977 #endif
978 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
979         for (thread = 0; thread < grids->nthread; thread++)
980         {
981             try
982             {
983                 reduce_threadgrid_overlap(pme, grids, thread,
984                                           fftgrid,
985                                           pme->overlap[0].sendbuf,
986                                           pme->overlap[1].sendbuf,
987                                           grid_index);
988             }
989             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
990         }
991 #ifdef PME_TIME_THREADS
992         c3   = omp_cyc_end(c3);
993         cs3 += (double)c3;
994 #endif
995
996         if (pme->nnodes > 1)
997         {
998             /* Communicate the overlapping part of the fftgrid.
999              * For this communication call we need to check pme->bUseThreads
1000              * to have all ranks communicate here, regardless of pme->nthread.
1001              */
1002             sum_fftgrid_dd(pme, fftgrid, grid_index);
1003         }
1004     }
1005
1006 #ifdef PME_TIME_THREADS
1007     cnt++;
1008     if (cnt % 20 == 0)
1009     {
1010         printf("idx %.2f spread %.2f red %.2f",
1011                cs1*1e-9, cs2*1e-9, cs3*1e-9);
1012 #ifdef PME_TIME_SPREAD
1013         for (thread = 0; thread < nthread; thread++)
1014         {
1015             printf(" %.2f", cs1a[thread]*1e-9);
1016         }
1017 #endif
1018         printf("\n");
1019     }
1020 #endif
1021 }