/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "pme_spread.h"

#include <cassert>

#include <algorithm>

#include "gromacs/ewald/pme.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"
#include "pme_spline_work.h"
#include "spline_vectors.h"
/* TODO: consider splitting the pme-spline code out of this file */
static void calc_interpolation_idx(const gmx_pme_t* pme, PmeAtomComm* atc, int start, int grid_index, int end, int thread)
{
    int         i;
    int *       idxptr, tix, tiy, tiz;
    const real* xptr;
    real *      fptr, tx, ty, tz;
    real        rxx, ryx, ryy, rzx, rzy, rzz;
    int         nx, ny, nz;
    int *       g2tx, *g2ty, *g2tz;
    gmx_bool    bThreads;
    int*        thread_idx = nullptr;
    int*        tpl_n      = nullptr;
    int         thread_i;
    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    g2tx = pme->pmegrid[grid_index].g2t[XX];
    g2ty = pme->pmegrid[grid_index].g2t[YY];
    g2tz = pme->pmegrid[grid_index].g2t[ZZ];
    bThreads = (atc->nthread > 1);
    if (bThreads)
    {
        thread_idx = atc->thread_idx.data();

        tpl_n = atc->threadMap[thread].n;
        for (i = 0; i < atc->nthread; i++)
        {
            tpl_n[i] = 0;
        }
    }
    const real shift = c_pmeMaxUnitcellShift;

    for (i = start; i < end; i++)
    {
        xptr   = atc->x[i];
        idxptr = atc->idx[i];
        fptr   = atc->fractx[i];

        /* Fractional coordinates along box vectors, add a positive shift
         * to ensure tx/ty/tz are positive for triclinic boxes.
         */
        tx = nx * (xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + shift);
        ty = ny * (xptr[YY] * ryy + xptr[ZZ] * rzy + shift);
        tz = nz * (xptr[ZZ] * rzz + shift);

        tix = static_cast<int>(tx);
        tiy = static_cast<int>(ty);
        tiz = static_cast<int>(tz);
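
        /* Note that static_cast<int> truncates toward zero, so the positive
         * shift added above is essential: without it, atoms with a slightly
         * negative fractional coordinate would all be binned into cell 0
         * and the fractional offsets computed below would come out negative.
         */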
        range_check(tix, 0, c_pmeNeighborUnitcellCount * nx);
        range_check(tiy, 0, c_pmeNeighborUnitcellCount * ny);
        range_check(tiz, 0, c_pmeNeighborUnitcellCount * nz);
        /* Because decomposition only occurs in x and y,
         * we never have a fraction correction in z.
         */
        fptr[XX] = tx - tix + pme->fshx[tix];
        fptr[YY] = ty - tiy + pme->fshy[tiy];
        fptr[ZZ] = tz - tiz;
        idxptr[XX] = pme->nnx[tix];
        idxptr[YY] = pme->nny[tiy];
        idxptr[ZZ] = pme->nnz[tiz];

        range_check(idxptr[XX], 0, pme->pmegrid_nx);
        range_check(idxptr[YY], 0, pme->pmegrid_ny);
        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
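
        /* pme->nnx/nny/nnz are precomputed index tables that, in effect,
         * fold the shifted range [0, c_pmeNeighborUnitcellCount*n) back
         * onto the local grid, avoiding an explicit modulo per atom.
         */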
        if (bThreads)
        {
            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
            thread_idx[i] = thread_i;

            tpl_n[thread_i]++;
        }
    }
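
    /* Each g2t table maps a grid line in one dimension to the thread-grid
     * block that owns it, pre-multiplied such that the three per-dimension
     * contributions sum directly to the flat thread index. The sum above is
     * thus, in effect, a 3D-to-1D owner lookup without any divisions.
     */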
    if (bThreads)
    {
        /* Make a list of particle indices sorted on thread */

        /* Get the cumulative count */
        for (i = 1; i < atc->nthread; i++)
        {
            tpl_n[i] += tpl_n[i - 1];
        }
        /* The current implementation distributes particles equally
         * over the threads, so we could actually allocate for that
         * in pme_realloc_atomcomm_things.
         */
        AtomToThreadMap& threadMap = atc->threadMap[thread];
        threadMap.i.resize(tpl_n[atc->nthread - 1]);
        /* Set tpl_n to the cumulative start */
        for (i = atc->nthread - 1; i >= 1; i--)
        {
            tpl_n[i] = tpl_n[i - 1];
        }
        tpl_n[0] = 0;

        /* Fill our thread-local array with indices sorted on thread */
        for (i = start; i < end; i++)
        {
            threadMap.i[tpl_n[atc->thread_idx[i]]++] = i;
        }
        /* Now tpl_n contains the cumulative count again */
    }
}
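
/* The bThreads block at the end of calc_interpolation_idx() is, in effect,
 * a counting sort of the atom indices on their destination thread: count
 * atoms per thread, turn the counts into cumulative start offsets, then
 * scatter the atom indices into threadMap.i. For example, per-thread
 * counts {3, 1, 2} give start offsets {0, 3, 4}, and after the scatter
 * pass tpl_n again holds the cumulative counts {3, 4, 6}.
 */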
static void make_thread_local_ind(const PmeAtomComm* atc, int thread, splinedata_t* spline)
{
    int n, t, i, start, end;

    /* Combine the indices made by each thread into one index */

    n = 0;
    for (t = 0; t < atc->nthread; t++)
    {
        const AtomToThreadMap& threadMap = atc->threadMap[t];
        /* Copy our part (start - end) from the list of thread t */
        start = threadMap.n[thread - 1];
        end   = threadMap.n[thread];
        for (i = start; i < end; i++)
        {
            spline->ind[n++] = threadMap.i[i];
        }
    }

    spline->n = n;
}
// At run time, the values of order used and asserted upon mean that
// indexing out of bounds does not occur. However, compilers don't
// always understand that, so we suppress this warning for this code
// region.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
/* Macro to force loop unrolling by fixing the interpolation order.
 * This gives a significant performance gain.
 */
#define CALC_SPLINE(order)                                                           \
    {                                                                                \
        for (int j = 0; (j < DIM); j++)                                              \
        {                                                                            \
            real dr, div;                                                            \
            real data[PME_ORDER_MAX];                                                \
                                                                                     \
            dr = xptr[j];                                                            \
                                                                                     \
            /* dr is relative offset from lower cell limit */                        \
            data[(order)-1] = 0;                                                     \
            data[1]         = dr;                                                    \
            data[0]         = 1 - dr;                                                \
                                                                                     \
            for (int k = 3; (k < (order)); k++)                                      \
            {                                                                        \
                div         = 1.0 / (k - 1.0);                                       \
                data[k - 1] = div * dr * data[k - 2];                                \
                for (int l = 1; (l < (k - 1)); l++)                                  \
                {                                                                    \
                    data[k - l - 1] = div * ((dr + l) * data[k - l - 2]              \
                                             + (k - l - dr) * data[k - l - 1]);      \
                }                                                                    \
                data[0] = div * (1 - dr) * data[0];                                  \
            }                                                                        \
                                                                                     \
            /* differentiate */                                                      \
            dtheta[j][i * (order) + 0] = -data[0];                                   \
            for (int k = 1; (k < (order)); k++)                                      \
            {                                                                        \
                dtheta[j][i * (order) + k] = data[k - 1] - data[k];                  \
            }                                                                        \
                                                                                     \
            div             = 1.0 / ((order)-1);                                     \
            data[(order)-1] = div * dr * data[(order)-2];                            \
            for (int l = 1; (l < ((order)-1)); l++)                                  \
            {                                                                        \
                data[(order)-l - 1] = div * ((dr + l) * data[(order)-l - 2]          \
                                             + ((order)-l - dr) * data[(order)-l - 1]); \
            }                                                                        \
            data[0] = div * (1 - dr) * data[0];                                      \
                                                                                     \
            for (int k = 0; k < (order); k++)                                        \
            {                                                                        \
                theta[j][i * (order) + k] = data[k];                                 \
            }                                                                        \
        }                                                                            \
    }
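
/* The recursion in CALC_SPLINE is the cardinal B-spline construction of
 * smooth PME (Essmann et al., J. Chem. Phys. 103, 8577 (1995)). Starting
 * from the order-2 weights data[0] = 1-dr, data[1] = dr, each pass raises
 * the order by one using
 *     M_n(u) = u/(n-1) * M_{n-1}(u) + (n-u)/(n-1) * M_{n-1}(u-1),
 * and the derivatives needed for force gathering follow from
 *     dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1),
 * which is why the "differentiate" block runs at order-1 before the final
 * order-raising pass.
 */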
static void make_bsplines(splinevec  theta,
                          splinevec  dtheta,
                          int        order,
                          rvec       fractx[],
                          int        nr,
                          const int  ind[],
                          const real coefficient[],
                          gmx_bool   bDoSplines)
{
    /* construct splines for local atoms */
    int   i, ii;
    real* xptr;

    for (i = 0; i < nr; i++)
    {
        /* With free energy we do not use the coefficient check.
         * In most cases this will be more efficient than calling make_bsplines
         * twice, since usually more than half the particles have non-zero coefficients.
         */
        ii = ind[i];
        if (bDoSplines || coefficient[ii] != 0.0)
        {
            xptr = fractx[ii];
            assert(order >= 3 && order <= PME_ORDER_MAX);
            switch (order)
            {
                case 4: CALC_SPLINE(4) break;
                case 5: CALC_SPLINE(5) break;
                default: CALC_SPLINE(order) break;
            }
        }
    }
}
#pragma GCC diagnostic pop

/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order)                                \
    for (ithx = 0; (ithx < (order)); ithx++)             \
    {                                                    \
        index_x = (i0 + ithx) * pny * pnz;               \
        valx    = coefficient * thx[ithx];               \
                                                         \
        for (ithy = 0; (ithy < (order)); ithy++)         \
        {                                                \
            valxy    = valx * thy[ithy];                 \
            index_xy = index_x + (j0 + ithy) * pnz;      \
                                                         \
            for (ithz = 0; (ithz < (order)); ithz++)     \
            {                                            \
                index_xyz = index_xy + (k0 + ithz);      \
                grid[index_xyz] += valxy * thz[ithz];    \
            }                                            \
        }                                                \
    }
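
/* For each atom, DO_BSPLINE scatters coefficient * thx * thy * thz onto an
 * (order)^3 block of grid points (64 points at the default order of 4).
 * The spline weights are separable per dimension, so the partial products
 * valx and valxy are hoisted out of the inner loops.
 */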
static void spread_coefficients_bsplines_thread(const pmegrid_t*                   pmegrid,
                                                const PmeAtomComm*                 atc,
                                                splinedata_t*                      spline,
                                                struct pme_spline_work gmx_unused* work)
{
    /* spread coefficients from home atoms to local grid */
    real*      grid;
    int        i, nn, n, ithx, ithy, ithz, i0, j0, k0;
    const int* idxptr;
    int        order, norder, index_x, index_xy, index_xyz;
    real       valx, valxy, coefficient;
    real *     thx, *thy, *thz;
    int        pnx, pny, pnz, ndatatot;
    int        offx, offy, offz;

#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
    alignas(GMX_SIMD_ALIGNMENT) real thz_aligned[GMX_SIMD4_WIDTH * 2];
#endif
    pnx = pmegrid->s[XX];
    pny = pmegrid->s[YY];
    pnz = pmegrid->s[ZZ];

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    ndatatot = pnx * pny * pnz;
    grid     = pmegrid->grid;
    for (i = 0; i < ndatatot; i++)
    {
        grid[i] = 0;
    }

    order = pmegrid->order;
    for (nn = 0; nn < spline->n; nn++)
    {
        n           = spline->ind[nn];
        coefficient = atc->coefficient[n];

        if (coefficient != 0)
        {
            idxptr = atc->idx[n];
            norder = nn * order;

            i0 = idxptr[XX] - offx;
            j0 = idxptr[YY] - offy;
            k0 = idxptr[ZZ] - offz;

            thx = spline->theta.coefficients[XX] + norder;
            thy = spline->theta.coefficients[YY] + norder;
            thz = spline->theta.coefficients[ZZ] + norder;
            switch (order)
            {
                case 4:
#ifdef PME_SIMD4_SPREAD_GATHER
#    ifdef PME_SIMD4_UNALIGNED
#        define PME_SPREAD_SIMD4_ORDER4
#    else
#        define PME_SPREAD_SIMD4_ALIGNED
#        define PME_ORDER 4
#    endif
#    include "pme_simd4.h"
#else
                    DO_BSPLINE(4)
#endif
                    break;
                case 5:
#ifdef PME_SIMD4_SPREAD_GATHER
#    define PME_SPREAD_SIMD4_ALIGNED
#    define PME_ORDER 5
#    include "pme_simd4.h"
#else
                    DO_BSPLINE(5)
#endif
                    break;
                default: DO_BSPLINE(order) break;
            }
        }
    }
}
static void copy_local_grid(const gmx_pme_t* pme, const pmegrids_t* pmegrids, int grid_index, int thread, real* fftgrid)
{
    ivec  local_fft_ndata, local_fft_offset, local_fft_size;
    int   fft_my, fft_mz;
    int   nsy, nsz;
    ivec  nf;
    int   offx, offy, offz, x, y, z, i0, i0t;
    int   d;
    real* grid_th;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], local_fft_ndata, local_fft_offset,
                                   local_fft_size);
    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    const pmegrid_t* pmegrid = &pmegrids->grid_th[thread];

    nsy = pmegrid->s[YY];
    nsz = pmegrid->s[ZZ];

    for (d = 0; d < DIM; d++)
    {
        nf[d] = std::min(pmegrid->n[d] - (pmegrid->order - 1), local_fft_ndata[d] - pmegrid->offset[d]);
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    /* Directly copy the non-overlapping parts of the local grids.
     * This also initializes the full grid.
     */
    grid_th = pmegrid->grid;
    for (x = 0; x < nf[XX]; x++)
    {
        for (y = 0; y < nf[YY]; y++)
        {
            i0  = ((offx + x) * fft_my + (offy + y)) * fft_mz + offz;
            i0t = (x * nsy + y) * nsz;
            for (z = 0; z < nf[ZZ]; z++)
            {
                fftgrid[i0 + z] = grid_th[i0t + z];
            }
        }
    }
}
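
/* Note that copy_local_grid() above transfers only the interior part of
 * each thread-local grid; the (order - 1) points of spreading overlap per
 * dimension are summed into the proper grid regions afterwards by
 * reduce_threadgrid_overlap() below.
 */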
static void reduce_threadgrid_overlap(const gmx_pme_t*  pme,
                                      const pmegrids_t* pmegrids,
                                      int               thread,
                                      real*             fftgrid,
                                      real*             commbuf_x,
                                      real*             commbuf_y,
                                      int               grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    int  fft_nx, fft_ny, fft_nz;
    int  fft_my, fft_mz;
    int  buf_my = -1;
    int  nsy, nsz;
    ivec localcopy_end, commcopy_end;
    int  offx, offy, offz, x, y, z, i0, i0t;
    int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
    gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
    gmx_bool bCommX, bCommY;
    int      d;
    int      thread_f;
    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
    const real*      grid_th;
    real*            commbuf = nullptr;
    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], local_fft_ndata, local_fft_offset,
                                   local_fft_size);
    fft_nx = local_fft_ndata[XX];
    fft_ny = local_fft_ndata[YY];
    fft_nz = local_fft_ndata[ZZ];

    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];
    /* This routine is called when all threads have finished spreading.
     * Here each thread sums grid contributions calculated by other threads
     * onto the thread-local grid volume.
     * To minimize the number of grid copying operations,
     * this routine sums immediately from the pmegrid to the fftgrid.
     */

    /* Determine which part of the full node grid we should operate on;
     * this is our thread-local part of the full grid.
     */
    pmegrid = &pmegrids->grid_th[thread];
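
    /* The sx/sy/sz loops below run over non-positive shifts only: each
     * thread pulls in the spreading overlap of its lower-index neighbors
     * (with periodic wrapping). Every overlap volume is thus summed by
     * exactly one thread, so no synchronization is needed during the
     * reduction.
     */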
    for (d = 0; d < DIM; d++)
    {
        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the rank-local FFT grid.
         * This is up to our spreading grid end minus order-1 and
         * not beyond the local FFT grid.
         */
        localcopy_end[d] = std::min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
                                    local_fft_ndata[d]);

        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the communication buffer.
         * Note: only relevant with communication, ignored otherwise.
         */
        commcopy_end[d] = localcopy_end[d];
        if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
        {
            /* The last thread should copy up to the last pme grid line.
             * When the rank-local FFT grid is narrower than pme-order,
             * we need the max below to ensure copying of all data.
             */
            commcopy_end[d] = std::max(commcopy_end[d], pme->pme_order);
        }
    }
    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    bClearBufX  = TRUE;
    bClearBufY  = TRUE;
    bClearBufXY = TRUE;
    /* Now loop over all the thread data blocks that contribute
     * to the grid region we (our thread) are operating on.
     */
    /* Note that fft_nx/y is equal to the number of grid points
     * between the first point of our node grid and the one of the next node.
     */
    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
    {
        fx = pmegrid->ci[XX] + sx;
        ox = 0;
        bCommX = FALSE;
        if (fx < 0)
        {
            fx += pmegrids->nc[XX];
            ox -= fft_nx;
            bCommX = (pme->nnodes_major > 1);
        }
        pmegrid_g = &pmegrids->grid_th[fx * pmegrids->nc[YY] * pmegrids->nc[ZZ]];
        ox += pmegrid_g->offset[XX];
        /* Determine the end of our part of the source grid;
         * use our thread-local source grid and target grid part.
         */
        tx1 = std::min(ox + pmegrid_g->n[XX], !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
        {
            fy = pmegrid->ci[YY] + sy;
            oy = 0;
            bCommY = FALSE;
            if (fy < 0)
            {
                fy += pmegrids->nc[YY];
                oy -= fft_ny;
                bCommY = (pme->nnodes_minor > 1);
            }
            pmegrid_g = &pmegrids->grid_th[fy * pmegrids->nc[ZZ]];
            oy += pmegrid_g->offset[YY];
            /* Determine the end of our part of the source grid;
             * use our thread-local source grid and target grid part.
             */
            ty1 = std::min(oy + pmegrid_g->n[YY], !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
            {
                fz = pmegrid->ci[ZZ] + sz;
                oz = 0;
                if (fz < 0)
                {
                    fz += pmegrids->nc[ZZ];
                    oz -= fft_nz;
                }
                pmegrid_g = &pmegrids->grid_th[fz];
                oz += pmegrid_g->offset[ZZ];
                tz1 = std::min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);

                if (sx == 0 && sy == 0 && sz == 0)
                {
                    /* We have already added our local contribution
                     * before calling this routine, so skip it here.
                     */
                    continue;
                }

                thread_f = (fx * pmegrids->nc[YY] + fy) * pmegrids->nc[ZZ] + fz;

                pmegrid_f = &pmegrids->grid_th[thread_f];

                grid_th = pmegrid_f->grid;

                nsy = pmegrid_f->s[YY];
                nsz = pmegrid_f->s[ZZ];
#ifdef DEBUG_PME_REDUCE
                printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d "
                       "%2d-%2d, %2d-%2d %2d-%2d\n",
                       pme->nodeid, thread, thread_f, pme->pmegrid_start_ix, pme->pmegrid_start_iy,
                       pme->pmegrid_start_iz, sx, sy, sz, offx - ox, tx1 - ox, offx, tx1, offy - oy,
                       ty1 - oy, offy, ty1, offz - oz, tz1 - oz, offz, tz1);
#endif
                if (!(bCommX || bCommY))
                {
                    /* Copy from the thread local grid to the node grid */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x * fft_my + y) * fft_mz;
                            i0t = ((x - ox) * nsy + (y - oy)) * nsz - oz;
                            for (z = offz; z < tz1; z++)
                            {
                                fftgrid[i0 + z] += grid_th[i0t + z];
                            }
                        }
                    }
                }
                else
                {
                    /* The order of this conditional decides
                     * where the corner volume gets stored with x+y decomp.
                     */
                    if (bCommY)
                    {
                        commbuf = commbuf_y;
                        /* The y-size of the communication buffer is set by
                         * the overlap of the grid part of our local slab
                         * with the part starting at the next slab.
                         */
                        buf_my = pme->overlap[1].s2g1[pme->nodeid_minor]
                                 - pme->overlap[1].s2g0[pme->nodeid_minor + 1];
                        if (bCommX)
                        {
                            /* We index commbuf modulo the local grid size */
                            commbuf += buf_my * fft_nx * fft_nz;

                            bClearBuf   = bClearBufXY;
                            bClearBufXY = FALSE;
                        }
                        else
                        {
                            bClearBuf  = bClearBufY;
                            bClearBufY = FALSE;
                        }
                    }
                    else
                    {
                        commbuf    = commbuf_x;
                        buf_my     = fft_ny;
                        bClearBuf  = bClearBufX;
                        bClearBufX = FALSE;
                    }

                    /* Copy to the communication buffer */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x * buf_my + y) * fft_nz;
                            i0t = ((x - ox) * nsy + (y - oy)) * nsz - oz;

                            if (bClearBuf)
                            {
                                /* First access of commbuf; initialize it */
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0 + z] = grid_th[i0t + z];
                                }
                            }
                            else
                            {
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0 + z] += grid_th[i0t + z];
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
static void sum_fftgrid_dd(const gmx_pme_t* pme, real* fftgrid, int grid_index)
{
    ivec       local_fft_ndata, local_fft_offset, local_fft_size;
    int        send_index0, send_nindex;
    int        recv_nindex;
    int        recv_size_y;
    int        size_yx = 0;
    MPI_Status stat;
    int        x, y, z, indg, indb;

    /* Note that this routine is only used for forward communication.
     * Since the force gathering, unlike the coefficient spreading,
     * can be trivially parallelized over the particles,
     * the backward process is much simpler and can use the "old"
     * communication setup.
     */

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], local_fft_ndata, local_fft_offset,
                                   local_fft_size);
    if (pme->nnodes_minor > 1)
    {
        /* Minor dimension */
        const pme_overlap_t* overlap = &pme->overlap[1];

        if (pme->nnodes_major > 1)
        {
            size_yx = pme->overlap[0].comm_data[0].send_nindex;
        }
        int datasize = (local_fft_ndata[XX] + size_yx) * local_fft_ndata[ZZ];

        int send_size_y = overlap->send_size;

        for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
        {
            send_index0 = overlap->comm_data[ipulse].send_index0 - overlap->comm_data[0].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            /* We don't use recv_index0, as we always receive starting at 0 */
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_size_y = overlap->comm_data[ipulse].recv_size;

            auto* sendptr =
                    const_cast<real*>(overlap->sendbuf.data()) + send_index0 * local_fft_ndata[ZZ];
            auto* recvptr = const_cast<real*>(overlap->recvbuf.data());
            if (debug != nullptr)
            {
                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n", local_fft_ndata[XX],
                        send_nindex, local_fft_ndata[ZZ]);
            }
            int send_id = overlap->comm_data[ipulse].send_id;
            int recv_id = overlap->comm_data[ipulse].recv_id;
            MPI_Sendrecv(sendptr, send_size_y * datasize, GMX_MPI_REAL, send_id, ipulse, recvptr,
                         recv_size_y * datasize, GMX_MPI_REAL, recv_id, ipulse, overlap->mpi_comm, &stat);
            for (x = 0; x < local_fft_ndata[XX]; x++)
            {
                for (y = 0; y < recv_nindex; y++)
                {
                    indg = (x * local_fft_size[YY] + y) * local_fft_size[ZZ];
                    indb = (x * recv_size_y + y) * local_fft_ndata[ZZ];
                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
                    {
                        fftgrid[indg + z] += recvptr[indb + z];
                    }
                }
            }
            if (pme->nnodes_major > 1)
            {
                /* Copy from the received buffer to the send buffer for dim 0 */
                sendptr = const_cast<real*>(pme->overlap[0].sendbuf.data());
                for (x = 0; x < size_yx; x++)
                {
                    for (y = 0; y < recv_nindex; y++)
                    {
                        indg = (x * local_fft_ndata[YY] + y) * local_fft_ndata[ZZ];
                        indb = ((local_fft_ndata[XX] + x) * recv_size_y + y) * local_fft_ndata[ZZ];
                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
                        {
                            sendptr[indg + z] += recvptr[indb + z];
                        }
                    }
                }
            }
        }
    }
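
    /* With 2D (x+y) decomposition, the rows beyond local_fft_ndata[XX] in
     * the buffer received above hold the corner overlap volume. Adding them
     * into the dim-0 send buffer here forwards that corner along with the
     * x communication below, which avoids a separate diagonal message.
     */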
    /* We only support a single pulse here.
     * This is not a severe limitation, as this code is only used
     * with OpenMP, and with OpenMP the (PME) domains can be larger.
     */
    if (pme->nnodes_major > 1)
    {
        /* Major dimension */
        const pme_overlap_t* overlap = &pme->overlap[0];

        size_t ipulse = 0;

        send_nindex = overlap->comm_data[ipulse].send_nindex;
        /* We don't use recv_index0, as we always receive starting at 0 */
        recv_nindex = overlap->comm_data[ipulse].recv_nindex;
        if (debug != nullptr)
        {
            fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n", send_nindex, local_fft_ndata[YY],
                    local_fft_ndata[ZZ]);
        }
        int   datasize = local_fft_ndata[YY] * local_fft_ndata[ZZ];
        int   send_id  = overlap->comm_data[ipulse].send_id;
        int   recv_id  = overlap->comm_data[ipulse].recv_id;
        auto* sendptr  = const_cast<real*>(overlap->sendbuf.data());
        auto* recvptr  = const_cast<real*>(overlap->recvbuf.data());
        MPI_Sendrecv(sendptr, send_nindex * datasize, GMX_MPI_REAL, send_id, ipulse, recvptr,
                     recv_nindex * datasize, GMX_MPI_REAL, recv_id, ipulse, overlap->mpi_comm, &stat);
        for (x = 0; x < recv_nindex; x++)
        {
            for (y = 0; y < local_fft_ndata[YY]; y++)
            {
                indg = (x * local_fft_size[YY] + y) * local_fft_size[ZZ];
                indb = (x * local_fft_ndata[YY] + y) * local_fft_ndata[ZZ];
                for (z = 0; z < local_fft_ndata[ZZ]; z++)
                {
                    fftgrid[indg + z] += overlap->recvbuf[indb + z];
                }
            }
        }
    }
}
void spread_on_grid(const gmx_pme_t*  pme,
                    PmeAtomComm*      atc,
                    const pmegrids_t* grids,
                    gmx_bool          bCalcSplines,
                    gmx_bool          bSpread,
                    real*             fftgrid,
                    gmx_bool          bDoSplines,
                    int               grid_index)
{
#ifdef PME_TIME_THREADS
    gmx_cycles_t  c1, c2, c3, ct1a, ct1b, ct1c;
    static double cs1 = 0, cs2 = 0, cs3 = 0;
    static double cs1a[6] = { 0, 0, 0, 0, 0, 0 };
#endif

    const int nthread = pme->nthread;

    GMX_ASSERT(grids != nullptr || !bSpread, "If there's no grid, we cannot be spreading");
#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif

    if (bCalcSplines)
    {
#pragma omp parallel for num_threads(nthread) schedule(static)
        for (int thread = 0; thread < nthread; thread++)
        {
            try
            {
                int start, end;

                start = atc->numAtoms() * thread / nthread;
                end   = atc->numAtoms() * (thread + 1) / nthread;

                /* Compute fftgrid index for all atoms,
                 * with help of some extra variables.
                 */
                calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
        }
    }
#ifdef PME_TIME_THREADS
    c1 = omp_cyc_end(c1);
    cs1 += (double)c1;
#endif

#ifdef PME_TIME_THREADS
    c2 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            splinedata_t* spline;

            /* make local bsplines */
            if (grids == nullptr || !pme->bUseThreads)
            {
                spline = &atc->spline[0];

                spline->n = atc->numAtoms();
            }
            else
            {
                spline = &atc->spline[thread];

                if (grids->nthread == 1)
                {
                    /* One thread, we operate on all coefficients */
                    spline->n = atc->numAtoms();
                }
                else
                {
                    /* Get the indices our thread should operate on */
                    make_thread_local_ind(atc, thread, spline);
                }
            }

            if (bCalcSplines)
            {
                make_bsplines(spline->theta.coefficients, spline->dtheta.coefficients,
                              pme->pme_order, as_rvec_array(atc->fractx.data()), spline->n,
                              spline->ind.data(), atc->coefficient.data(), bDoSplines);
            }
            if (bSpread)
            {
                /* put local atoms on grid. */
                const pmegrid_t* grid = pme->bUseThreads ? &grids->grid_th[thread] : &grids->grid;

#ifdef PME_TIME_SPREAD
                ct1a = omp_cyc_start();
#endif
                spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);

                if (pme->bUseThreads)
                {
                    copy_local_grid(pme, grids, grid_index, thread, fftgrid);
                }
#ifdef PME_TIME_SPREAD
                ct1a = omp_cyc_end(ct1a);
                cs1a[thread] += (double)ct1a;
#endif
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
#ifdef PME_TIME_THREADS
    c2 = omp_cyc_end(c2);
    cs2 += (double)c2;
#endif
    if (bSpread && pme->bUseThreads)
    {
#ifdef PME_TIME_THREADS
        c3 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(grids->nthread) schedule(static)
        for (int thread = 0; thread < grids->nthread; thread++)
        {
            try
            {
                reduce_threadgrid_overlap(pme, grids, thread, fftgrid,
                                          const_cast<real*>(pme->overlap[0].sendbuf.data()),
                                          const_cast<real*>(pme->overlap[1].sendbuf.data()),
                                          grid_index);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
        }
#ifdef PME_TIME_THREADS
        c3 = omp_cyc_end(c3);
        cs3 += (double)c3;
#endif
        if (pme->nnodes > 1)
        {
            /* Communicate the overlapping part of the fftgrid.
             * For this communication call we need to check pme->bUseThreads
             * to have all ranks communicate here, regardless of pme->nthread.
             */
            sum_fftgrid_dd(pme, fftgrid, grid_index);
        }
    }
#ifdef PME_TIME_THREADS
    printf("idx %.2f spread %.2f red %.2f", cs1 * 1e-9, cs2 * 1e-9, cs3 * 1e-9);
#    ifdef PME_TIME_SPREAD
    for (int thread = 0; thread < nthread; thread++)
    {
        printf(" %.2f", cs1a[thread] * 1e-9);
    }
#    endif
    printf("\n");
#endif
}