2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
39 #include "pme-spread.h"
45 #include "gromacs/ewald/pme-internal.h"
46 #include "gromacs/ewald/pme-simd.h"
47 #include "gromacs/ewald/pme-spline-work.h"
48 #include "gromacs/legacyheaders/macros.h"
49 #include "gromacs/utility/smalloc.h"
51 static void calc_interpolation_idx(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
52 int start, int grid_index, int end, int thread)
55 int *idxptr, tix, tiy, tiz;
56 real *xptr, *fptr, tx, ty, tz;
57 real rxx, ryx, ryy, rzx, rzy, rzz;
59 int start_ix, start_iy, start_iz;
60 int *g2tx, *g2ty, *g2tz;
62 int *thread_idx = NULL;
63 thread_plist_t *tpl = NULL;
71 start_ix = pme->pmegrid_start_ix;
72 start_iy = pme->pmegrid_start_iy;
73 start_iz = pme->pmegrid_start_iz;
75 rxx = pme->recipbox[XX][XX];
76 ryx = pme->recipbox[YY][XX];
77 ryy = pme->recipbox[YY][YY];
78 rzx = pme->recipbox[ZZ][XX];
79 rzy = pme->recipbox[ZZ][YY];
80 rzz = pme->recipbox[ZZ][ZZ];
82 g2tx = pme->pmegrid[grid_index].g2t[XX];
83 g2ty = pme->pmegrid[grid_index].g2t[YY];
84 g2tz = pme->pmegrid[grid_index].g2t[ZZ];
86 bThreads = (atc->nthread > 1);
89 thread_idx = atc->thread_idx;
91 tpl = &atc->thread_plist[thread];
93 for (i = 0; i < atc->nthread; i++)
99 for (i = start; i < end; i++)
102 idxptr = atc->idx[i];
103 fptr = atc->fractx[i];
105 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
106 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
107 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
108 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
114 /* Because decomposition only occurs in x and y,
115 * we never have a fraction correction in z.
117 fptr[XX] = tx - tix + pme->fshx[tix];
118 fptr[YY] = ty - tiy + pme->fshy[tiy];
121 idxptr[XX] = pme->nnx[tix];
122 idxptr[YY] = pme->nny[tiy];
123 idxptr[ZZ] = pme->nnz[tiz];
126 range_check(idxptr[XX], 0, pme->pmegrid_nx);
127 range_check(idxptr[YY], 0, pme->pmegrid_ny);
128 range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
133 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
134 thread_idx[i] = thread_i;
141 /* Make a list of particle indices sorted on thread */
143 /* Get the cumulative count */
144 for (i = 1; i < atc->nthread; i++)
146 tpl_n[i] += tpl_n[i-1];
148 /* The current implementation distributes particles equally
149 * over the threads, so we could actually allocate for that
150 * in pme_realloc_atomcomm_things.
152 if (tpl_n[atc->nthread-1] > tpl->nalloc)
154 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
155 srenew(tpl->i, tpl->nalloc);
157 /* Set tpl_n to the cumulative start */
158 for (i = atc->nthread-1; i >= 1; i--)
160 tpl_n[i] = tpl_n[i-1];
164 /* Fill our thread local array with indices sorted on thread */
165 for (i = start; i < end; i++)
167 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
169 /* Now tpl_n contains the cummulative count again */
173 static void make_thread_local_ind(pme_atomcomm_t *atc,
174 int thread, splinedata_t *spline)
176 int n, t, i, start, end;
179 /* Combine the indices made by each thread into one index */
183 for (t = 0; t < atc->nthread; t++)
185 tpl = &atc->thread_plist[t];
186 /* Copy our part (start - end) from the list of thread t */
189 start = tpl->n[thread-1];
191 end = tpl->n[thread];
192 for (i = start; i < end; i++)
194 spline->ind[n++] = tpl->i[i];
201 /* Macro to force loop unrolling by fixing order.
202 * This gives a significant performance gain.
204 #define CALC_SPLINE(order) \
208 real data[PME_ORDER_MAX]; \
209 real ddata[PME_ORDER_MAX]; \
211 for (j = 0; (j < DIM); j++) \
215 /* dr is relative offset from lower cell limit */ \
220 for (k = 3; (k < order); k++) \
222 div = 1.0/(k - 1.0); \
223 data[k-1] = div*dr*data[k-2]; \
224 for (l = 1; (l < (k-1)); l++) \
226 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
229 data[0] = div*(1-dr)*data[0]; \
231 /* differentiate */ \
232 ddata[0] = -data[0]; \
233 for (k = 1; (k < order); k++) \
235 ddata[k] = data[k-1] - data[k]; \
238 div = 1.0/(order - 1); \
239 data[order-1] = div*dr*data[order-2]; \
240 for (l = 1; (l < (order-1)); l++) \
242 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
243 (order-l-dr)*data[order-l-1]); \
245 data[0] = div*(1 - dr)*data[0]; \
247 for (k = 0; k < order; k++) \
249 theta[j][i*order+k] = data[k]; \
250 dtheta[j][i*order+k] = ddata[k]; \
255 static void make_bsplines(splinevec theta, splinevec dtheta, int order,
256 rvec fractx[], int nr, int ind[], real coefficient[],
259 /* construct splines for local atoms */
263 for (i = 0; i < nr; i++)
265 /* With free energy we do not use the coefficient check.
266 * In most cases this will be more efficient than calling make_bsplines
267 * twice, since usually more than half the particles have non-zero coefficients.
270 if (bDoSplines || coefficient[ii] != 0.0)
275 case 4: CALC_SPLINE(4); break;
276 case 5: CALC_SPLINE(5); break;
277 default: CALC_SPLINE(order); break;
283 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
284 #define DO_BSPLINE(order) \
285 for (ithx = 0; (ithx < order); ithx++) \
287 index_x = (i0+ithx)*pny*pnz; \
288 valx = coefficient*thx[ithx]; \
290 for (ithy = 0; (ithy < order); ithy++) \
292 valxy = valx*thy[ithy]; \
293 index_xy = index_x+(j0+ithy)*pnz; \
295 for (ithz = 0; (ithz < order); ithz++) \
297 index_xyz = index_xy+(k0+ithz); \
298 grid[index_xyz] += valxy*thz[ithz]; \
304 static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid,
306 splinedata_t *spline,
307 struct pme_spline_work gmx_unused *work)
310 /* spread coefficients from home atoms to local grid */
313 int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
315 int order, norder, index_x, index_xy, index_xyz;
316 real valx, valxy, coefficient;
317 real *thx, *thy, *thz;
318 int localsize, bndsize;
319 int pnx, pny, pnz, ndatatot;
320 int offx, offy, offz;
322 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
323 real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
325 thz_aligned = gmx_simd4_align_r(thz_buffer);
328 pnx = pmegrid->s[XX];
329 pny = pmegrid->s[YY];
330 pnz = pmegrid->s[ZZ];
332 offx = pmegrid->offset[XX];
333 offy = pmegrid->offset[YY];
334 offz = pmegrid->offset[ZZ];
336 ndatatot = pnx*pny*pnz;
337 grid = pmegrid->grid;
338 for (i = 0; i < ndatatot; i++)
343 order = pmegrid->order;
345 for (nn = 0; nn < spline->n; nn++)
348 coefficient = atc->coefficient[n];
350 if (coefficient != 0)
352 idxptr = atc->idx[n];
355 i0 = idxptr[XX] - offx;
356 j0 = idxptr[YY] - offy;
357 k0 = idxptr[ZZ] - offz;
359 thx = spline->theta[XX] + norder;
360 thy = spline->theta[YY] + norder;
361 thz = spline->theta[ZZ] + norder;
366 #ifdef PME_SIMD4_SPREAD_GATHER
367 #ifdef PME_SIMD4_UNALIGNED
368 #define PME_SPREAD_SIMD4_ORDER4
370 #define PME_SPREAD_SIMD4_ALIGNED
373 #include "gromacs/ewald/pme-simd4.h"
379 #ifdef PME_SIMD4_SPREAD_GATHER
380 #define PME_SPREAD_SIMD4_ALIGNED
382 #include "gromacs/ewald/pme-simd4.h"
395 static void copy_local_grid(struct gmx_pme_t *pme, pmegrids_t *pmegrids,
396 int grid_index, int thread, real *fftgrid)
398 ivec local_fft_ndata, local_fft_offset, local_fft_size;
402 int offx, offy, offz, x, y, z, i0, i0t;
407 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
411 fft_my = local_fft_size[YY];
412 fft_mz = local_fft_size[ZZ];
414 pmegrid = &pmegrids->grid_th[thread];
416 nsx = pmegrid->s[XX];
417 nsy = pmegrid->s[YY];
418 nsz = pmegrid->s[ZZ];
420 for (d = 0; d < DIM; d++)
422 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
423 local_fft_ndata[d] - pmegrid->offset[d]);
426 offx = pmegrid->offset[XX];
427 offy = pmegrid->offset[YY];
428 offz = pmegrid->offset[ZZ];
430 /* Directly copy the non-overlapping parts of the local grids.
431 * This also initializes the full grid.
433 grid_th = pmegrid->grid;
434 for (x = 0; x < nf[XX]; x++)
436 for (y = 0; y < nf[YY]; y++)
438 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
439 i0t = (x*nsy + y)*nsz;
440 for (z = 0; z < nf[ZZ]; z++)
442 fftgrid[i0+z] = grid_th[i0t+z];
449 reduce_threadgrid_overlap(struct gmx_pme_t *pme,
450 const pmegrids_t *pmegrids, int thread,
451 real *fftgrid, real *commbuf_x, real *commbuf_y,
454 ivec local_fft_ndata, local_fft_offset, local_fft_size;
455 int fft_nx, fft_ny, fft_nz;
459 ivec localcopy_end, commcopy_end;
460 int offx, offy, offz, x, y, z, i0, i0t;
461 int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
462 gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
463 gmx_bool bCommX, bCommY;
466 const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
468 real *commbuf = NULL;
470 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
474 fft_nx = local_fft_ndata[XX];
475 fft_ny = local_fft_ndata[YY];
476 fft_nz = local_fft_ndata[ZZ];
478 fft_my = local_fft_size[YY];
479 fft_mz = local_fft_size[ZZ];
481 /* This routine is called when all thread have finished spreading.
482 * Here each thread sums grid contributions calculated by other threads
483 * to the thread local grid volume.
484 * To minimize the number of grid copying operations,
485 * this routines sums immediately from the pmegrid to the fftgrid.
488 /* Determine which part of the full node grid we should operate on,
489 * this is our thread local part of the full grid.
491 pmegrid = &pmegrids->grid_th[thread];
493 for (d = 0; d < DIM; d++)
495 /* Determine up to where our thread needs to copy from the
496 * thread-local charge spreading grid to the rank-local FFT grid.
497 * This is up to our spreading grid end minus order-1 and
498 * not beyond the local FFT grid.
501 min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
504 /* Determine up to where our thread needs to copy from the
505 * thread-local charge spreading grid to the communication buffer.
506 * Note: only relevant with communication, ignored otherwise.
508 commcopy_end[d] = localcopy_end[d];
509 if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
511 /* The last thread should copy up to the last pme grid line.
512 * When the rank-local FFT grid is narrower than pme-order,
513 * we need the max below to ensure copying of all data.
515 commcopy_end[d] = max(commcopy_end[d], pme->pme_order);
519 offx = pmegrid->offset[XX];
520 offy = pmegrid->offset[YY];
521 offz = pmegrid->offset[ZZ];
528 /* Now loop over all the thread data blocks that contribute
529 * to the grid region we (our thread) are operating on.
531 /* Note that fft_nx/y is equal to the number of grid points
532 * between the first point of our node grid and the one of the next node.
534 for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
536 fx = pmegrid->ci[XX] + sx;
541 fx += pmegrids->nc[XX];
543 bCommX = (pme->nnodes_major > 1);
545 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
546 ox += pmegrid_g->offset[XX];
547 /* Determine the end of our part of the source grid.
548 * Use our thread local source grid and target grid part
550 tx1 = min(ox + pmegrid_g->n[XX],
551 !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
553 for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
555 fy = pmegrid->ci[YY] + sy;
560 fy += pmegrids->nc[YY];
562 bCommY = (pme->nnodes_minor > 1);
564 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
565 oy += pmegrid_g->offset[YY];
566 /* Determine the end of our part of the source grid.
567 * Use our thread local source grid and target grid part
569 ty1 = min(oy + pmegrid_g->n[YY],
570 !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
572 for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
574 fz = pmegrid->ci[ZZ] + sz;
578 fz += pmegrids->nc[ZZ];
581 pmegrid_g = &pmegrids->grid_th[fz];
582 oz += pmegrid_g->offset[ZZ];
583 tz1 = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
585 if (sx == 0 && sy == 0 && sz == 0)
587 /* We have already added our local contribution
588 * before calling this routine, so skip it here.
593 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
595 pmegrid_f = &pmegrids->grid_th[thread_f];
597 grid_th = pmegrid_f->grid;
599 nsx = pmegrid_f->s[XX];
600 nsy = pmegrid_f->s[YY];
601 nsz = pmegrid_f->s[ZZ];
603 #ifdef DEBUG_PME_REDUCE
604 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
605 pme->nodeid, thread, thread_f,
606 pme->pmegrid_start_ix,
607 pme->pmegrid_start_iy,
608 pme->pmegrid_start_iz,
610 offx-ox, tx1-ox, offx, tx1,
611 offy-oy, ty1-oy, offy, ty1,
612 offz-oz, tz1-oz, offz, tz1);
615 if (!(bCommX || bCommY))
617 /* Copy from the thread local grid to the node grid */
618 for (x = offx; x < tx1; x++)
620 for (y = offy; y < ty1; y++)
622 i0 = (x*fft_my + y)*fft_mz;
623 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
624 for (z = offz; z < tz1; z++)
626 fftgrid[i0+z] += grid_th[i0t+z];
633 /* The order of this conditional decides
634 * where the corner volume gets stored with x+y decomp.
639 /* The y-size of the communication buffer is set by
640 * the overlap of the grid part of our local slab
641 * with the part starting at the next slab.
644 pme->overlap[1].s2g1[pme->nodeid_minor] -
645 pme->overlap[1].s2g0[pme->nodeid_minor+1];
648 /* We index commbuf modulo the local grid size */
649 commbuf += buf_my*fft_nx*fft_nz;
651 bClearBuf = bClearBufXY;
656 bClearBuf = bClearBufY;
664 bClearBuf = bClearBufX;
668 /* Copy to the communication buffer */
669 for (x = offx; x < tx1; x++)
671 for (y = offy; y < ty1; y++)
673 i0 = (x*buf_my + y)*fft_nz;
674 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
678 /* First access of commbuf, initialize it */
679 for (z = offz; z < tz1; z++)
681 commbuf[i0+z] = grid_th[i0t+z];
686 for (z = offz; z < tz1; z++)
688 commbuf[i0+z] += grid_th[i0t+z];
700 static void sum_fftgrid_dd(struct gmx_pme_t *pme, real *fftgrid, int grid_index)
702 ivec local_fft_ndata, local_fft_offset, local_fft_size;
703 pme_overlap_t *overlap;
704 int send_index0, send_nindex;
709 int send_size_y, recv_size_y;
710 int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
711 real *sendptr, *recvptr;
712 int x, y, z, indg, indb;
714 /* Note that this routine is only used for forward communication.
715 * Since the force gathering, unlike the coefficient spreading,
716 * can be trivially parallelized over the particles,
717 * the backwards process is much simpler and can use the "old"
718 * communication setup.
721 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
726 if (pme->nnodes_minor > 1)
728 /* Major dimension */
729 overlap = &pme->overlap[1];
731 if (pme->nnodes_major > 1)
733 size_yx = pme->overlap[0].comm_data[0].send_nindex;
739 datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
741 send_size_y = overlap->send_size;
743 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
745 send_id = overlap->send_id[ipulse];
746 recv_id = overlap->recv_id[ipulse];
748 overlap->comm_data[ipulse].send_index0 -
749 overlap->comm_data[0].send_index0;
750 send_nindex = overlap->comm_data[ipulse].send_nindex;
751 /* We don't use recv_index0, as we always receive starting at 0 */
752 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
753 recv_size_y = overlap->comm_data[ipulse].recv_size;
755 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
756 recvptr = overlap->recvbuf;
760 fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
761 local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
765 MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
767 recvptr, recv_size_y*datasize, GMX_MPI_REAL,
769 overlap->mpi_comm, &stat);
772 for (x = 0; x < local_fft_ndata[XX]; x++)
774 for (y = 0; y < recv_nindex; y++)
776 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
777 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
778 for (z = 0; z < local_fft_ndata[ZZ]; z++)
780 fftgrid[indg+z] += recvptr[indb+z];
785 if (pme->nnodes_major > 1)
787 /* Copy from the received buffer to the send buffer for dim 0 */
788 sendptr = pme->overlap[0].sendbuf;
789 for (x = 0; x < size_yx; x++)
791 for (y = 0; y < recv_nindex; y++)
793 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
794 indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
795 for (z = 0; z < local_fft_ndata[ZZ]; z++)
797 sendptr[indg+z] += recvptr[indb+z];
805 /* We only support a single pulse here.
806 * This is not a severe limitation, as this code is only used
807 * with OpenMP and with OpenMP the (PME) domains can be larger.
809 if (pme->nnodes_major > 1)
811 /* Major dimension */
812 overlap = &pme->overlap[0];
814 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
815 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
819 send_id = overlap->send_id[ipulse];
820 recv_id = overlap->recv_id[ipulse];
821 send_nindex = overlap->comm_data[ipulse].send_nindex;
822 /* We don't use recv_index0, as we always receive starting at 0 */
823 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
825 sendptr = overlap->sendbuf;
826 recvptr = overlap->recvbuf;
830 fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
831 send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
835 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
837 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
839 overlap->mpi_comm, &stat);
842 for (x = 0; x < recv_nindex; x++)
844 for (y = 0; y < local_fft_ndata[YY]; y++)
846 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
847 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
848 for (z = 0; z < local_fft_ndata[ZZ]; z++)
850 fftgrid[indg+z] += recvptr[indb+z];
857 void spread_on_grid(struct gmx_pme_t *pme,
858 pme_atomcomm_t *atc, pmegrids_t *grids,
859 gmx_bool bCalcSplines, gmx_bool bSpread,
860 real *fftgrid, gmx_bool bDoSplines, int grid_index)
863 #ifdef PME_TIME_THREADS
864 gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
865 static double cs1 = 0, cs2 = 0, cs3 = 0;
866 static double cs1a[6] = {0, 0, 0, 0, 0, 0};
870 nthread = pme->nthread;
873 #ifdef PME_TIME_THREADS
874 c1 = omp_cyc_start();
878 #pragma omp parallel for num_threads(nthread) schedule(static)
879 for (thread = 0; thread < nthread; thread++)
883 start = atc->n* thread /nthread;
884 end = atc->n*(thread+1)/nthread;
886 /* Compute fftgrid index for all atoms,
887 * with help of some extra variables.
889 calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
892 #ifdef PME_TIME_THREADS
893 c1 = omp_cyc_end(c1);
897 #ifdef PME_TIME_THREADS
898 c2 = omp_cyc_start();
900 #pragma omp parallel for num_threads(nthread) schedule(static)
901 for (thread = 0; thread < nthread; thread++)
903 splinedata_t *spline;
904 pmegrid_t *grid = NULL;
906 /* make local bsplines */
907 if (grids == NULL || !pme->bUseThreads)
909 spline = &atc->spline[0];
920 spline = &atc->spline[thread];
922 if (grids->nthread == 1)
924 /* One thread, we operate on all coefficients */
929 /* Get the indices our thread should operate on */
930 make_thread_local_ind(atc, thread, spline);
933 grid = &grids->grid_th[thread];
938 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
939 atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
944 /* put local atoms on grid. */
945 #ifdef PME_TIME_SPREAD
946 ct1a = omp_cyc_start();
948 spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
950 if (pme->bUseThreads)
952 copy_local_grid(pme, grids, grid_index, thread, fftgrid);
954 #ifdef PME_TIME_SPREAD
955 ct1a = omp_cyc_end(ct1a);
956 cs1a[thread] += (double)ct1a;
960 #ifdef PME_TIME_THREADS
961 c2 = omp_cyc_end(c2);
965 if (bSpread && pme->bUseThreads)
967 #ifdef PME_TIME_THREADS
968 c3 = omp_cyc_start();
970 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
971 for (thread = 0; thread < grids->nthread; thread++)
973 reduce_threadgrid_overlap(pme, grids, thread,
975 pme->overlap[0].sendbuf,
976 pme->overlap[1].sendbuf,
979 #ifdef PME_TIME_THREADS
980 c3 = omp_cyc_end(c3);
986 /* Communicate the overlapping part of the fftgrid.
987 * For this communication call we need to check pme->bUseThreads
988 * to have all ranks communicate here, regardless of pme->nthread.
990 sum_fftgrid_dd(pme, fftgrid, grid_index);
994 #ifdef PME_TIME_THREADS
998 printf("idx %.2f spread %.2f red %.2f",
999 cs1*1e-9, cs2*1e-9, cs3*1e-9);
1000 #ifdef PME_TIME_SPREAD
1001 for (thread = 0; thread < nthread; thread++)
1003 printf(" %.2f", cs1a[thread]*1e-9);