2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /* IMPORTANT FOR DEVELOPERS:
39 * Triclinic pme stuff isn't entirely trivial, and we've experienced
40 * some bugs during development (many of them due to me). To avoid
41 * this in the future, please check the following things if you make
42 * changes in this file:
44 * 1. You should obtain identical (at least to the PME precision)
45 * energies, forces, and virial for
46 * a rectangular box and a triclinic one where the z (or y) axis is
47 * tilted a whole box side. For instance you could use these boxes:
49 * rectangular triclinic
54 * 2. You should check the energy conservation in a triclinic box.
56 * It might seem an overkill, but better safe than sorry.
71 #include "gromacs/ewald/pme-internal.h"
72 #include "gromacs/fft/fft.h"
73 #include "gromacs/fft/parallel_3dfft.h"
74 #include "gromacs/legacyheaders/macros.h"
75 #include "gromacs/legacyheaders/nrnb.h"
76 #include "gromacs/legacyheaders/types/commrec.h"
77 #include "gromacs/legacyheaders/types/enums.h"
78 #include "gromacs/legacyheaders/types/forcerec.h"
79 #include "gromacs/legacyheaders/types/inputrec.h"
80 #include "gromacs/legacyheaders/types/nrnb.h"
81 #include "gromacs/math/gmxcomplex.h"
82 #include "gromacs/math/units.h"
83 #include "gromacs/math/vec.h"
84 #include "gromacs/math/vectypes.h"
85 /* Include the SIMD macro file and then check for support */
86 #include "gromacs/simd/simd.h"
87 #include "gromacs/simd/simd_math.h"
88 #include "gromacs/timing/cyclecounter.h"
89 #include "gromacs/timing/wallcycle.h"
90 #include "gromacs/timing/walltime_accounting.h"
91 #include "gromacs/utility/basedefinitions.h"
92 #include "gromacs/utility/fatalerror.h"
93 #include "gromacs/utility/gmxmpi.h"
94 #include "gromacs/utility/gmxomp.h"
95 #include "gromacs/utility/real.h"
96 #include "gromacs/utility/smalloc.h"
99 #include "gromacs/fileio/pdbio.h"
100 #include "gromacs/utility/cstringutil.h"
101 #include "gromacs/utility/futil.h"
104 #ifdef GMX_SIMD_HAVE_REAL
105 /* Turn on arbitrary width SIMD intrinsics for PME solve */
106 # define PME_SIMD_SOLVE
109 #define PME_GRID_QA 0 /* Gridindex for A-state for Q */
110 #define PME_GRID_C6A 2 /* Gridindex for A-state for LJ */
111 #define DO_Q 2 /* Electrostatic grids have index q<2 */
112 #define DO_Q_AND_LJ 4 /* non-LB LJ grids have index 2 <= q < 4 */
113 #define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
115 /* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
116 const real lb_scale_factor[] = {
117 1.0/64, 6.0/64, 15.0/64, 20.0/64,
118 15.0/64, 6.0/64, 1.0/64
121 /* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
122 const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
124 /* Check if we have 4-wide SIMD macro support */
125 #if (defined GMX_SIMD4_HAVE_REAL)
126 /* Do PME spread and gather with 4-wide SIMD.
127 * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
129 # define PME_SIMD4_SPREAD_GATHER
131 # if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
132 /* With PME-order=4 on x86, unaligned load+store is slightly faster
133 * than doubling all SIMD operations when using aligned load+store.
135 # define PME_SIMD4_UNALIGNED
140 /* #define PRT_FORCE */
141 /* conditions for on the fly time-measurement */
142 /* #define TAKETIME (step > 1 && timesteps < 10) */
143 #define TAKETIME FALSE
145 /* #define PME_TIME_THREADS */
148 #define mpi_type MPI_DOUBLE
150 #define mpi_type MPI_FLOAT
153 #ifdef PME_SIMD4_SPREAD_GATHER
154 # define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
156 /* We can use any alignment, apart from 0, so we use 4 reals */
157 # define SIMD4_ALIGNMENT (4*sizeof(real))
160 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
161 * to preserve alignment.
163 #define GMX_CACHE_SEP 64
165 /* We only define a maximum to be able to use local arrays without allocation.
166 * An order larger than 12 should never be needed, even for test cases.
167 * If needed it can be changed here.
169 #define PME_ORDER_MAX 12
171 /* Internal datastructures */
177 int recv_size; /* Receive buffer width, used with OpenMP */
180 typedef real *splinevec[DIM];
190 int *send_id, *recv_id;
191 int send_size; /* Send buffer width, used with OpenMP */
192 pme_grid_comm_t *comm_data;
198 int *n; /* Cumulative counts of the number of particles per thread */
199 int nalloc; /* Allocation size of i */
200 int *i; /* Particle indices ordered on thread index (n) */
214 int dimind; /* The index of the dimension, 0=x, 1=y */
221 int *node_dest; /* The nodes to send x and q to with DD */
222 int *node_src; /* The nodes to receive x and q from with DD */
223 int *buf_index; /* Index for commnode into the buffers */
230 int *count; /* The number of atoms to send to each node */
232 int *rcount; /* The number of atoms to receive */
239 gmx_bool bSpread; /* These coordinates are used for spreading */
242 rvec *fractx; /* Fractional coordinate relative to
243 * the lower cell boundary
246 int *thread_idx; /* Which thread should spread which coefficient */
247 thread_plist_t *thread_plist;
248 splinedata_t *spline;
255 ivec ci; /* The spatial location of this grid */
256 ivec n; /* The used size of *grid, including order-1 */
257 ivec offset; /* The grid offset from the full node grid */
258 int order; /* PME spreading order */
259 ivec s; /* The allocated size of *grid, s >= n */
260 real *grid; /* The grid local thread, size n */
264 pmegrid_t grid; /* The full node grid (non thread-local) */
265 int nthread; /* The number of threads operating on this grid */
266 ivec nc; /* The local spatial decomposition over the threads */
267 pmegrid_t *grid_th; /* Array of grids for each thread */
268 real *grid_all; /* Allocated array for the grids in *grid_th */
269 int **g2t; /* The grid to thread index */
270 ivec nthread_comm; /* The number of threads to communicate with */
274 #ifdef PME_SIMD4_SPREAD_GATHER
275 /* Masks for 4-wide SIMD aligned spreading and gathering */
276 gmx_simd4_bool_t mask_S0[6], mask_S1[6];
278 int dummy; /* C89 requires that struct has at least one member */
283 /* work data for solve_pme */
302 typedef struct gmx_pme {
303 int ndecompdim; /* The number of decomposition dimensions */
304 int nodeid; /* Our nodeid in mpi->mpi_comm */
307 int nnodes; /* The number of nodes doing PME */
312 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
314 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
317 gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */
318 int nthread; /* The number of threads doing PME on our rank */
320 gmx_bool bPPnode; /* Node also does particle-particle forces */
321 gmx_bool bFEP; /* Compute Free energy contribution */
324 int nkx, nky, nkz; /* Grid dimensions */
325 gmx_bool bP3M; /* Do P3M: optimize the influence function */
329 int ljpme_combination_rule; /* Type of combination rule in LJ-PME */
331 int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
333 pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
334 * includes overlap Grid indices are ordered as
336 * 0: Coloumb PME, state A
337 * 1: Coloumb PME, state B
339 * This can probably be done in a better way
340 * but this simple hack works for now
342 /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
343 int pmegrid_nx, pmegrid_ny, pmegrid_nz;
344 /* pmegrid_nz might be larger than strictly necessary to ensure
345 * memory alignment, pmegrid_nz_base gives the real base size.
348 /* The local PME grid starting indices */
349 int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
351 /* Work data for spreading and gathering */
352 pme_spline_work_t *spline_work;
354 real **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
355 /* inside the interpolation grid, but separate for 2D PME decomp. */
356 int fftgrid_nx, fftgrid_ny, fftgrid_nz;
358 t_complex **cfftgrid; /* Grids for complex FFT data */
360 int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
362 gmx_parallel_3dfft_t *pfft_setup;
364 int *nnx, *nny, *nnz;
365 real *fshx, *fshy, *fshz;
367 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
370 /* Buffers to store data for local atoms for L-B combination rule
371 * calculations in LJ-PME. lb_buf1 stores either the coefficients
372 * for spreading/gathering (in serial), or the C6 coefficient for
373 * local atoms (in parallel). lb_buf2 is only used in parallel,
374 * and stores the sigma values for local atoms. */
375 real *lb_buf1, *lb_buf2;
376 int lb_buf_nalloc; /* Allocation size for the above buffers. */
378 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
380 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
382 rvec *bufv; /* Communication buffer */
383 real *bufr; /* Communication buffer */
384 int buf_nalloc; /* The communication buffer size */
386 /* thread local work data for solve_pme */
389 /* Work data for sum_qgrid */
390 real * sum_qgrid_tmp;
391 real * sum_qgrid_dd_tmp;
394 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
395 int start, int grid_index, int end, int thread)
398 int *idxptr, tix, tiy, tiz;
399 real *xptr, *fptr, tx, ty, tz;
400 real rxx, ryx, ryy, rzx, rzy, rzz;
402 int start_ix, start_iy, start_iz;
403 int *g2tx, *g2ty, *g2tz;
405 int *thread_idx = NULL;
406 thread_plist_t *tpl = NULL;
414 start_ix = pme->pmegrid_start_ix;
415 start_iy = pme->pmegrid_start_iy;
416 start_iz = pme->pmegrid_start_iz;
418 rxx = pme->recipbox[XX][XX];
419 ryx = pme->recipbox[YY][XX];
420 ryy = pme->recipbox[YY][YY];
421 rzx = pme->recipbox[ZZ][XX];
422 rzy = pme->recipbox[ZZ][YY];
423 rzz = pme->recipbox[ZZ][ZZ];
425 g2tx = pme->pmegrid[grid_index].g2t[XX];
426 g2ty = pme->pmegrid[grid_index].g2t[YY];
427 g2tz = pme->pmegrid[grid_index].g2t[ZZ];
429 bThreads = (atc->nthread > 1);
432 thread_idx = atc->thread_idx;
434 tpl = &atc->thread_plist[thread];
436 for (i = 0; i < atc->nthread; i++)
442 for (i = start; i < end; i++)
445 idxptr = atc->idx[i];
446 fptr = atc->fractx[i];
448 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
449 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
450 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
451 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
457 /* Because decomposition only occurs in x and y,
458 * we never have a fraction correction in z.
460 fptr[XX] = tx - tix + pme->fshx[tix];
461 fptr[YY] = ty - tiy + pme->fshy[tiy];
464 idxptr[XX] = pme->nnx[tix];
465 idxptr[YY] = pme->nny[tiy];
466 idxptr[ZZ] = pme->nnz[tiz];
469 range_check(idxptr[XX], 0, pme->pmegrid_nx);
470 range_check(idxptr[YY], 0, pme->pmegrid_ny);
471 range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
476 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
477 thread_idx[i] = thread_i;
484 /* Make a list of particle indices sorted on thread */
486 /* Get the cumulative count */
487 for (i = 1; i < atc->nthread; i++)
489 tpl_n[i] += tpl_n[i-1];
491 /* The current implementation distributes particles equally
492 * over the threads, so we could actually allocate for that
493 * in pme_realloc_atomcomm_things.
495 if (tpl_n[atc->nthread-1] > tpl->nalloc)
497 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
498 srenew(tpl->i, tpl->nalloc);
500 /* Set tpl_n to the cumulative start */
501 for (i = atc->nthread-1; i >= 1; i--)
503 tpl_n[i] = tpl_n[i-1];
507 /* Fill our thread local array with indices sorted on thread */
508 for (i = start; i < end; i++)
510 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
512 /* Now tpl_n contains the cummulative count again */
516 static void make_thread_local_ind(pme_atomcomm_t *atc,
517 int thread, splinedata_t *spline)
519 int n, t, i, start, end;
522 /* Combine the indices made by each thread into one index */
526 for (t = 0; t < atc->nthread; t++)
528 tpl = &atc->thread_plist[t];
529 /* Copy our part (start - end) from the list of thread t */
532 start = tpl->n[thread-1];
534 end = tpl->n[thread];
535 for (i = start; i < end; i++)
537 spline->ind[n++] = tpl->i[i];
545 static void pme_calc_pidx(int start, int end,
546 matrix recipbox, rvec x[],
547 pme_atomcomm_t *atc, int *count)
552 real rxx, ryx, rzx, ryy, rzy;
555 /* Calculate PME task index (pidx) for each grid index.
556 * Here we always assign equally sized slabs to each node
557 * for load balancing reasons (the PME grid spacing is not used).
563 /* Reset the count */
564 for (i = 0; i < nslab; i++)
569 if (atc->dimind == 0)
571 rxx = recipbox[XX][XX];
572 ryx = recipbox[YY][XX];
573 rzx = recipbox[ZZ][XX];
574 /* Calculate the node index in x-dimension */
575 for (i = start; i < end; i++)
578 /* Fractional coordinates along box vectors */
579 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
580 si = (int)(s + 2*nslab) % nslab;
587 ryy = recipbox[YY][YY];
588 rzy = recipbox[ZZ][YY];
589 /* Calculate the node index in y-dimension */
590 for (i = start; i < end; i++)
593 /* Fractional coordinates along box vectors */
594 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
595 si = (int)(s + 2*nslab) % nslab;
602 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
605 int nthread, thread, slab;
607 nthread = atc->nthread;
609 #pragma omp parallel for num_threads(nthread) schedule(static)
610 for (thread = 0; thread < nthread; thread++)
612 pme_calc_pidx(natoms* thread /nthread,
613 natoms*(thread+1)/nthread,
614 recipbox, x, atc, atc->count_thread[thread]);
616 /* Non-parallel reduction, since nslab is small */
618 for (thread = 1; thread < nthread; thread++)
620 for (slab = 0; slab < atc->nslab; slab++)
622 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
627 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
629 const int padding = 4;
632 srenew(th[XX], nalloc);
633 srenew(th[YY], nalloc);
634 /* In z we add padding, this is only required for the aligned SIMD code */
635 sfree_aligned(*ptr_z);
636 snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
637 th[ZZ] = *ptr_z + padding;
639 for (i = 0; i < padding; i++)
642 (*ptr_z)[padding+nalloc+i] = 0;
646 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
650 srenew(spline->ind, atc->nalloc);
651 /* Initialize the index to identity so it works without threads */
652 for (i = 0; i < atc->nalloc; i++)
657 realloc_splinevec(spline->theta, &spline->ptr_theta_z,
658 atc->pme_order*atc->nalloc);
659 realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
660 atc->pme_order*atc->nalloc);
663 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
665 int nalloc_old, i, j, nalloc_tpl;
667 /* We have to avoid a NULL pointer for atc->x to avoid
668 * possible fatal errors in MPI routines.
670 if (atc->n > atc->nalloc || atc->nalloc == 0)
672 nalloc_old = atc->nalloc;
673 atc->nalloc = over_alloc_dd(max(atc->n, 1));
677 srenew(atc->x, atc->nalloc);
678 srenew(atc->coefficient, atc->nalloc);
679 srenew(atc->f, atc->nalloc);
680 for (i = nalloc_old; i < atc->nalloc; i++)
682 clear_rvec(atc->f[i]);
687 srenew(atc->fractx, atc->nalloc);
688 srenew(atc->idx, atc->nalloc);
690 if (atc->nthread > 1)
692 srenew(atc->thread_idx, atc->nalloc);
695 for (i = 0; i < atc->nthread; i++)
697 pme_realloc_splinedata(&atc->spline[i], atc);
703 static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
704 gmx_bool gmx_unused bBackward, int gmx_unused shift,
705 void gmx_unused *buf_s, int gmx_unused nbyte_s,
706 void gmx_unused *buf_r, int gmx_unused nbyte_r)
712 if (bBackward == FALSE)
714 dest = atc->node_dest[shift];
715 src = atc->node_src[shift];
719 dest = atc->node_src[shift];
720 src = atc->node_dest[shift];
723 if (nbyte_s > 0 && nbyte_r > 0)
725 MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
727 buf_r, nbyte_r, MPI_BYTE,
729 atc->mpi_comm, &stat);
731 else if (nbyte_s > 0)
733 MPI_Send(buf_s, nbyte_s, MPI_BYTE,
737 else if (nbyte_r > 0)
739 MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
741 atc->mpi_comm, &stat);
746 static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
747 int n, gmx_bool bX, rvec *x, real *data,
750 int *commnode, *buf_index;
751 int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
753 commnode = atc->node_dest;
754 buf_index = atc->buf_index;
756 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
759 for (i = 0; i < nnodes_comm; i++)
761 buf_index[commnode[i]] = nsend;
762 nsend += atc->count[commnode[i]];
766 if (atc->count[atc->nodeid] + nsend != n)
768 gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
769 "This usually means that your system is not well equilibrated.",
770 n - (atc->count[atc->nodeid] + nsend),
771 pme->nodeid, 'x'+atc->dimind);
774 if (nsend > pme->buf_nalloc)
776 pme->buf_nalloc = over_alloc_dd(nsend);
777 srenew(pme->bufv, pme->buf_nalloc);
778 srenew(pme->bufr, pme->buf_nalloc);
781 atc->n = atc->count[atc->nodeid];
782 for (i = 0; i < nnodes_comm; i++)
784 scount = atc->count[commnode[i]];
785 /* Communicate the count */
788 fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
789 atc->dimind, atc->nodeid, commnode[i], scount);
791 pme_dd_sendrecv(atc, FALSE, i,
792 &scount, sizeof(int),
793 &atc->rcount[i], sizeof(int));
794 atc->n += atc->rcount[i];
797 pme_realloc_atomcomm_things(atc);
801 for (i = 0; i < n; i++)
804 if (node == atc->nodeid)
806 /* Copy direct to the receive buffer */
809 copy_rvec(x[i], atc->x[local_pos]);
811 atc->coefficient[local_pos] = data[i];
816 /* Copy to the send buffer */
819 copy_rvec(x[i], pme->bufv[buf_index[node]]);
821 pme->bufr[buf_index[node]] = data[i];
827 for (i = 0; i < nnodes_comm; i++)
829 scount = atc->count[commnode[i]];
830 rcount = atc->rcount[i];
831 if (scount > 0 || rcount > 0)
835 /* Communicate the coordinates */
836 pme_dd_sendrecv(atc, FALSE, i,
837 pme->bufv[buf_pos], scount*sizeof(rvec),
838 atc->x[local_pos], rcount*sizeof(rvec));
840 /* Communicate the coefficients */
841 pme_dd_sendrecv(atc, FALSE, i,
842 pme->bufr+buf_pos, scount*sizeof(real),
843 atc->coefficient+local_pos, rcount*sizeof(real));
845 local_pos += atc->rcount[i];
850 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
854 int *commnode, *buf_index;
855 int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
857 commnode = atc->node_dest;
858 buf_index = atc->buf_index;
860 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
862 local_pos = atc->count[atc->nodeid];
864 for (i = 0; i < nnodes_comm; i++)
866 scount = atc->rcount[i];
867 rcount = atc->count[commnode[i]];
868 if (scount > 0 || rcount > 0)
870 /* Communicate the forces */
871 pme_dd_sendrecv(atc, TRUE, i,
872 atc->f[local_pos], scount*sizeof(rvec),
873 pme->bufv[buf_pos], rcount*sizeof(rvec));
876 buf_index[commnode[i]] = buf_pos;
883 for (i = 0; i < n; i++)
886 if (node == atc->nodeid)
888 /* Add from the local force array */
889 rvec_inc(f[i], atc->f[local_pos]);
894 /* Add from the receive buffer */
895 rvec_inc(f[i], pme->bufv[buf_index[node]]);
902 for (i = 0; i < n; i++)
905 if (node == atc->nodeid)
907 /* Copy from the local force array */
908 copy_rvec(atc->f[local_pos], f[i]);
913 /* Copy from the receive buffer */
914 copy_rvec(pme->bufv[buf_index[node]], f[i]);
922 static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
924 pme_overlap_t *overlap;
925 int send_index0, send_nindex;
926 int recv_index0, recv_nindex;
928 int i, j, k, ix, iy, iz, icnt;
929 int ipulse, send_id, recv_id, datasize;
931 real *sendptr, *recvptr;
933 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
934 overlap = &pme->overlap[1];
936 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
938 /* Since we have already (un)wrapped the overlap in the z-dimension,
939 * we only have to communicate 0 to nkz (not pmegrid_nz).
941 if (direction == GMX_SUM_GRID_FORWARD)
943 send_id = overlap->send_id[ipulse];
944 recv_id = overlap->recv_id[ipulse];
945 send_index0 = overlap->comm_data[ipulse].send_index0;
946 send_nindex = overlap->comm_data[ipulse].send_nindex;
947 recv_index0 = overlap->comm_data[ipulse].recv_index0;
948 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
952 send_id = overlap->recv_id[ipulse];
953 recv_id = overlap->send_id[ipulse];
954 send_index0 = overlap->comm_data[ipulse].recv_index0;
955 send_nindex = overlap->comm_data[ipulse].recv_nindex;
956 recv_index0 = overlap->comm_data[ipulse].send_index0;
957 recv_nindex = overlap->comm_data[ipulse].send_nindex;
960 /* Copy data to contiguous send buffer */
963 fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
964 pme->nodeid, overlap->nodeid, send_id,
965 pme->pmegrid_start_iy,
966 send_index0-pme->pmegrid_start_iy,
967 send_index0-pme->pmegrid_start_iy+send_nindex);
970 for (i = 0; i < pme->pmegrid_nx; i++)
973 for (j = 0; j < send_nindex; j++)
975 iy = j + send_index0 - pme->pmegrid_start_iy;
976 for (k = 0; k < pme->nkz; k++)
979 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
984 datasize = pme->pmegrid_nx * pme->nkz;
986 MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
988 overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
990 overlap->mpi_comm, &stat);
992 /* Get data from contiguous recv buffer */
995 fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
996 pme->nodeid, overlap->nodeid, recv_id,
997 pme->pmegrid_start_iy,
998 recv_index0-pme->pmegrid_start_iy,
999 recv_index0-pme->pmegrid_start_iy+recv_nindex);
1002 for (i = 0; i < pme->pmegrid_nx; i++)
1005 for (j = 0; j < recv_nindex; j++)
1007 iy = j + recv_index0 - pme->pmegrid_start_iy;
1008 for (k = 0; k < pme->nkz; k++)
1011 if (direction == GMX_SUM_GRID_FORWARD)
1013 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1017 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1024 /* Major dimension is easier, no copying required,
1025 * but we might have to sum to separate array.
1026 * Since we don't copy, we have to communicate up to pmegrid_nz,
1027 * not nkz as for the minor direction.
1029 overlap = &pme->overlap[0];
1031 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1033 if (direction == GMX_SUM_GRID_FORWARD)
1035 send_id = overlap->send_id[ipulse];
1036 recv_id = overlap->recv_id[ipulse];
1037 send_index0 = overlap->comm_data[ipulse].send_index0;
1038 send_nindex = overlap->comm_data[ipulse].send_nindex;
1039 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1040 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1041 recvptr = overlap->recvbuf;
1045 send_id = overlap->recv_id[ipulse];
1046 recv_id = overlap->send_id[ipulse];
1047 send_index0 = overlap->comm_data[ipulse].recv_index0;
1048 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1049 recv_index0 = overlap->comm_data[ipulse].send_index0;
1050 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1051 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1054 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1055 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1059 fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
1060 pme->nodeid, overlap->nodeid, send_id,
1061 pme->pmegrid_start_ix,
1062 send_index0-pme->pmegrid_start_ix,
1063 send_index0-pme->pmegrid_start_ix+send_nindex);
1064 fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
1065 pme->nodeid, overlap->nodeid, recv_id,
1066 pme->pmegrid_start_ix,
1067 recv_index0-pme->pmegrid_start_ix,
1068 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1071 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1073 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1075 overlap->mpi_comm, &stat);
1077 /* ADD data from contiguous recv buffer */
1078 if (direction == GMX_SUM_GRID_FORWARD)
1080 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1081 for (i = 0; i < recv_nindex*datasize; i++)
1083 p[i] += overlap->recvbuf[i];
1091 static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
1093 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1094 ivec local_pme_size;
1098 /* Dimensions should be identical for A/B grid, so we just use A here */
1099 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1104 local_pme_size[0] = pme->pmegrid_nx;
1105 local_pme_size[1] = pme->pmegrid_ny;
1106 local_pme_size[2] = pme->pmegrid_nz;
1108 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1109 the offset is identical, and the PME grid always has more data (due to overlap)
1116 sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1117 fp = gmx_ffopen(fn, "w");
1118 sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1119 fp2 = gmx_ffopen(fn, "w");
1122 for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1124 for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1126 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1128 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1129 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1130 fftgrid[fftidx] = pmegrid[pmeidx];
1132 val = 100*pmegrid[pmeidx];
1133 if (pmegrid[pmeidx] != 0)
1135 gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
1136 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
1138 if (pmegrid[pmeidx] != 0)
1140 fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
1142 pme->pmegrid_start_ix + ix,
1143 pme->pmegrid_start_iy + iy,
1144 pme->pmegrid_start_iz + iz,
1160 static gmx_cycles_t omp_cyc_start()
1162 return gmx_cycles_read();
1165 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1167 return gmx_cycles_read() - c;
1171 static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
1172 int nthread, int thread)
1174 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1175 ivec local_pme_size;
1176 int ixy0, ixy1, ixy, ix, iy, iz;
1178 #ifdef PME_TIME_THREADS
1180 static double cs1 = 0;
1184 #ifdef PME_TIME_THREADS
1185 c1 = omp_cyc_start();
1187 /* Dimensions should be identical for A/B grid, so we just use A here */
1188 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1193 local_pme_size[0] = pme->pmegrid_nx;
1194 local_pme_size[1] = pme->pmegrid_ny;
1195 local_pme_size[2] = pme->pmegrid_nz;
1197 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1198 the offset is identical, and the PME grid always has more data (due to overlap)
1200 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1201 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1203 for (ixy = ixy0; ixy < ixy1; ixy++)
1205 ix = ixy/local_fft_ndata[YY];
1206 iy = ixy - ix*local_fft_ndata[YY];
1208 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1209 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1210 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1212 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1216 #ifdef PME_TIME_THREADS
1217 c1 = omp_cyc_end(c1);
1222 printf("copy %.2f\n", cs1*1e-9);
1230 static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1232 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1238 pnx = pme->pmegrid_nx;
1239 pny = pme->pmegrid_ny;
1240 pnz = pme->pmegrid_nz;
1242 overlap = pme->pme_order - 1;
1244 /* Add periodic overlap in z */
1245 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1247 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1249 for (iz = 0; iz < overlap; iz++)
1251 pmegrid[(ix*pny+iy)*pnz+iz] +=
1252 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1257 if (pme->nnodes_minor == 1)
1259 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1261 for (iy = 0; iy < overlap; iy++)
1263 for (iz = 0; iz < nz; iz++)
1265 pmegrid[(ix*pny+iy)*pnz+iz] +=
1266 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1272 if (pme->nnodes_major == 1)
1274 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1276 for (ix = 0; ix < overlap; ix++)
1278 for (iy = 0; iy < ny_x; iy++)
1280 for (iz = 0; iz < nz; iz++)
1282 pmegrid[(ix*pny+iy)*pnz+iz] +=
1283 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1291 static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1293 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1299 pnx = pme->pmegrid_nx;
1300 pny = pme->pmegrid_ny;
1301 pnz = pme->pmegrid_nz;
1303 overlap = pme->pme_order - 1;
1305 if (pme->nnodes_major == 1)
1307 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1309 for (ix = 0; ix < overlap; ix++)
1313 for (iy = 0; iy < ny_x; iy++)
1315 for (iz = 0; iz < nz; iz++)
1317 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1318 pmegrid[(ix*pny+iy)*pnz+iz];
1324 if (pme->nnodes_minor == 1)
1326 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1327 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1331 for (iy = 0; iy < overlap; iy++)
1333 for (iz = 0; iz < nz; iz++)
1335 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1336 pmegrid[(ix*pny+iy)*pnz+iz];
1342 /* Copy periodic overlap in z */
1343 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1344 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1348 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1350 for (iz = 0; iz < overlap; iz++)
1352 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1353 pmegrid[(ix*pny+iy)*pnz+iz];
1360 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1361 #define DO_BSPLINE(order) \
1362 for (ithx = 0; (ithx < order); ithx++) \
1364 index_x = (i0+ithx)*pny*pnz; \
1365 valx = coefficient*thx[ithx]; \
1367 for (ithy = 0; (ithy < order); ithy++) \
1369 valxy = valx*thy[ithy]; \
1370 index_xy = index_x+(j0+ithy)*pnz; \
1372 for (ithz = 0; (ithz < order); ithz++) \
1374 index_xyz = index_xy+(k0+ithz); \
1375 grid[index_xyz] += valxy*thz[ithz]; \
1381 static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid,
1382 pme_atomcomm_t *atc,
1383 splinedata_t *spline,
1384 pme_spline_work_t gmx_unused *work)
1387 /* spread coefficients from home atoms to local grid */
1390 int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1392 int order, norder, index_x, index_xy, index_xyz;
1393 real valx, valxy, coefficient;
1394 real *thx, *thy, *thz;
1395 int localsize, bndsize;
1396 int pnx, pny, pnz, ndatatot;
1397 int offx, offy, offz;
1399 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
1400 real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
1402 thz_aligned = gmx_simd4_align_r(thz_buffer);
1405 pnx = pmegrid->s[XX];
1406 pny = pmegrid->s[YY];
1407 pnz = pmegrid->s[ZZ];
1409 offx = pmegrid->offset[XX];
1410 offy = pmegrid->offset[YY];
1411 offz = pmegrid->offset[ZZ];
1413 ndatatot = pnx*pny*pnz;
1414 grid = pmegrid->grid;
1415 for (i = 0; i < ndatatot; i++)
1420 order = pmegrid->order;
1422 for (nn = 0; nn < spline->n; nn++)
1424 n = spline->ind[nn];
1425 coefficient = atc->coefficient[n];
1427 if (coefficient != 0)
1429 idxptr = atc->idx[n];
1432 i0 = idxptr[XX] - offx;
1433 j0 = idxptr[YY] - offy;
1434 k0 = idxptr[ZZ] - offz;
1436 thx = spline->theta[XX] + norder;
1437 thy = spline->theta[YY] + norder;
1438 thz = spline->theta[ZZ] + norder;
1443 #ifdef PME_SIMD4_SPREAD_GATHER
1444 #ifdef PME_SIMD4_UNALIGNED
1445 #define PME_SPREAD_SIMD4_ORDER4
1447 #define PME_SPREAD_SIMD4_ALIGNED
1450 #include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
1456 #ifdef PME_SIMD4_SPREAD_GATHER
1457 #define PME_SPREAD_SIMD4_ALIGNED
1459 #include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
1472 static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
1474 #ifdef PME_SIMD4_SPREAD_GATHER
1476 #ifndef PME_SIMD4_UNALIGNED
1481 /* Round nz up to a multiple of 4 to ensure alignment */
1482 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1487 static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
1489 #ifdef PME_SIMD4_SPREAD_GATHER
1490 #ifndef PME_SIMD4_UNALIGNED
1493 /* Add extra elements to ensured aligned operations do not go
1494 * beyond the allocated grid size.
1495 * Note that for pme_order=5, the pme grid z-size alignment
1496 * ensures that we will not go beyond the grid size.
1504 static void pmegrid_init(pmegrid_t *grid,
1505 int cx, int cy, int cz,
1506 int x0, int y0, int z0,
1507 int x1, int y1, int z1,
1508 gmx_bool set_alignment,
1517 grid->offset[XX] = x0;
1518 grid->offset[YY] = y0;
1519 grid->offset[ZZ] = z0;
1520 grid->n[XX] = x1 - x0 + pme_order - 1;
1521 grid->n[YY] = y1 - y0 + pme_order - 1;
1522 grid->n[ZZ] = z1 - z0 + pme_order - 1;
1523 copy_ivec(grid->n, grid->s);
1526 set_grid_alignment(&nz, pme_order);
1531 else if (nz != grid->s[ZZ])
1533 gmx_incons("pmegrid_init call with an unaligned z size");
1536 grid->order = pme_order;
1539 gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1540 set_gridsize_alignment(&gridsize, pme_order);
1541 snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
1549 static int div_round_up(int enumerator, int denominator)
1551 return (enumerator + denominator - 1)/denominator;
1554 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1557 int gsize_opt, gsize;
1562 for (nsx = 1; nsx <= nthread; nsx++)
1564 if (nthread % nsx == 0)
1566 for (nsy = 1; nsy <= nthread; nsy++)
1568 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1570 nsz = nthread/(nsx*nsy);
1572 /* Determine the number of grid points per thread */
1574 (div_round_up(n[XX], nsx) + ovl)*
1575 (div_round_up(n[YY], nsy) + ovl)*
1576 (div_round_up(n[ZZ], nsz) + ovl);
1578 /* Minimize the number of grids points per thread
1579 * and, secondarily, the number of cuts in minor dimensions.
1581 if (gsize_opt == -1 ||
1582 gsize < gsize_opt ||
1583 (gsize == gsize_opt &&
1584 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1596 env = getenv("GMX_PME_THREAD_DIVISION");
1599 sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1602 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1604 gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1608 static void pmegrids_init(pmegrids_t *grids,
1609 int nx, int ny, int nz, int nz_base,
1611 gmx_bool bUseThreads,
1616 ivec n, n_base, g0, g1;
1617 int t, x, y, z, d, i, tfac;
1618 int max_comm_lines = -1;
1620 n[XX] = nx - (pme_order - 1);
1621 n[YY] = ny - (pme_order - 1);
1622 n[ZZ] = nz - (pme_order - 1);
1624 copy_ivec(n, n_base);
1625 n_base[ZZ] = nz_base;
1627 pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
1630 grids->nthread = nthread;
1632 make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1639 for (d = 0; d < DIM; d++)
1641 nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1643 set_grid_alignment(&nst[ZZ], pme_order);
1647 fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1648 grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
1649 fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1651 nst[XX], nst[YY], nst[ZZ]);
1654 snew(grids->grid_th, grids->nthread);
1656 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1657 set_gridsize_alignment(&gridsize, pme_order);
1658 snew_aligned(grids->grid_all,
1659 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1662 for (x = 0; x < grids->nc[XX]; x++)
1664 for (y = 0; y < grids->nc[YY]; y++)
1666 for (z = 0; z < grids->nc[ZZ]; z++)
1668 pmegrid_init(&grids->grid_th[t],
1670 (n[XX]*(x ))/grids->nc[XX],
1671 (n[YY]*(y ))/grids->nc[YY],
1672 (n[ZZ]*(z ))/grids->nc[ZZ],
1673 (n[XX]*(x+1))/grids->nc[XX],
1674 (n[YY]*(y+1))/grids->nc[YY],
1675 (n[ZZ]*(z+1))/grids->nc[ZZ],
1678 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1686 grids->grid_th = NULL;
1689 snew(grids->g2t, DIM);
1691 for (d = DIM-1; d >= 0; d--)
1693 snew(grids->g2t[d], n[d]);
1695 for (i = 0; i < n[d]; i++)
1697 /* The second check should match the parameters
1698 * of the pmegrid_init call above.
1700 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1704 grids->g2t[d][i] = t*tfac;
1707 tfac *= grids->nc[d];
1711 case XX: max_comm_lines = overlap_x; break;
1712 case YY: max_comm_lines = overlap_y; break;
1713 case ZZ: max_comm_lines = pme_order - 1; break;
1715 grids->nthread_comm[d] = 0;
1716 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1717 grids->nthread_comm[d] < grids->nc[d])
1719 grids->nthread_comm[d]++;
1723 fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1724 'x'+d, grids->nthread_comm[d]);
1726 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1727 * work, but this is not a problematic restriction.
1729 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1731 gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1737 static void pmegrids_destroy(pmegrids_t *grids)
1741 if (grids->grid.grid != NULL)
1743 sfree(grids->grid.grid);
1745 if (grids->nthread > 0)
1747 for (t = 0; t < grids->nthread; t++)
1749 sfree(grids->grid_th[t].grid);
1751 sfree(grids->grid_th);
1757 static void realloc_work(pme_work_t *work, int nkx)
1761 if (nkx > work->nalloc)
1764 srenew(work->mhx, work->nalloc);
1765 srenew(work->mhy, work->nalloc);
1766 srenew(work->mhz, work->nalloc);
1767 srenew(work->m2, work->nalloc);
1768 /* Allocate an aligned pointer for SIMD operations, including extra
1769 * elements at the end for padding.
1771 #ifdef PME_SIMD_SOLVE
1772 simd_width = GMX_SIMD_REAL_WIDTH;
1774 /* We can use any alignment, apart from 0, so we use 4 */
1777 sfree_aligned(work->denom);
1778 sfree_aligned(work->tmp1);
1779 sfree_aligned(work->tmp2);
1780 sfree_aligned(work->eterm);
1781 snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
1782 snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real));
1783 snew_aligned(work->tmp2, work->nalloc+simd_width, simd_width*sizeof(real));
1784 snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
1785 srenew(work->m2inv, work->nalloc);
1787 for (i = 0; i < work->nalloc+simd_width; i++)
1789 work->denom[i] = 1; /* init to 1 to avoid 1/0 exceptions of simd padded elements */
1796 static void free_work(pme_work_t *work)
1802 sfree_aligned(work->denom);
1803 sfree_aligned(work->tmp1);
1804 sfree_aligned(work->tmp2);
1805 sfree_aligned(work->eterm);
1810 #if defined PME_SIMD_SOLVE
1811 /* Calculate exponentials through SIMD */
1812 gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1815 const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
1816 gmx_simd_real_t f_simd;
1818 gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
1820 f_simd = gmx_simd_set1_r(f);
1821 /* We only need to calculate from start. But since start is 0 or 1
1822 * and we want to use aligned loads/stores, we always start from 0.
1824 for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
1826 tmp_d1 = gmx_simd_load_r(d_aligned+kx);
1827 d_inv = gmx_simd_inv_r(tmp_d1);
1828 tmp_r = gmx_simd_load_r(r_aligned+kx);
1829 tmp_r = gmx_simd_exp_r(tmp_r);
1830 tmp_e = gmx_simd_mul_r(f_simd, d_inv);
1831 tmp_e = gmx_simd_mul_r(tmp_e, tmp_r);
1832 gmx_simd_store_r(e_aligned+kx, tmp_e);
1837 gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
1840 for (kx = start; kx < end; kx++)
1844 for (kx = start; kx < end; kx++)
1848 for (kx = start; kx < end; kx++)
1850 e[kx] = f*r[kx]*d[kx];
1855 #if defined PME_SIMD_SOLVE
1856 /* Calculate exponentials through SIMD */
1857 gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
1859 gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
1860 const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
1862 for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
1864 /* We only need to calculate from start. But since start is 0 or 1
1865 * and we want to use aligned loads/stores, we always start from 0.
1867 tmp_d = gmx_simd_load_r(d_aligned+kx);
1868 d_inv = gmx_simd_inv_r(tmp_d);
1869 gmx_simd_store_r(d_aligned+kx, d_inv);
1870 tmp_r = gmx_simd_load_r(r_aligned+kx);
1871 tmp_r = gmx_simd_exp_r(tmp_r);
1872 gmx_simd_store_r(r_aligned+kx, tmp_r);
1873 tmp_mk = gmx_simd_load_r(factor_aligned+kx);
1874 tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
1875 gmx_simd_store_r(factor_aligned+kx, tmp_fac);
1879 gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
1883 for (kx = start; kx < end; kx++)
1888 for (kx = start; kx < end; kx++)
1893 for (kx = start; kx < end; kx++)
1896 tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
1901 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1902 real ewaldcoeff, real vol,
1904 int nthread, int thread)
1906 /* do recip sum over local cells in grid */
1907 /* y major, z middle, x minor or continuous */
1909 int kx, ky, kz, maxkx, maxky, maxkz;
1910 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1912 real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1913 real ets2, struct2, vfactor, ets2vf;
1914 real d1, d2, energy = 0;
1916 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1917 real rxx, ryx, ryy, rzx, rzy, rzz;
1919 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1920 real mhxk, mhyk, mhzk, m2k;
1923 ivec local_ndata, local_offset, local_size;
1926 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1932 /* Dimensions should be identical for A/B grid, so we just use A here */
1933 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
1939 rxx = pme->recipbox[XX][XX];
1940 ryx = pme->recipbox[YY][XX];
1941 ryy = pme->recipbox[YY][YY];
1942 rzx = pme->recipbox[ZZ][XX];
1943 rzy = pme->recipbox[ZZ][YY];
1944 rzz = pme->recipbox[ZZ][ZZ];
1950 work = &pme->work[thread];
1955 denom = work->denom;
1957 eterm = work->eterm;
1958 m2inv = work->m2inv;
1960 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
1961 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1963 for (iyz = iyz0; iyz < iyz1; iyz++)
1965 iy = iyz/local_ndata[ZZ];
1966 iz = iyz - iy*local_ndata[ZZ];
1968 ky = iy + local_offset[YY];
1979 by = M_PI*vol*pme->bsp_mod[YY][ky];
1981 kz = iz + local_offset[ZZ];
1985 bz = pme->bsp_mod[ZZ][kz];
1987 /* 0.5 correction for corner points */
1989 if (kz == 0 || kz == (nz+1)/2)
1994 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1996 /* We should skip the k-space point (0,0,0) */
1997 /* Note that since here x is the minor index, local_offset[XX]=0 */
1998 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
2000 kxstart = local_offset[XX];
2004 kxstart = local_offset[XX] + 1;
2007 kxend = local_offset[XX] + local_ndata[XX];
2011 /* More expensive inner loop, especially because of the storage
2012 * of the mh elements in array's.
2013 * Because x is the minor grid index, all mh elements
2014 * depend on kx for triclinic unit cells.
2017 /* Two explicit loops to avoid a conditional inside the loop */
2018 for (kx = kxstart; kx < maxkx; kx++)
2023 mhyk = mx * ryx + my * ryy;
2024 mhzk = mx * rzx + my * rzy + mz * rzz;
2025 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2030 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2031 tmp1[kx] = -factor*m2k;
2034 for (kx = maxkx; kx < kxend; kx++)
2039 mhyk = mx * ryx + my * ryy;
2040 mhzk = mx * rzx + my * rzy + mz * rzz;
2041 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2046 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2047 tmp1[kx] = -factor*m2k;
2050 for (kx = kxstart; kx < kxend; kx++)
2052 m2inv[kx] = 1.0/m2[kx];
2055 calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2057 for (kx = kxstart; kx < kxend; kx++, p0++)
2062 p0->re = d1*eterm[kx];
2063 p0->im = d2*eterm[kx];
2065 struct2 = 2.0*(d1*d1+d2*d2);
2067 tmp1[kx] = eterm[kx]*struct2;
2070 for (kx = kxstart; kx < kxend; kx++)
2072 ets2 = corner_fac*tmp1[kx];
2073 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2076 ets2vf = ets2*vfactor;
2077 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2078 virxy += ets2vf*mhx[kx]*mhy[kx];
2079 virxz += ets2vf*mhx[kx]*mhz[kx];
2080 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2081 viryz += ets2vf*mhy[kx]*mhz[kx];
2082 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2087 /* We don't need to calculate the energy and the virial.
2088 * In this case the triclinic overhead is small.
2091 /* Two explicit loops to avoid a conditional inside the loop */
2093 for (kx = kxstart; kx < maxkx; kx++)
2098 mhyk = mx * ryx + my * ryy;
2099 mhzk = mx * rzx + my * rzy + mz * rzz;
2100 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2101 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2102 tmp1[kx] = -factor*m2k;
2105 for (kx = maxkx; kx < kxend; kx++)
2110 mhyk = mx * ryx + my * ryy;
2111 mhzk = mx * rzx + my * rzy + mz * rzz;
2112 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2113 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2114 tmp1[kx] = -factor*m2k;
2117 calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2119 for (kx = kxstart; kx < kxend; kx++, p0++)
2124 p0->re = d1*eterm[kx];
2125 p0->im = d2*eterm[kx];
2132 /* Update virial with local values.
2133 * The virial is symmetric by definition.
2134 * this virial seems ok for isotropic scaling, but I'm
2135 * experiencing problems on semiisotropic membranes.
2136 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2138 work->vir_q[XX][XX] = 0.25*virxx;
2139 work->vir_q[YY][YY] = 0.25*viryy;
2140 work->vir_q[ZZ][ZZ] = 0.25*virzz;
2141 work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
2142 work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
2143 work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
2145 /* This energy should be corrected for a charged system */
2146 work->energy_q = 0.5*energy;
2149 /* Return the loop count */
2150 return local_ndata[YY]*local_ndata[XX];
2153 static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
2154 real ewaldcoeff, real vol,
2155 gmx_bool bEnerVir, int nthread, int thread)
2157 /* do recip sum over local cells in grid */
2158 /* y major, z middle, x minor or continuous */
2160 int kx, ky, kz, maxkx, maxky, maxkz;
2161 int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
2163 real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
2165 real eterm, vterm, d1, d2, energy = 0;
2167 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
2168 real rxx, ryx, ryy, rzx, rzy, rzz;
2169 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
2170 real mhxk, mhyk, mhzk, m2k;
2175 ivec local_ndata, local_offset, local_size;
2180 /* Dimensions should be identical for A/B grid, so we just use A here */
2181 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
2186 rxx = pme->recipbox[XX][XX];
2187 ryx = pme->recipbox[YY][XX];
2188 ryy = pme->recipbox[YY][YY];
2189 rzx = pme->recipbox[ZZ][XX];
2190 rzy = pme->recipbox[ZZ][YY];
2191 rzz = pme->recipbox[ZZ][ZZ];
2197 work = &pme->work[thread];
2202 denom = work->denom;
2206 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
2207 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
2209 for (iyz = iyz0; iyz < iyz1; iyz++)
2211 iy = iyz/local_ndata[ZZ];
2212 iz = iyz - iy*local_ndata[ZZ];
2214 ky = iy + local_offset[YY];
2225 by = 3.0*vol*pme->bsp_mod[YY][ky]
2226 / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
2228 kz = iz + local_offset[ZZ];
2232 bz = pme->bsp_mod[ZZ][kz];
2234 /* 0.5 correction for corner points */
2236 if (kz == 0 || kz == (nz+1)/2)
2241 kxstart = local_offset[XX];
2242 kxend = local_offset[XX] + local_ndata[XX];
2245 /* More expensive inner loop, especially because of the
2246 * storage of the mh elements in array's. Because x is the
2247 * minor grid index, all mh elements depend on kx for
2248 * triclinic unit cells.
2251 /* Two explicit loops to avoid a conditional inside the loop */
2252 for (kx = kxstart; kx < maxkx; kx++)
2257 mhyk = mx * ryx + my * ryy;
2258 mhzk = mx * rzx + my * rzy + mz * rzz;
2259 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2264 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2265 tmp1[kx] = -factor*m2k;
2266 tmp2[kx] = sqrt(factor*m2k);
2269 for (kx = maxkx; kx < kxend; kx++)
2274 mhyk = mx * ryx + my * ryy;
2275 mhzk = mx * rzx + my * rzy + mz * rzz;
2276 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2281 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2282 tmp1[kx] = -factor*m2k;
2283 tmp2[kx] = sqrt(factor*m2k);
2286 calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2288 for (kx = kxstart; kx < kxend; kx++)
2290 m2k = factor*m2[kx];
2291 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2292 + 2.0*m2k*tmp2[kx]);
2293 vterm = 3.0*(-tmp1[kx] + tmp2[kx]);
2294 tmp1[kx] = eterm*denom[kx];
2295 tmp2[kx] = vterm*denom[kx];
2303 p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2304 for (kx = kxstart; kx < kxend; kx++, p0++)
2314 struct2 = 2.0*(d1*d1+d2*d2);
2316 tmp1[kx] = eterm*struct2;
2317 tmp2[kx] = vterm*struct2;
2322 real *struct2 = denom;
2325 for (kx = kxstart; kx < kxend; kx++)
2329 /* Due to symmetry we only need to calculate 4 of the 7 terms */
2330 for (ig = 0; ig <= 3; ++ig)
2335 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2336 p1 = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2337 scale = 2.0*lb_scale_factor_symm[ig];
2338 for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
2340 struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
2344 for (ig = 0; ig <= 6; ++ig)
2348 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2349 for (kx = kxstart; kx < kxend; kx++, p0++)
2359 for (kx = kxstart; kx < kxend; kx++)
2364 tmp1[kx] = eterm*str2;
2365 tmp2[kx] = vterm*str2;
2369 for (kx = kxstart; kx < kxend; kx++)
2371 ets2 = corner_fac*tmp1[kx];
2372 vterm = 2.0*factor*tmp2[kx];
2374 ets2vf = corner_fac*vterm;
2375 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2376 virxy += ets2vf*mhx[kx]*mhy[kx];
2377 virxz += ets2vf*mhx[kx]*mhz[kx];
2378 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2379 viryz += ets2vf*mhy[kx]*mhz[kx];
2380 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2385 /* We don't need to calculate the energy and the virial.
2386 * In this case the triclinic overhead is small.
2389 /* Two explicit loops to avoid a conditional inside the loop */
2391 for (kx = kxstart; kx < maxkx; kx++)
2396 mhyk = mx * ryx + my * ryy;
2397 mhzk = mx * rzx + my * rzy + mz * rzz;
2398 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2400 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2401 tmp1[kx] = -factor*m2k;
2402 tmp2[kx] = sqrt(factor*m2k);
2405 for (kx = maxkx; kx < kxend; kx++)
2410 mhyk = mx * ryx + my * ryy;
2411 mhzk = mx * rzx + my * rzy + mz * rzz;
2412 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2414 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2415 tmp1[kx] = -factor*m2k;
2416 tmp2[kx] = sqrt(factor*m2k);
2419 calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2421 for (kx = kxstart; kx < kxend; kx++)
2423 m2k = factor*m2[kx];
2424 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2425 + 2.0*m2k*tmp2[kx]);
2426 tmp1[kx] = eterm*denom[kx];
2428 gcount = (bLB ? 7 : 1);
2429 for (ig = 0; ig < gcount; ++ig)
2433 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2434 for (kx = kxstart; kx < kxend; kx++, p0++)
2449 work->vir_lj[XX][XX] = 0.25*virxx;
2450 work->vir_lj[YY][YY] = 0.25*viryy;
2451 work->vir_lj[ZZ][ZZ] = 0.25*virzz;
2452 work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
2453 work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
2454 work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
2456 /* This energy should be corrected for a charged system */
2457 work->energy_lj = 0.5*energy;
2459 /* Return the loop count */
2460 return local_ndata[YY]*local_ndata[XX];
2463 static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
2464 real *mesh_energy, matrix vir)
2466 /* This function sums output over threads and should therefore
2467 * only be called after thread synchronization.
2471 *mesh_energy = pme->work[0].energy_q;
2472 copy_mat(pme->work[0].vir_q, vir);
2474 for (thread = 1; thread < nthread; thread++)
2476 *mesh_energy += pme->work[thread].energy_q;
2477 m_add(vir, pme->work[thread].vir_q, vir);
2481 static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
2482 real *mesh_energy, matrix vir)
2484 /* This function sums output over threads and should therefore
2485 * only be called after thread synchronization.
2489 *mesh_energy = pme->work[0].energy_lj;
2490 copy_mat(pme->work[0].vir_lj, vir);
2492 for (thread = 1; thread < nthread; thread++)
2494 *mesh_energy += pme->work[thread].energy_lj;
2495 m_add(vir, pme->work[thread].vir_lj, vir);
2500 #define DO_FSPLINE(order) \
2501 for (ithx = 0; (ithx < order); ithx++) \
2503 index_x = (i0+ithx)*pny*pnz; \
2507 for (ithy = 0; (ithy < order); ithy++) \
2509 index_xy = index_x+(j0+ithy)*pnz; \
2514 for (ithz = 0; (ithz < order); ithz++) \
2516 gval = grid[index_xy+(k0+ithz)]; \
2517 fxy1 += thz[ithz]*gval; \
2518 fz1 += dthz[ithz]*gval; \
2527 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2528 gmx_bool bClearF, pme_atomcomm_t *atc,
2529 splinedata_t *spline,
2532 /* sum forces for local particles */
2533 int nn, n, ithx, ithy, ithz, i0, j0, k0;
2534 int index_x, index_xy;
2535 int nx, ny, nz, pnx, pny, pnz;
2537 real tx, ty, dx, dy, coefficient;
2538 real fx, fy, fz, gval;
2540 real *thx, *thy, *thz, *dthx, *dthy, *dthz;
2542 real rxx, ryx, ryy, rzx, rzy, rzz;
2545 pme_spline_work_t *work;
2547 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
2548 real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
2549 real dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
2551 thz_aligned = gmx_simd4_align_r(thz_buffer);
2552 dthz_aligned = gmx_simd4_align_r(dthz_buffer);
2555 work = pme->spline_work;
2557 order = pme->pme_order;
2558 thx = spline->theta[XX];
2559 thy = spline->theta[YY];
2560 thz = spline->theta[ZZ];
2561 dthx = spline->dtheta[XX];
2562 dthy = spline->dtheta[YY];
2563 dthz = spline->dtheta[ZZ];
2567 pnx = pme->pmegrid_nx;
2568 pny = pme->pmegrid_ny;
2569 pnz = pme->pmegrid_nz;
2571 rxx = pme->recipbox[XX][XX];
2572 ryx = pme->recipbox[YY][XX];
2573 ryy = pme->recipbox[YY][YY];
2574 rzx = pme->recipbox[ZZ][XX];
2575 rzy = pme->recipbox[ZZ][YY];
2576 rzz = pme->recipbox[ZZ][ZZ];
2578 for (nn = 0; nn < spline->n; nn++)
2580 n = spline->ind[nn];
2581 coefficient = scale*atc->coefficient[n];
2589 if (coefficient != 0)
2594 idxptr = atc->idx[n];
2601 /* Pointer arithmetic alert, next six statements */
2602 thx = spline->theta[XX] + norder;
2603 thy = spline->theta[YY] + norder;
2604 thz = spline->theta[ZZ] + norder;
2605 dthx = spline->dtheta[XX] + norder;
2606 dthy = spline->dtheta[YY] + norder;
2607 dthz = spline->dtheta[ZZ] + norder;
2612 #ifdef PME_SIMD4_SPREAD_GATHER
2613 #ifdef PME_SIMD4_UNALIGNED
2614 #define PME_GATHER_F_SIMD4_ORDER4
2616 #define PME_GATHER_F_SIMD4_ALIGNED
2619 #include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
2625 #ifdef PME_SIMD4_SPREAD_GATHER
2626 #define PME_GATHER_F_SIMD4_ALIGNED
2628 #include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
2638 atc->f[n][XX] += -coefficient*( fx*nx*rxx );
2639 atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
2640 atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2643 /* Since the energy and not forces are interpolated
2644 * the net force might not be exactly zero.
2645 * This can be solved by also interpolating F, but
2646 * that comes at a cost.
2647 * A better hack is to remove the net force every
2648 * step, but that must be done at a higher level
2649 * since this routine doesn't see all atoms if running
2650 * in parallel. Don't know how important it is? EL 990726
2655 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2656 pme_atomcomm_t *atc)
2658 splinedata_t *spline;
2659 int n, ithx, ithy, ithz, i0, j0, k0;
2660 int index_x, index_xy;
2662 real energy, pot, tx, ty, coefficient, gval;
2663 real *thx, *thy, *thz;
2667 spline = &atc->spline[0];
2669 order = pme->pme_order;
2672 for (n = 0; (n < atc->n); n++)
2674 coefficient = atc->coefficient[n];
2676 if (coefficient != 0)
2678 idxptr = atc->idx[n];
2685 /* Pointer arithmetic alert, next three statements */
2686 thx = spline->theta[XX] + norder;
2687 thy = spline->theta[YY] + norder;
2688 thz = spline->theta[ZZ] + norder;
2691 for (ithx = 0; (ithx < order); ithx++)
2693 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2696 for (ithy = 0; (ithy < order); ithy++)
2698 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2701 for (ithz = 0; (ithz < order); ithz++)
2703 gval = grid[index_xy+(k0+ithz)];
2704 pot += tx*ty*thz[ithz]*gval;
2710 energy += pot*coefficient;
2717 /* Macro to force loop unrolling by fixing order.
2718 * This gives a significant performance gain.
2720 #define CALC_SPLINE(order) \
2724 real data[PME_ORDER_MAX]; \
2725 real ddata[PME_ORDER_MAX]; \
2727 for (j = 0; (j < DIM); j++) \
2731 /* dr is relative offset from lower cell limit */ \
2732 data[order-1] = 0; \
2736 for (k = 3; (k < order); k++) \
2738 div = 1.0/(k - 1.0); \
2739 data[k-1] = div*dr*data[k-2]; \
2740 for (l = 1; (l < (k-1)); l++) \
2742 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2745 data[0] = div*(1-dr)*data[0]; \
2747 /* differentiate */ \
2748 ddata[0] = -data[0]; \
2749 for (k = 1; (k < order); k++) \
2751 ddata[k] = data[k-1] - data[k]; \
2754 div = 1.0/(order - 1); \
2755 data[order-1] = div*dr*data[order-2]; \
2756 for (l = 1; (l < (order-1)); l++) \
2758 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2759 (order-l-dr)*data[order-l-1]); \
2761 data[0] = div*(1 - dr)*data[0]; \
2763 for (k = 0; k < order; k++) \
2765 theta[j][i*order+k] = data[k]; \
2766 dtheta[j][i*order+k] = ddata[k]; \
2771 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2772 rvec fractx[], int nr, int ind[], real coefficient[],
2773 gmx_bool bDoSplines)
2775 /* construct splines for local atoms */
2779 for (i = 0; i < nr; i++)
2781 /* With free energy we do not use the coefficient check.
2782 * In most cases this will be more efficient than calling make_bsplines
2783 * twice, since usually more than half the particles have non-zero coefficients.
2786 if (bDoSplines || coefficient[ii] != 0.0)
2791 case 4: CALC_SPLINE(4); break;
2792 case 5: CALC_SPLINE(5); break;
2793 default: CALC_SPLINE(order); break;
2800 void make_dft_mod(real *mod, real *data, int ndata)
2805 for (i = 0; i < ndata; i++)
2808 for (j = 0; j < ndata; j++)
2810 arg = (2.0*M_PI*i*j)/ndata;
2811 sc += data[j]*cos(arg);
2812 ss += data[j]*sin(arg);
2814 mod[i] = sc*sc+ss*ss;
2816 for (i = 0; i < ndata; i++)
2820 mod[i] = (mod[i-1]+mod[i+1])*0.5;
2826 static void make_bspline_moduli(splinevec bsp_mod,
2827 int nx, int ny, int nz, int order)
2829 int nmax = max(nx, max(ny, nz));
2830 real *data, *ddata, *bsp_data;
2836 snew(bsp_data, nmax);
2842 for (k = 3; k < order; k++)
2846 for (l = 1; l < (k-1); l++)
2848 data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2850 data[0] = div*data[0];
2853 ddata[0] = -data[0];
2854 for (k = 1; k < order; k++)
2856 ddata[k] = data[k-1]-data[k];
2858 div = 1.0/(order-1);
2860 for (l = 1; l < (order-1); l++)
2862 data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2864 data[0] = div*data[0];
2866 for (i = 0; i < nmax; i++)
2870 for (i = 1; i <= order; i++)
2872 bsp_data[i] = data[i-1];
2875 make_dft_mod(bsp_mod[XX], bsp_data, nx);
2876 make_dft_mod(bsp_mod[YY], bsp_data, ny);
2877 make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2885 /* Return the P3M optimal influence function */
2886 static double do_p3m_influence(double z, int order)
2893 /* The formula and most constants can be found in:
2894 * Ballenegger et al., JCTC 8, 936 (2012)
2899 return 1.0 - 2.0*z2/3.0;
2902 return 1.0 - z2 + 2.0*z4/15.0;
2905 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2908 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2911 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2914 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2916 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2923 /* Calculate the P3M B-spline moduli for one dimension */
2924 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2926 double zarg, zai, sinzai, infl;
2931 gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2938 for (i = -maxk; i < 0; i++)
2942 infl = do_p3m_influence(sinzai, order);
2943 bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2946 for (i = 1; i < maxk; i++)
2950 infl = do_p3m_influence(sinzai, order);
2951 bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2955 /* Calculate the P3M B-spline moduli */
2956 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2957 int nx, int ny, int nz, int order)
2959 make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2960 make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2961 make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
2965 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2973 for (i = 1; i <= nslab/2; i++)
2975 fw = (atc->nodeid + i) % nslab;
2976 bw = (atc->nodeid - i + nslab) % nslab;
2979 atc->node_dest[n] = fw;
2980 atc->node_src[n] = bw;
2985 atc->node_dest[n] = bw;
2986 atc->node_src[n] = fw;
2992 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2998 fprintf(log, "Destroying PME data structures.\n");
3001 sfree((*pmedata)->nnx);
3002 sfree((*pmedata)->nny);
3003 sfree((*pmedata)->nnz);
3005 for (i = 0; i < (*pmedata)->ngrids; ++i)
3007 pmegrids_destroy(&(*pmedata)->pmegrid[i]);
3008 sfree((*pmedata)->fftgrid[i]);
3009 sfree((*pmedata)->cfftgrid[i]);
3010 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
3013 sfree((*pmedata)->lb_buf1);
3014 sfree((*pmedata)->lb_buf2);
3016 for (thread = 0; thread < (*pmedata)->nthread; thread++)
3018 free_work(&(*pmedata)->work[thread]);
3020 sfree((*pmedata)->work);
3028 static int mult_up(int n, int f)
3030 return ((n + f - 1)/f)*f;
3034 static double pme_load_imbalance(gmx_pme_t pme)
3039 nma = pme->nnodes_major;
3040 nmi = pme->nnodes_minor;
3042 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
3043 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
3044 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
3046 /* pme_solve is roughly double the cost of an fft */
3048 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
3051 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
3052 int dimind, gmx_bool bSpread)
3054 int nk, k, s, thread;
3056 atc->dimind = dimind;
3061 if (pme->nnodes > 1)
3063 atc->mpi_comm = pme->mpi_comm_d[dimind];
3064 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
3065 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
3069 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
3073 atc->bSpread = bSpread;
3074 atc->pme_order = pme->pme_order;
3078 snew(atc->node_dest, atc->nslab);
3079 snew(atc->node_src, atc->nslab);
3080 setup_coordinate_communication(atc);
3082 snew(atc->count_thread, pme->nthread);
3083 for (thread = 0; thread < pme->nthread; thread++)
3085 snew(atc->count_thread[thread], atc->nslab);
3087 atc->count = atc->count_thread[0];
3088 snew(atc->rcount, atc->nslab);
3089 snew(atc->buf_index, atc->nslab);
3092 atc->nthread = pme->nthread;
3093 if (atc->nthread > 1)
3095 snew(atc->thread_plist, atc->nthread);
3097 snew(atc->spline, atc->nthread);
3098 for (thread = 0; thread < atc->nthread; thread++)
3100 if (atc->nthread > 1)
3102 snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
3103 atc->thread_plist[thread].n += GMX_CACHE_SEP;
3105 snew(atc->spline[thread].thread_one, pme->nthread);
3106 atc->spline[thread].thread_one[thread] = 1;
3111 init_overlap_comm(pme_overlap_t * ol,
3121 int lbnd, rbnd, maxlr, b, i;
3124 pme_grid_comm_t *pgc;
3126 int fft_start, fft_end, send_index1, recv_index1;
3130 ol->mpi_comm = comm;
3133 ol->nnodes = nnodes;
3134 ol->nodeid = nodeid;
3136 /* Linear translation of the PME grid won't affect reciprocal space
3137 * calculations, so to optimize we only interpolate "upwards",
3138 * which also means we only have to consider overlap in one direction.
3139 * I.e., particles on this node might also be spread to grid indices
3140 * that belong to higher nodes (modulo nnodes)
3143 snew(ol->s2g0, ol->nnodes+1);
3144 snew(ol->s2g1, ol->nnodes);
3147 fprintf(debug, "PME slab boundaries:");
3149 for (i = 0; i < nnodes; i++)
3151 /* s2g0 the local interpolation grid start.
3152 * s2g1 the local interpolation grid end.
3153 * Since in calc_pidx we divide particles, and not grid lines,
3154 * spatially uniform along dimension x or y, we need to round
3155 * s2g0 down and s2g1 up.
3157 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
3158 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
3162 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
3165 ol->s2g0[nnodes] = ndata;
3168 fprintf(debug, "\n");
3171 /* Determine with how many nodes we need to communicate the grid overlap */
3177 for (i = 0; i < nnodes; i++)
3179 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
3180 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
3186 while (bCont && b < nnodes);
3187 ol->noverlap_nodes = b - 1;
3189 snew(ol->send_id, ol->noverlap_nodes);
3190 snew(ol->recv_id, ol->noverlap_nodes);
3191 for (b = 0; b < ol->noverlap_nodes; b++)
3193 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
3194 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
3196 snew(ol->comm_data, ol->noverlap_nodes);
3199 for (b = 0; b < ol->noverlap_nodes; b++)
3201 pgc = &ol->comm_data[b];
3203 fft_start = ol->s2g0[ol->send_id[b]];
3204 fft_end = ol->s2g0[ol->send_id[b]+1];
3205 if (ol->send_id[b] < nodeid)
3210 send_index1 = ol->s2g1[nodeid];
3211 send_index1 = min(send_index1, fft_end);
3212 pgc->send_index0 = fft_start;
3213 pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
3214 ol->send_size += pgc->send_nindex;
3216 /* We always start receiving to the first index of our slab */
3217 fft_start = ol->s2g0[ol->nodeid];
3218 fft_end = ol->s2g0[ol->nodeid+1];
3219 recv_index1 = ol->s2g1[ol->recv_id[b]];
3220 if (ol->recv_id[b] > nodeid)
3222 recv_index1 -= ndata;
3224 recv_index1 = min(recv_index1, fft_end);
3225 pgc->recv_index0 = fft_start;
3226 pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
3230 /* Communicate the buffer sizes to receive */
3231 for (b = 0; b < ol->noverlap_nodes; b++)
3233 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
3234 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
3235 ol->mpi_comm, &stat);
3239 /* For non-divisible grid we need pme_order iso pme_order-1 */
3240 snew(ol->sendbuf, norder*commplainsize);
3241 snew(ol->recvbuf, norder*commplainsize);
3245 make_gridindex5_to_localindex(int n, int local_start, int local_range,
3246 int **global_to_local,
3247 real **fraction_shift)
3255 for (i = 0; (i < 5*n); i++)
3257 /* Determine the global to local grid index */
3258 gtl[i] = (i - local_start + n) % n;
3259 /* For coordinates that fall within the local grid the fraction
3260 * is correct, we don't need to shift it.
3263 if (local_range < n)
3265 /* Due to rounding issues i could be 1 beyond the lower or
3266 * upper boundary of the local grid. Correct the index for this.
3267 * If we shift the index, we need to shift the fraction by
3268 * the same amount in the other direction to not affect
3270 * Note that due to this shifting the weights at the end of
3271 * the spline might change, but that will only involve values
3272 * between zero and values close to the precision of a real,
3273 * which is anyhow the accuracy of the whole mesh calculation.
3275 /* With local_range=0 we should not change i=local_start */
3276 if (i % n != local_start)
3283 else if (gtl[i] == local_range)
3285 gtl[i] = local_range - 1;
3292 *global_to_local = gtl;
3293 *fraction_shift = fsh;
3296 static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
3298 pme_spline_work_t *work;
3300 #ifdef PME_SIMD4_SPREAD_GATHER
3301 real tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
3302 gmx_simd4_real_t zero_S;
3303 gmx_simd4_real_t real_mask_S0, real_mask_S1;
3306 snew_aligned(work, 1, SIMD4_ALIGNMENT);
3308 tmp_aligned = gmx_simd4_align_r(tmp);
3310 zero_S = gmx_simd4_setzero_r();
3312 /* Generate bit masks to mask out the unused grid entries,
3313 * as we only operate on order of the 8 grid entries that are
3314 * load into 2 SIMD registers.
3316 for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
3318 for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
3320 tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
3322 real_mask_S0 = gmx_simd4_load_r(tmp_aligned);
3323 real_mask_S1 = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
3324 work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
3325 work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
3334 void gmx_pme_check_restrictions(int pme_order,
3335 int nkx, int nky, int nkz,
3338 gmx_bool bUseThreads,
3340 gmx_bool *bValidSettings)
3342 if (pme_order > PME_ORDER_MAX)
3346 *bValidSettings = FALSE;
3349 gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3350 pme_order, PME_ORDER_MAX);
3353 if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
3354 nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
3359 *bValidSettings = FALSE;
3362 gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3366 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3367 * We only allow multiple communication pulses in dim 1, not in dim 0.
3369 if (bUseThreads && (nkx < nnodes_major*pme_order &&
3370 nkx != nnodes_major*(pme_order - 1)))
3374 *bValidSettings = FALSE;
3377 gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
3378 nkx/(double)nnodes_major, pme_order);
3381 if (bValidSettings != NULL)
3383 *bValidSettings = TRUE;
3389 int gmx_pme_init(gmx_pme_t * pmedata,
3395 gmx_bool bFreeEnergy_q,
3396 gmx_bool bFreeEnergy_lj,
3397 gmx_bool bReproducible,
3400 gmx_pme_t pme = NULL;
3402 int use_threads, sum_use_threads, i;
3407 fprintf(debug, "Creating PME data structures.\n");
3411 pme->sum_qgrid_tmp = NULL;
3412 pme->sum_qgrid_dd_tmp = NULL;
3413 pme->buf_nalloc = 0;
3416 pme->bPPnode = TRUE;
3418 pme->nnodes_major = nnodes_major;
3419 pme->nnodes_minor = nnodes_minor;
3422 if (nnodes_major*nnodes_minor > 1)
3424 pme->mpi_comm = cr->mpi_comm_mygroup;
3426 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3427 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3428 if (pme->nnodes != nnodes_major*nnodes_minor)
3430 gmx_incons("PME rank count mismatch");
3435 pme->mpi_comm = MPI_COMM_NULL;
3439 if (pme->nnodes == 1)
3442 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3443 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3445 pme->ndecompdim = 0;
3446 pme->nodeid_major = 0;
3447 pme->nodeid_minor = 0;
3449 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3454 if (nnodes_minor == 1)
3457 pme->mpi_comm_d[0] = pme->mpi_comm;
3458 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3460 pme->ndecompdim = 1;
3461 pme->nodeid_major = pme->nodeid;
3462 pme->nodeid_minor = 0;
3465 else if (nnodes_major == 1)
3468 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3469 pme->mpi_comm_d[1] = pme->mpi_comm;
3471 pme->ndecompdim = 1;
3472 pme->nodeid_major = 0;
3473 pme->nodeid_minor = pme->nodeid;
3477 if (pme->nnodes % nnodes_major != 0)
3479 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
3481 pme->ndecompdim = 2;
3484 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3485 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
3486 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3487 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3489 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3490 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3491 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3492 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3495 pme->bPPnode = (cr->duty & DUTY_PP);
3498 pme->nthread = nthread;
3500 /* Check if any of the PME MPI ranks uses threads */
3501 use_threads = (pme->nthread > 1 ? 1 : 0);
3503 if (pme->nnodes > 1)
3505 MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
3506 MPI_SUM, pme->mpi_comm);
3511 sum_use_threads = use_threads;
3513 pme->bUseThreads = (sum_use_threads > 0);
3515 if (ir->ePBC == epbcSCREW)
3517 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3520 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
3521 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
3522 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
3526 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3527 pme->pme_order = ir->pme_order;
3529 /* Always constant electrostatics coefficients */
3530 pme->epsilon_r = ir->epsilon_r;
3532 /* Always constant LJ coefficients */
3533 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
3535 /* If we violate restrictions, generate a fatal error here */
3536 gmx_pme_check_restrictions(pme->pme_order,
3537 pme->nkx, pme->nky, pme->nkz,
3544 if (pme->nnodes > 1)
3549 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3550 MPI_Type_commit(&(pme->rvec_mpi));
3553 /* Note that the coefficient spreading and force gathering, which usually
3554 * takes about the same amount of time as FFT+solve_pme,
3555 * is always fully load balanced
3556 * (unless the coefficient distribution is inhomogeneous).
3559 imbal = pme_load_imbalance(pme);
3560 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3564 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3565 " For optimal PME load balancing\n"
3566 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
3567 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
3569 (int)((imbal-1)*100 + 0.5),
3570 pme->nkx, pme->nky, pme->nnodes_major,
3571 pme->nky, pme->nkz, pme->nnodes_minor);
3575 /* For non-divisible grid we need pme_order iso pme_order-1 */
3576 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3577 * y is always copied through a buffer: we don't need padding in z,
3578 * but we do need the overlap in x because of the communication order.
3580 init_overlap_comm(&pme->overlap[0], pme->pme_order,
3584 pme->nnodes_major, pme->nodeid_major,
3586 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3588 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3589 * We do this with an offset buffer of equal size, so we need to allocate
3590 * extra for the offset. That's what the (+1)*pme->nkz is for.
3592 init_overlap_comm(&pme->overlap[1], pme->pme_order,
3596 pme->nnodes_minor, pme->nodeid_minor,
3598 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3600 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
3601 * Note that gmx_pme_check_restrictions checked for this already.
3603 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
3605 gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
3608 snew(pme->bsp_mod[XX], pme->nkx);
3609 snew(pme->bsp_mod[YY], pme->nky);
3610 snew(pme->bsp_mod[ZZ], pme->nkz);
3612 /* The required size of the interpolation grid, including overlap.
3613 * The allocated size (pmegrid_n?) might be slightly larger.
3615 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3616 pme->overlap[0].s2g0[pme->nodeid_major];
3617 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3618 pme->overlap[1].s2g0[pme->nodeid_minor];
3619 pme->pmegrid_nz_base = pme->nkz;
3620 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3621 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3623 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3624 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3625 pme->pmegrid_start_iz = 0;
3627 make_gridindex5_to_localindex(pme->nkx,
3628 pme->pmegrid_start_ix,
3629 pme->pmegrid_nx - (pme->pme_order-1),
3630 &pme->nnx, &pme->fshx);
3631 make_gridindex5_to_localindex(pme->nky,
3632 pme->pmegrid_start_iy,
3633 pme->pmegrid_ny - (pme->pme_order-1),
3634 &pme->nny, &pme->fshy);
3635 make_gridindex5_to_localindex(pme->nkz,
3636 pme->pmegrid_start_iz,
3637 pme->pmegrid_nz_base,
3638 &pme->nnz, &pme->fshz);
3640 pme->spline_work = make_pme_spline_work(pme->pme_order);
3642 ndata[0] = pme->nkx;
3643 ndata[1] = pme->nky;
3644 ndata[2] = pme->nkz;
3645 /* It doesn't matter if we allocate too many grids here,
3646 * we only allocate and use the ones we need.
3648 if (EVDW_PME(ir->vdwtype))
3650 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
3656 snew(pme->fftgrid, pme->ngrids);
3657 snew(pme->cfftgrid, pme->ngrids);
3658 snew(pme->pfft_setup, pme->ngrids);
3660 for (i = 0; i < pme->ngrids; ++i)
3662 if ((i < DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
3664 (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
3666 ir->ljpme_combination_rule == eljpmeLB)))
3668 pmegrids_init(&pme->pmegrid[i],
3669 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3670 pme->pmegrid_nz_base,
3674 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3675 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3676 /* This routine will allocate the grid data to fit the FFTs */
3677 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
3678 &pme->fftgrid[i], &pme->cfftgrid[i],
3680 bReproducible, pme->nthread);
3687 /* Use plain SPME B-spline interpolation */
3688 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3692 /* Use the P3M grid-optimized influence function */
3693 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3696 /* Use atc[0] for spreading */
3697 init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
3698 if (pme->ndecompdim >= 2)
3700 init_atomcomm(pme, &pme->atc[1], 1, FALSE);
3703 if (pme->nnodes == 1)
3705 pme->atc[0].n = homenr;
3706 pme_realloc_atomcomm_things(&pme->atc[0]);
3709 pme->lb_buf1 = NULL;
3710 pme->lb_buf2 = NULL;
3711 pme->lb_buf_nalloc = 0;
3716 /* Use fft5d, order after FFT is y major, z, x minor */
3718 snew(pme->work, pme->nthread);
3719 for (thread = 0; thread < pme->nthread; thread++)
3721 realloc_work(&pme->work[thread], pme->nkx);
3730 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3734 for (d = 0; d < DIM; d++)
3736 if (new->grid.n[d] > old->grid.n[d])
3742 sfree_aligned(new->grid.grid);
3743 new->grid.grid = old->grid.grid;
3745 if (new->grid_th != NULL && new->nthread == old->nthread)
3747 sfree_aligned(new->grid_all);
3748 for (t = 0; t < new->nthread; t++)
3750 new->grid_th[t].grid = old->grid_th[t].grid;
3755 int gmx_pme_reinit(gmx_pme_t * pmedata,
3758 const t_inputrec * ir,
3766 irc.nkx = grid_size[XX];
3767 irc.nky = grid_size[YY];
3768 irc.nkz = grid_size[ZZ];
3770 if (pme_src->nnodes == 1)
3772 homenr = pme_src->atc[0].n;
3779 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3780 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
3784 /* We can easily reuse the allocated pme grids in pme_src */
3785 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
3786 /* We would like to reuse the fft grids, but that's harder */
3793 static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
3794 int grid_index, int thread, real *fftgrid)
3796 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3800 int offx, offy, offz, x, y, z, i0, i0t;
3805 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3809 fft_my = local_fft_size[YY];
3810 fft_mz = local_fft_size[ZZ];
3812 pmegrid = &pmegrids->grid_th[thread];
3814 nsx = pmegrid->s[XX];
3815 nsy = pmegrid->s[YY];
3816 nsz = pmegrid->s[ZZ];
3818 for (d = 0; d < DIM; d++)
3820 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3821 local_fft_ndata[d] - pmegrid->offset[d]);
3824 offx = pmegrid->offset[XX];
3825 offy = pmegrid->offset[YY];
3826 offz = pmegrid->offset[ZZ];
3828 /* Directly copy the non-overlapping parts of the local grids.
3829 * This also initializes the full grid.
3831 grid_th = pmegrid->grid;
3832 for (x = 0; x < nf[XX]; x++)
3834 for (y = 0; y < nf[YY]; y++)
3836 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3837 i0t = (x*nsy + y)*nsz;
3838 for (z = 0; z < nf[ZZ]; z++)
3840 fftgrid[i0+z] = grid_th[i0t+z];
3847 reduce_threadgrid_overlap(gmx_pme_t pme,
3848 const pmegrids_t *pmegrids, int thread,
3849 real *fftgrid, real *commbuf_x, real *commbuf_y,
3852 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3853 int fft_nx, fft_ny, fft_nz;
3858 int offx, offy, offz, x, y, z, i0, i0t;
3859 int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3860 gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3861 gmx_bool bCommX, bCommY;
3864 const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3865 const real *grid_th;
3866 real *commbuf = NULL;
3868 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3872 fft_nx = local_fft_ndata[XX];
3873 fft_ny = local_fft_ndata[YY];
3874 fft_nz = local_fft_ndata[ZZ];
3876 fft_my = local_fft_size[YY];
3877 fft_mz = local_fft_size[ZZ];
3879 /* This routine is called when all thread have finished spreading.
3880 * Here each thread sums grid contributions calculated by other threads
3881 * to the thread local grid volume.
3882 * To minimize the number of grid copying operations,
3883 * this routines sums immediately from the pmegrid to the fftgrid.
3886 /* Determine which part of the full node grid we should operate on,
3887 * this is our thread local part of the full grid.
3889 pmegrid = &pmegrids->grid_th[thread];
3891 for (d = 0; d < DIM; d++)
3893 /* Determine up to where our thread needs to copy from the
3894 * thread-local charge spreading grid to the rank-local FFT grid.
3895 * This is up to our spreading grid end minus order-1 and
3896 * not beyond the local FFT grid.
3899 min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3900 local_fft_ndata[d]);
3903 offx = pmegrid->offset[XX];
3904 offy = pmegrid->offset[YY];
3905 offz = pmegrid->offset[ZZ];
3912 /* Now loop over all the thread data blocks that contribute
3913 * to the grid region we (our thread) are operating on.
3915 /* Note that fft_nx/y is equal to the number of grid points
3916 * between the first point of our node grid and the one of the next node.
3918 for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
3920 fx = pmegrid->ci[XX] + sx;
3925 fx += pmegrids->nc[XX];
3927 bCommX = (pme->nnodes_major > 1);
3929 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3930 ox += pmegrid_g->offset[XX];
3931 /* Determine the end of our part of the source grid */
3934 /* Use our thread local source grid and target grid part */
3935 tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
3939 /* Use our thread local source grid and the spreading range */
3940 tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3943 for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
3945 fy = pmegrid->ci[YY] + sy;
3950 fy += pmegrids->nc[YY];
3952 bCommY = (pme->nnodes_minor > 1);
3954 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3955 oy += pmegrid_g->offset[YY];
3956 /* Determine the end of our part of the source grid */
3959 /* Use our thread local source grid and target grid part */
3960 ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
3964 /* Use our thread local source grid and the spreading range */
3965 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3968 for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
3970 fz = pmegrid->ci[ZZ] + sz;
3974 fz += pmegrids->nc[ZZ];
3977 pmegrid_g = &pmegrids->grid_th[fz];
3978 oz += pmegrid_g->offset[ZZ];
3979 tz1 = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
3981 if (sx == 0 && sy == 0 && sz == 0)
3983 /* We have already added our local contribution
3984 * before calling this routine, so skip it here.
3989 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3991 pmegrid_f = &pmegrids->grid_th[thread_f];
3993 grid_th = pmegrid_f->grid;
3995 nsx = pmegrid_f->s[XX];
3996 nsy = pmegrid_f->s[YY];
3997 nsz = pmegrid_f->s[ZZ];
3999 #ifdef DEBUG_PME_REDUCE
4000 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
4001 pme->nodeid, thread, thread_f,
4002 pme->pmegrid_start_ix,
4003 pme->pmegrid_start_iy,
4004 pme->pmegrid_start_iz,
4006 offx-ox, tx1-ox, offx, tx1,
4007 offy-oy, ty1-oy, offy, ty1,
4008 offz-oz, tz1-oz, offz, tz1);
4011 if (!(bCommX || bCommY))
4013 /* Copy from the thread local grid to the node grid */
4014 for (x = offx; x < tx1; x++)
4016 for (y = offy; y < ty1; y++)
4018 i0 = (x*fft_my + y)*fft_mz;
4019 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
4020 for (z = offz; z < tz1; z++)
4022 fftgrid[i0+z] += grid_th[i0t+z];
4029 /* The order of this conditional decides
4030 * where the corner volume gets stored with x+y decomp.
4034 commbuf = commbuf_y;
4035 /* The y-size of the communication buffer is set by
4036 * the overlap of the grid part of our local slab
4037 * with the part starting at the next slab.
4040 pme->overlap[1].s2g1[pme->nodeid_minor] -
4041 pme->overlap[1].s2g0[pme->nodeid_minor+1];
4044 /* We index commbuf modulo the local grid size */
4045 commbuf += buf_my*fft_nx*fft_nz;
4047 bClearBuf = bClearBufXY;
4048 bClearBufXY = FALSE;
4052 bClearBuf = bClearBufY;
4058 commbuf = commbuf_x;
4060 bClearBuf = bClearBufX;
4064 /* Copy to the communication buffer */
4065 for (x = offx; x < tx1; x++)
4067 for (y = offy; y < ty1; y++)
4069 i0 = (x*buf_my + y)*fft_nz;
4070 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
4074 /* First access of commbuf, initialize it */
4075 for (z = offz; z < tz1; z++)
4077 commbuf[i0+z] = grid_th[i0t+z];
4082 for (z = offz; z < tz1; z++)
4084 commbuf[i0+z] += grid_th[i0t+z];
4096 static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
4098 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4099 pme_overlap_t *overlap;
4100 int send_index0, send_nindex;
4105 int send_size_y, recv_size_y;
4106 int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
4107 real *sendptr, *recvptr;
4108 int x, y, z, indg, indb;
4110 /* Note that this routine is only used for forward communication.
4111 * Since the force gathering, unlike the coefficient spreading,
4112 * can be trivially parallelized over the particles,
4113 * the backwards process is much simpler and can use the "old"
4114 * communication setup.
4117 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
4122 if (pme->nnodes_minor > 1)
4124 /* Major dimension */
4125 overlap = &pme->overlap[1];
4127 if (pme->nnodes_major > 1)
4129 size_yx = pme->overlap[0].comm_data[0].send_nindex;
4135 datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
4137 send_size_y = overlap->send_size;
4139 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
4141 send_id = overlap->send_id[ipulse];
4142 recv_id = overlap->recv_id[ipulse];
4144 overlap->comm_data[ipulse].send_index0 -
4145 overlap->comm_data[0].send_index0;
4146 send_nindex = overlap->comm_data[ipulse].send_nindex;
4147 /* We don't use recv_index0, as we always receive starting at 0 */
4148 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
4149 recv_size_y = overlap->comm_data[ipulse].recv_size;
4151 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
4152 recvptr = overlap->recvbuf;
4156 fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
4157 local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
4161 MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
4163 recvptr, recv_size_y*datasize, GMX_MPI_REAL,
4165 overlap->mpi_comm, &stat);
4168 for (x = 0; x < local_fft_ndata[XX]; x++)
4170 for (y = 0; y < recv_nindex; y++)
4172 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
4173 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
4174 for (z = 0; z < local_fft_ndata[ZZ]; z++)
4176 fftgrid[indg+z] += recvptr[indb+z];
4181 if (pme->nnodes_major > 1)
4183 /* Copy from the received buffer to the send buffer for dim 0 */
4184 sendptr = pme->overlap[0].sendbuf;
4185 for (x = 0; x < size_yx; x++)
4187 for (y = 0; y < recv_nindex; y++)
4189 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
4190 indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
4191 for (z = 0; z < local_fft_ndata[ZZ]; z++)
4193 sendptr[indg+z] += recvptr[indb+z];
4201 /* We only support a single pulse here.
4202 * This is not a severe limitation, as this code is only used
4203 * with OpenMP and with OpenMP the (PME) domains can be larger.
4205 if (pme->nnodes_major > 1)
4207 /* Major dimension */
4208 overlap = &pme->overlap[0];
4210 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
4211 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
4215 send_id = overlap->send_id[ipulse];
4216 recv_id = overlap->recv_id[ipulse];
4217 send_nindex = overlap->comm_data[ipulse].send_nindex;
4218 /* We don't use recv_index0, as we always receive starting at 0 */
4219 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
4221 sendptr = overlap->sendbuf;
4222 recvptr = overlap->recvbuf;
4226 fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
4227 send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
4231 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
4233 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
4235 overlap->mpi_comm, &stat);
4238 for (x = 0; x < recv_nindex; x++)
4240 for (y = 0; y < local_fft_ndata[YY]; y++)
4242 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
4243 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
4244 for (z = 0; z < local_fft_ndata[ZZ]; z++)
4246 fftgrid[indg+z] += recvptr[indb+z];
4254 static void spread_on_grid(gmx_pme_t pme,
4255 pme_atomcomm_t *atc, pmegrids_t *grids,
4256 gmx_bool bCalcSplines, gmx_bool bSpread,
4257 real *fftgrid, gmx_bool bDoSplines, int grid_index)
4259 int nthread, thread;
4260 #ifdef PME_TIME_THREADS
4261 gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
4262 static double cs1 = 0, cs2 = 0, cs3 = 0;
4263 static double cs1a[6] = {0, 0, 0, 0, 0, 0};
4267 nthread = pme->nthread;
4268 assert(nthread > 0);
4270 #ifdef PME_TIME_THREADS
4271 c1 = omp_cyc_start();
4275 #pragma omp parallel for num_threads(nthread) schedule(static)
4276 for (thread = 0; thread < nthread; thread++)
4280 start = atc->n* thread /nthread;
4281 end = atc->n*(thread+1)/nthread;
4283 /* Compute fftgrid index for all atoms,
4284 * with help of some extra variables.
4286 calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
4289 #ifdef PME_TIME_THREADS
4290 c1 = omp_cyc_end(c1);
4294 #ifdef PME_TIME_THREADS
4295 c2 = omp_cyc_start();
4297 #pragma omp parallel for num_threads(nthread) schedule(static)
4298 for (thread = 0; thread < nthread; thread++)
4300 splinedata_t *spline;
4301 pmegrid_t *grid = NULL;
4303 /* make local bsplines */
4304 if (grids == NULL || !pme->bUseThreads)
4306 spline = &atc->spline[0];
4312 grid = &grids->grid;
4317 spline = &atc->spline[thread];
4319 if (grids->nthread == 1)
4321 /* One thread, we operate on all coefficients */
4326 /* Get the indices our thread should operate on */
4327 make_thread_local_ind(atc, thread, spline);
4330 grid = &grids->grid_th[thread];
4335 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
4336 atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
4341 /* put local atoms on grid. */
4342 #ifdef PME_TIME_SPREAD
4343 ct1a = omp_cyc_start();
4345 spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
4347 if (pme->bUseThreads)
4349 copy_local_grid(pme, grids, grid_index, thread, fftgrid);
4351 #ifdef PME_TIME_SPREAD
4352 ct1a = omp_cyc_end(ct1a);
4353 cs1a[thread] += (double)ct1a;
4357 #ifdef PME_TIME_THREADS
4358 c2 = omp_cyc_end(c2);
4362 if (bSpread && pme->bUseThreads)
4364 #ifdef PME_TIME_THREADS
4365 c3 = omp_cyc_start();
4367 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
4368 for (thread = 0; thread < grids->nthread; thread++)
4370 reduce_threadgrid_overlap(pme, grids, thread,
4372 pme->overlap[0].sendbuf,
4373 pme->overlap[1].sendbuf,
4376 #ifdef PME_TIME_THREADS
4377 c3 = omp_cyc_end(c3);
4381 if (pme->nnodes > 1)
4383 /* Communicate the overlapping part of the fftgrid.
4384 * For this communication call we need to check pme->bUseThreads
4385 * to have all ranks communicate here, regardless of pme->nthread.
4387 sum_fftgrid_dd(pme, fftgrid, grid_index);
4391 #ifdef PME_TIME_THREADS
4395 printf("idx %.2f spread %.2f red %.2f",
4396 cs1*1e-9, cs2*1e-9, cs3*1e-9);
4397 #ifdef PME_TIME_SPREAD
4398 for (thread = 0; thread < nthread; thread++)
4400 printf(" %.2f", cs1a[thread]*1e-9);
4409 static void dump_grid(FILE *fp,
4410 int sx, int sy, int sz, int nx, int ny, int nz,
4411 int my, int mz, const real *g)
4415 for (x = 0; x < nx; x++)
4417 for (y = 0; y < ny; y++)
4419 for (z = 0; z < nz; z++)
4421 fprintf(fp, "%2d %2d %2d %6.3f\n",
4422 sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4428 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4430 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4432 gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
4438 pme->pmegrid_start_ix,
4439 pme->pmegrid_start_iy,
4440 pme->pmegrid_start_iz,
4441 pme->pmegrid_nx-pme->pme_order+1,
4442 pme->pmegrid_ny-pme->pme_order+1,
4443 pme->pmegrid_nz-pme->pme_order+1,
4450 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4452 pme_atomcomm_t *atc;
4455 if (pme->nnodes > 1)
4457 gmx_incons("gmx_pme_calc_energy called in parallel");
4459 if (pme->bFEP_q > 1)
4461 gmx_incons("gmx_pme_calc_energy with free energy");
4464 atc = &pme->atc_energy;
4466 if (atc->spline == NULL)
4468 snew(atc->spline, atc->nthread);
4471 atc->bSpread = TRUE;
4472 atc->pme_order = pme->pme_order;
4474 pme_realloc_atomcomm_things(atc);
4476 atc->coefficient = q;
4478 /* We only use the A-charges grid */
4479 grid = &pme->pmegrid[PME_GRID_QA];
4481 /* Only calculate the spline coefficients, don't actually spread */
4482 spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
4484 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4488 static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
4489 gmx_walltime_accounting_t walltime_accounting,
4490 t_nrnb *nrnb, t_inputrec *ir,
4493 /* Reset all the counters related to performance over the run */
4494 wallcycle_stop(wcycle, ewcRUN);
4495 wallcycle_reset_all(wcycle);
4497 if (ir->nsteps >= 0)
4499 /* ir->nsteps is not used here, but we update it for consistency */
4500 ir->nsteps -= step - ir->init_step;
4502 ir->init_step = step;
4503 wallcycle_start(wcycle, ewcRUN);
4504 walltime_accounting_start(walltime_accounting);
4508 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4510 t_commrec *cr, t_inputrec *ir,
4514 gmx_pme_t pme = NULL;
4517 while (ind < *npmedata)
4519 pme = (*pmedata)[ind];
4520 if (pme->nkx == grid_size[XX] &&
4521 pme->nky == grid_size[YY] &&
4522 pme->nkz == grid_size[ZZ])
4533 srenew(*pmedata, *npmedata);
4535 /* Generate a new PME data structure, copying part of the old pointers */
4536 gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4538 *pme_ret = (*pmedata)[ind];
4541 int gmx_pmeonly(gmx_pme_t pme,
4542 t_commrec *cr, t_nrnb *mynrnb,
4543 gmx_wallcycle_t wcycle,
4544 gmx_walltime_accounting_t walltime_accounting,
4545 real ewaldcoeff_q, real ewaldcoeff_lj,
4550 gmx_pme_pp_t pme_pp;
4554 rvec *x_pp = NULL, *f_pp = NULL;
4555 real *chargeA = NULL, *chargeB = NULL;
4556 real *c6A = NULL, *c6B = NULL;
4557 real *sigmaA = NULL, *sigmaB = NULL;
4560 int maxshift_x = 0, maxshift_y = 0;
4561 real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
4562 matrix vir_q, vir_lj;
4567 gmx_int64_t step, step_rel;
4570 /* This data will only use with PME tuning, i.e. switching PME grids */
4572 snew(pmedata, npmedata);
4575 pme_pp = gmx_pme_pp_init(cr);
4580 do /****** this is a quasi-loop over time steps! */
4582 /* The reason for having a loop here is PME grid tuning/switching */
4585 /* Domain decomposition */
4586 ret = gmx_pme_recv_coeffs_coords(pme_pp,
4592 &maxshift_x, &maxshift_y,
4593 &pme->bFEP_q, &pme->bFEP_lj,
4594 &lambda_q, &lambda_lj,
4598 grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
4600 if (ret == pmerecvqxSWITCHGRID)
4602 /* Switch the PME grid to grid_switch */
4603 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4606 if (ret == pmerecvqxRESETCOUNTERS)
4608 /* Reset the cycle and flop counters */
4609 reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
4612 while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4614 if (ret == pmerecvqxFINISH)
4616 /* We should stop: break out of the loop */
4620 step_rel = step - ir->init_step;
4624 wallcycle_start(wcycle, ewcRUN);
4625 walltime_accounting_start(walltime_accounting);
4628 wallcycle_start(wcycle, ewcPMEMESH);
4635 gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
4636 chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
4637 cr, maxshift_x, maxshift_y, mynrnb, wcycle,
4638 vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
4639 &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
4640 pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4642 cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4644 gmx_pme_send_force_vir_ener(pme_pp,
4645 f_pp, vir_q, energy_q, vir_lj, energy_lj,
4646 dvdlambda_q, dvdlambda_lj, cycles);
4649 } /***** end of quasi-loop, we stop with the break above */
4652 walltime_accounting_end(walltime_accounting);
4658 calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
4662 for (i = 0; i < pme->atc[0].n; ++i)
4666 sigma4 = local_sigma[i];
4667 sigma4 = sigma4*sigma4;
4668 sigma4 = sigma4*sigma4;
4669 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
4674 calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
4678 for (i = 0; i < pme->atc[0].n; ++i)
4680 pme->atc[0].coefficient[i] *= local_sigma[i];
4685 do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
4686 gmx_bool bFirst, rvec x[], real *data)
4689 pme_atomcomm_t *atc;
4692 for (d = pme->ndecompdim - 1; d >= 0; d--)
4698 if (d == pme->ndecompdim - 1)
4706 n_d = pme->atc[d + 1].n;
4708 param_d = atc->coefficient;
4712 if (atc->npd > atc->pd_nalloc)
4714 atc->pd_nalloc = over_alloc_dd(atc->npd);
4715 srenew(atc->pd, atc->pd_nalloc);
4717 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4719 /* Redistribute x (only once) and qA/c6A or qB/c6B */
4720 if (DOMAINDECOMP(cr))
4722 dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
4727 int gmx_pme_do(gmx_pme_t pme,
4728 int start, int homenr,
4730 real *chargeA, real *chargeB,
4731 real *c6A, real *c6B,
4732 real *sigmaA, real *sigmaB,
4733 matrix box, t_commrec *cr,
4734 int maxshift_x, int maxshift_y,
4735 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4736 matrix vir_q, real ewaldcoeff_q,
4737 matrix vir_lj, real ewaldcoeff_lj,
4738 real *energy_q, real *energy_lj,
4739 real lambda_q, real lambda_lj,
4740 real *dvdlambda_q, real *dvdlambda_lj,
4743 int d, i, j, k, ntot, npme, grid_index, max_grid_index;
4746 pme_atomcomm_t *atc = NULL;
4747 pmegrids_t *pmegrid = NULL;
4751 real *coefficient = NULL;
4756 gmx_parallel_3dfft_t pfft_setup;
4758 t_complex * cfftgrid;
4760 gmx_bool bFirst, bDoSplines;
4762 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
4763 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4764 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4766 assert(pme->nnodes > 0);
4767 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4769 if (pme->nnodes > 1)
4773 if (atc->npd > atc->pd_nalloc)
4775 atc->pd_nalloc = over_alloc_dd(atc->npd);
4776 srenew(atc->pd, atc->pd_nalloc);
4778 for (d = pme->ndecompdim-1; d >= 0; d--)
4781 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4787 /* This could be necessary for TPI */
4788 pme->atc[0].n = homenr;
4789 if (DOMAINDECOMP(cr))
4791 pme_realloc_atomcomm_things(atc);
4797 m_inv_ur0(box, pme->recipbox);
4800 /* For simplicity, we construct the splines for all particles if
4801 * more than one PME calculations is needed. Some optimization
4802 * could be done by keeping track of which atoms have splines
4803 * constructed, and construct new splines on each pass for atoms
4804 * that don't yet have them.
4807 bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
4809 /* We need a maximum of four separate PME calculations:
4810 * grid_index=0: Coulomb PME with charges from state A
4811 * grid_index=1: Coulomb PME with charges from state B
4812 * grid_index=2: LJ PME with C6 from state A
4813 * grid_index=3: LJ PME with C6 from state B
4814 * For Lorentz-Berthelot combination rules, a separate loop is used to
4815 * calculate all the terms
4818 /* If we are doing LJ-PME with LB, we only do Q here */
4819 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
4821 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
4823 /* Check if we should do calculations at this grid_index
4824 * If grid_index is odd we should be doing FEP
4825 * If grid_index < 2 we should be doing electrostatic PME
4826 * If grid_index >= 2 we should be doing LJ-PME
4828 if ((grid_index < DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
4829 (grid_index == 1 && !pme->bFEP_q))) ||
4830 (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
4831 (grid_index == 3 && !pme->bFEP_lj))))
4835 /* Unpack structure */
4836 pmegrid = &pme->pmegrid[grid_index];
4837 fftgrid = pme->fftgrid[grid_index];
4838 cfftgrid = pme->cfftgrid[grid_index];
4839 pfft_setup = pme->pfft_setup[grid_index];
4842 case 0: coefficient = chargeA + start; break;
4843 case 1: coefficient = chargeB + start; break;
4844 case 2: coefficient = c6A + start; break;
4845 case 3: coefficient = c6B + start; break;
4848 grid = pmegrid->grid.grid;
4852 fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
4853 cr->nnodes, cr->nodeid);
4854 fprintf(debug, "Grid = %p\n", (void*)grid);
4857 gmx_fatal(FARGS, "No grid!");
4862 if (pme->nnodes == 1)
4864 atc->coefficient = coefficient;
4868 wallcycle_start(wcycle, ewcPME_REDISTXF);
4869 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
4872 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4877 fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
4878 cr->nodeid, atc->n);
4881 if (flags & GMX_PME_SPREAD)
4883 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4885 /* Spread the coefficients on a grid */
4886 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
4890 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4892 inc_nrnb(nrnb, eNR_SPREADBSP,
4893 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4895 if (!pme->bUseThreads)
4897 wrap_periodic_pmegrid(pme, grid);
4899 /* sum contributions to local grid from other nodes */
4901 if (pme->nnodes > 1)
4903 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
4908 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
4911 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4914 dump_local_fftgrid(pme,fftgrid);
4919 /* Here we start a large thread parallel region */
4920 #pragma omp parallel num_threads(pme->nthread) private(thread)
4922 thread = gmx_omp_get_thread_num();
4923 if (flags & GMX_PME_SOLVE)
4930 wallcycle_start(wcycle, ewcPME_FFT);
4932 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4936 wallcycle_stop(wcycle, ewcPME_FFT);
4940 /* solve in k-space for our local cells */
4943 wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4945 if (grid_index < DO_Q)
4948 solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
4949 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4951 pme->nthread, thread);
4956 solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
4957 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4959 pme->nthread, thread);
4964 wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4966 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4976 wallcycle_start(wcycle, ewcPME_FFT);
4978 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4982 wallcycle_stop(wcycle, ewcPME_FFT);
4986 if (pme->nodeid == 0)
4988 ntot = pme->nkx*pme->nky*pme->nkz;
4989 npme = ntot*log((real)ntot)/log(2.0);
4990 inc_nrnb(nrnb, eNR_FFT, 2*npme);
4993 /* Note: this wallcycle region is closed below
4994 outside an OpenMP region, so take care if
4995 refactoring code here. */
4996 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4999 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
5002 /* End of thread parallel section.
5003 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
5008 /* distribute local grid to all nodes */
5010 if (pme->nnodes > 1)
5012 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
5017 unwrap_periodic_pmegrid(pme, grid);
5019 /* interpolate forces for our local atoms */
5023 /* If we are running without parallelization,
5024 * atc->f is the actual force array, not a buffer,
5025 * therefore we should not clear it.
5027 lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
5028 bClearF = (bFirst && PAR(cr));
5029 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
5030 for (thread = 0; thread < pme->nthread; thread++)
5032 gather_f_bsplines(pme, grid, bClearF, atc,
5033 &atc->spline[thread],
5034 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
5039 inc_nrnb(nrnb, eNR_GATHERFBSP,
5040 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
5041 /* Note: this wallcycle region is opened above inside an OpenMP
5042 region, so take care if refactoring code here. */
5043 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5048 /* This should only be called on the master thread
5049 * and after the threads have synchronized.
5053 get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5057 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5061 } /* of grid_index-loop */
5063 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
5066 if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
5068 /* Loop over A- and B-state if we are doing FEP */
5069 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
5071 real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
5072 if (pme->nnodes == 1)
5074 if (pme->lb_buf1 == NULL)
5076 pme->lb_buf_nalloc = pme->atc[0].n;
5077 snew(pme->lb_buf1, pme->lb_buf_nalloc);
5079 pme->atc[0].coefficient = pme->lb_buf1;
5084 local_sigma = sigmaA;
5088 local_sigma = sigmaB;
5091 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5101 RedistSigma = sigmaA;
5105 RedistSigma = sigmaB;
5108 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5110 wallcycle_start(wcycle, ewcPME_REDISTXF);
5112 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
5113 if (pme->lb_buf_nalloc < atc->n)
5115 pme->lb_buf_nalloc = atc->nalloc;
5116 srenew(pme->lb_buf1, pme->lb_buf_nalloc);
5117 srenew(pme->lb_buf2, pme->lb_buf_nalloc);
5119 local_c6 = pme->lb_buf1;
5120 for (i = 0; i < atc->n; ++i)
5122 local_c6[i] = atc->coefficient[i];
5126 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
5127 local_sigma = pme->lb_buf2;
5128 for (i = 0; i < atc->n; ++i)
5130 local_sigma[i] = atc->coefficient[i];
5134 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5136 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5138 /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
5139 for (grid_index = 2; grid_index < 9; ++grid_index)
5141 /* Unpack structure */
5142 pmegrid = &pme->pmegrid[grid_index];
5143 fftgrid = pme->fftgrid[grid_index];
5144 cfftgrid = pme->cfftgrid[grid_index];
5145 pfft_setup = pme->pfft_setup[grid_index];
5146 calc_next_lb_coeffs(pme, local_sigma);
5147 grid = pmegrid->grid.grid;
5150 if (flags & GMX_PME_SPREAD)
5152 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5153 /* Spread the c6 on a grid */
5154 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
5158 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
5161 inc_nrnb(nrnb, eNR_SPREADBSP,
5162 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
5163 if (pme->nthread == 1)
5165 wrap_periodic_pmegrid(pme, grid);
5166 /* sum contributions to local grid from other nodes */
5168 if (pme->nnodes > 1)
5170 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
5174 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
5176 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5178 /*Here we start a large thread parallel region*/
5179 #pragma omp parallel num_threads(pme->nthread) private(thread)
5181 thread = gmx_omp_get_thread_num();
5182 if (flags & GMX_PME_SOLVE)
5187 wallcycle_start(wcycle, ewcPME_FFT);
5190 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
5194 wallcycle_stop(wcycle, ewcPME_FFT);
5201 if (flags & GMX_PME_SOLVE)
5203 /* solve in k-space for our local cells */
5204 #pragma omp parallel num_threads(pme->nthread) private(thread)
5207 thread = gmx_omp_get_thread_num();
5210 wallcycle_start(wcycle, ewcLJPME);
5214 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
5215 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
5217 pme->nthread, thread);
5220 wallcycle_stop(wcycle, ewcLJPME);
5222 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
5229 /* This should only be called on the master thread and
5230 * after the threads have synchronized.
5232 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
5237 bFirst = !(flags & GMX_PME_DO_COULOMB);
5238 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5239 for (grid_index = 8; grid_index >= 2; --grid_index)
5241 /* Unpack structure */
5242 pmegrid = &pme->pmegrid[grid_index];
5243 fftgrid = pme->fftgrid[grid_index];
5244 cfftgrid = pme->cfftgrid[grid_index];
5245 pfft_setup = pme->pfft_setup[grid_index];
5246 grid = pmegrid->grid.grid;
5247 calc_next_lb_coeffs(pme, local_sigma);
5249 #pragma omp parallel num_threads(pme->nthread) private(thread)
5251 thread = gmx_omp_get_thread_num();
5256 wallcycle_start(wcycle, ewcPME_FFT);
5259 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
5263 wallcycle_stop(wcycle, ewcPME_FFT);
5267 if (pme->nodeid == 0)
5269 ntot = pme->nkx*pme->nky*pme->nkz;
5270 npme = ntot*log((real)ntot)/log(2.0);
5271 inc_nrnb(nrnb, eNR_FFT, 2*npme);
5273 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5276 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
5278 } /*#pragma omp parallel*/
5280 /* distribute local grid to all nodes */
5282 if (pme->nnodes > 1)
5284 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
5289 unwrap_periodic_pmegrid(pme, grid);
5291 /* interpolate forces for our local atoms */
5293 bClearF = (bFirst && PAR(cr));
5294 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
5295 scale *= lb_scale_factor[grid_index-2];
5296 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
5297 for (thread = 0; thread < pme->nthread; thread++)
5299 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
5300 &pme->atc[0].spline[thread],
5305 inc_nrnb(nrnb, eNR_GATHERFBSP,
5306 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
5307 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5310 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
5312 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
5313 } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
5315 if (bCalcF && pme->nnodes > 1)
5317 wallcycle_start(wcycle, ewcPME_REDISTXF);
5318 for (d = 0; d < pme->ndecompdim; d++)
5321 if (d == pme->ndecompdim - 1)
5328 n_d = pme->atc[d+1].n;
5329 f_d = pme->atc[d+1].f;
5331 if (DOMAINDECOMP(cr))
5333 dd_pmeredist_f(pme, atc, n_d, f_d,
5334 d == pme->ndecompdim-1 && pme->bPPnode);
5338 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5344 if (flags & GMX_PME_DO_COULOMB)
5348 *energy_q = energy_AB[0];
5349 m_add(vir_q, vir_AB[0], vir_q);
5353 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
5354 *dvdlambda_q += energy_AB[1] - energy_AB[0];
5355 for (i = 0; i < DIM; i++)
5357 for (j = 0; j < DIM; j++)
5359 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
5360 lambda_q*vir_AB[1][i][j];
5366 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
5374 if (flags & GMX_PME_DO_LJ)
5378 *energy_lj = energy_AB[2];
5379 m_add(vir_lj, vir_AB[2], vir_lj);
5383 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
5384 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
5385 for (i = 0; i < DIM; i++)
5387 for (j = 0; j < DIM; j++)
5389 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
5395 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);