2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
39 * \brief This file contains function definitions necessary for
40 * computing energies and forces for the PME long-ranged part (Coulomb and LJ).
43 * \author Erik Lindahl <erik@kth.se>
44 * \author Berk Hess <hess@kth.se>
45 * \ingroup module_ewald
47 /* IMPORTANT FOR DEVELOPERS:
49 * Triclinic pme stuff isn't entirely trivial, and we've experienced
50 * some bugs during development (many of them due to me). To avoid
51 * this in the future, please check the following things if you make
52 * changes in this file:
54 * 1. You should obtain identical (at least to the PME precision)
55 * energies, forces, and virial for
56 * a rectangular box and a triclinic one where the z (or y) axis is
57 * tilted by a whole box side. For instance you could use boxes like these:
59 * rectangular triclinic
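 *   (illustrative numbers, not the original example: any two boxes related
 *    by shifting one box vector by a whole other box vector describe the
 *    same periodic lattice and must therefore give the same results)
 *
 *     3  0  0          3  0  0
 *     0  3  0          0  3  0
 *     0  0  9          3  0  9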
64 * 2. You should check the energy conservation in a triclinic box.
66 * It might seem like overkill, but better safe than sorry.
84 #include "gromacs/fft/parallel_3dfft.h"
85 #include "gromacs/fileio/pdbio.h"
86 #include "gromacs/gmxlib/network.h"
87 #include "gromacs/gmxlib/nrnb.h"
88 #include "gromacs/math/gmxcomplex.h"
89 #include "gromacs/math/invertmatrix.h"
90 #include "gromacs/math/units.h"
91 #include "gromacs/math/vec.h"
92 #include "gromacs/math/vectypes.h"
93 #include "gromacs/mdtypes/commrec.h"
94 #include "gromacs/mdtypes/forcerec.h"
95 #include "gromacs/mdtypes/inputrec.h"
96 #include "gromacs/mdtypes/md_enums.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/basedefinitions.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/gmxomp.h"
106 #include "gromacs/utility/real.h"
107 #include "gromacs/utility/smalloc.h"
109 #include "calculate-spline-moduli.h"
110 #include "pme-gather.h"
111 #include "pme-grid.h"
112 #include "pme-internal.h"
113 #include "pme-redistribute.h"
114 #include "pme-solve.h"
115 #include "pme-spline-work.h"
116 #include "pme-spread.h"
118 /*! \brief Number of bytes in a cache line.
120 * Must also be a multiple of the SIMD and SIMD4 register size, to
121 * preserve alignment.
123 const int gmxCacheLineSize = 64;
125 //! Set up coordinate communication
126 static void setup_coordinate_communication(pme_atomcomm_t *atc)
134 for (i = 1; i <= nslab/2; i++)
136 fw = (atc->nodeid + i) % nslab;
137 bw = (atc->nodeid - i + nslab) % nslab;
140 atc->node_dest[n] = fw;
141 atc->node_src[n] = bw;
146 atc->node_dest[n] = bw;
147 atc->node_src[n] = fw;
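/* A sketch of the resulting ring schedule (assuming the elided guards above
 * only avoid adding the same slab twice when nslab is even): for nslab = 4
 * and nodeid = 0 this yields
 *   node_dest = { 1, 3, 2 },  node_src = { 3, 1, 2 }
 * i.e. slabs are paired forward/backward around the ring with increasing
 * distance.
 */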
153 int gmx_pme_destroy(FILE *log, struct gmx_pme_t **pmedata)
159 fprintf(log, "Destroying PME data structures.\n");
162 sfree((*pmedata)->nnx);
163 sfree((*pmedata)->nny);
164 sfree((*pmedata)->nnz);
166 for (i = 0; i < (*pmedata)->ngrids; ++i)
168 pmegrids_destroy(&(*pmedata)->pmegrid[i]);
169 sfree((*pmedata)->fftgrid[i]);
170 sfree((*pmedata)->cfftgrid[i]);
171 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
174 sfree((*pmedata)->lb_buf1);
175 sfree((*pmedata)->lb_buf2);
177 pme_free_all_work(&(*pmedata)->solve_work, (*pmedata)->nthread);
185 /*! \brief Round \p n up to the next multiple of \p f */
186 static int mult_up(int n, int f)
188 return ((n + f - 1)/f)*f;
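/* For example, mult_up(60, 7) = ((60 + 6)/7)*7 = 9*7 = 63, while
 * mult_up(63, 7) = 63 since 63 is already a multiple of 7.
 */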
191 /*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
192 static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
197 nma = pme->nnodes_major;
198 nmi = pme->nnodes_minor;
200 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
201 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
202 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
204 /* pme_solve is roughly double the cost of an fft */
206 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
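/* Worked example (using the formula exactly as written above): a 60x60x60
 * grid on a 7x4 rank decomposition pads 60 up to mult_up(60,7) = 63 along
 * the major dimension, giving
 *   (63*60*60 + 63*60*60 + 3*63*60*60) / (6*60*60*60) = 0.875
 * versus the unpadded baseline of 5/6 ~= 0.833; the caller below warns
 * once this ratio reaches 1.2.
 */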
209 /*! \brief Initialize atom communication data structure */
210 static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
211 int dimind, gmx_bool bSpread)
215 atc->dimind = dimind;
222 atc->mpi_comm = pme->mpi_comm_d[dimind];
223 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
224 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
228 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
232 atc->bSpread = bSpread;
233 atc->pme_order = pme->pme_order;
237 snew(atc->node_dest, atc->nslab);
238 snew(atc->node_src, atc->nslab);
239 setup_coordinate_communication(atc);
241 snew(atc->count_thread, pme->nthread);
242 for (thread = 0; thread < pme->nthread; thread++)
244 snew(atc->count_thread[thread], atc->nslab);
246 atc->count = atc->count_thread[0];
247 snew(atc->rcount, atc->nslab);
248 snew(atc->buf_index, atc->nslab);
251 atc->nthread = pme->nthread;
252 if (atc->nthread > 1)
254 snew(atc->thread_plist, atc->nthread);
256 snew(atc->spline, atc->nthread);
257 for (thread = 0; thread < atc->nthread; thread++)
259 if (atc->nthread > 1)
261 snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
262 atc->thread_plist[thread].n += gmxCacheLineSize;
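/* The gmxCacheLineSize elements of padding on either side of the per-thread
 * counters are presumably there to keep each thread's data on its own cache
 * line(s) and avoid false sharing; the pointer is offset so that index 0
 * starts just past the leading pad.
 */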
264 snew(atc->spline[thread].thread_one, pme->nthread);
265 atc->spline[thread].thread_one[thread] = 1;
269 /*! \brief Initialize data structure for communication */
271 init_overlap_comm(pme_overlap_t * ol,
282 pme_grid_comm_t *pgc;
284 int fft_start, fft_end, send_index1, recv_index1;
294 /* Linear translation of the PME grid won't affect reciprocal space
295 * calculations, so to optimize we only interpolate "upwards",
296 * which also means we only have to consider overlap in one direction.
297 * I.e., particles on this node might also be spread to grid indices
298 * that belong to higher nodes (modulo nnodes)
301 snew(ol->s2g0, ol->nnodes+1);
302 snew(ol->s2g1, ol->nnodes);
305 fprintf(debug, "PME slab boundaries:");
307 for (i = 0; i < nnodes; i++)
309 /* s2g0 the local interpolation grid start.
310 * s2g1 the local interpolation grid end.
311 * Since in calc_pidx we divide particles, not grid lines, uniformly
312 * in space along dimension x or y, we need to round
313 * s2g0 down and s2g1 up.
315 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
316 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
320 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
323 ol->s2g0[nnodes] = ndata;
326 fprintf(debug, "\n");
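/* Worked example of the slab boundaries: ndata = 60 grid lines, nnodes = 4
 * and norder = pme_order = 4 give
 *   s2g0 = { 0, 15, 30, 45, 60 }
 *   s2g1 = { 18, 33, 48, 63 }
 * so each rank owns 15 lines and spreads norder-1 = 3 lines into the next
 * slab; with these numbers each rank overlaps only its immediate neighbour,
 * so the loop below ends with noverlap_nodes = 1.
 */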
329 /* Determine with how many nodes we need to communicate the grid overlap */
335 for (i = 0; i < nnodes; i++)
337 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
338 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
344 while (bCont && b < nnodes);
345 ol->noverlap_nodes = b - 1;
347 snew(ol->send_id, ol->noverlap_nodes);
348 snew(ol->recv_id, ol->noverlap_nodes);
349 for (b = 0; b < ol->noverlap_nodes; b++)
351 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
352 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
354 snew(ol->comm_data, ol->noverlap_nodes);
357 for (b = 0; b < ol->noverlap_nodes; b++)
359 pgc = &ol->comm_data[b];
361 fft_start = ol->s2g0[ol->send_id[b]];
362 fft_end = ol->s2g0[ol->send_id[b]+1];
363 if (ol->send_id[b] < nodeid)
368 send_index1 = ol->s2g1[nodeid];
369 send_index1 = std::min(send_index1, fft_end);
370 pgc->send_index0 = fft_start;
371 pgc->send_nindex = std::max(0, send_index1 - pgc->send_index0);
372 ol->send_size += pgc->send_nindex;
374 /* We always start receiving to the first index of our slab */
375 fft_start = ol->s2g0[ol->nodeid];
376 fft_end = ol->s2g0[ol->nodeid+1];
377 recv_index1 = ol->s2g1[ol->recv_id[b]];
378 if (ol->recv_id[b] > nodeid)
380 recv_index1 -= ndata;
382 recv_index1 = std::min(recv_index1, fft_end);
383 pgc->recv_index0 = fft_start;
384 pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
388 /* Communicate the buffer sizes to receive */
389 for (b = 0; b < ol->noverlap_nodes; b++)
391 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
392 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
393 ol->mpi_comm, &stat);
397 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
398 snew(ol->sendbuf, norder*commplainsize);
399 snew(ol->recvbuf, norder*commplainsize);
402 void gmx_pme_check_restrictions(int pme_order,
403 int nkx, int nky, int nkz,
406 gmx_bool bUseThreads,
408 gmx_bool *bValidSettings)
410 if (pme_order > PME_ORDER_MAX)
414 *bValidSettings = FALSE;
417 gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
418 pme_order, PME_ORDER_MAX);
421 if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
422 nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
427 *bValidSettings = FALSE;
430 gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
434 /* Check for a limitation of the (current) sum_fftgrid_dd code.
435 * We only allow multiple communication pulses in dim 1, not in dim 0.
437 if (bUseThreads && (nkx < nnodes_major*pme_order &&
438 nkx != nnodes_major*(pme_order - 1)))
442 *bValidSettings = FALSE;
445 gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
446 nkx/(double)nnodes_major, pme_order);
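/* Example of the x-dimension restriction above: pme_order = 4, 13 ranks
 * along x and nkx = 48 gives 48/13 ~= 3.7 grid lines per rank, which is
 * neither >= pme_order nor equal to pme_order-1 per rank (13*3 = 39 != 48),
 * so the run is refused when OpenMP threading is in use.
 */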
449 if (bValidSettings != NULL)
451 *bValidSettings = TRUE;
457 /*! \brief Return \p enumerator divided by \p denominator, rounded up */
458 static int div_round_up(int enumerator, int denominator)
460 return (enumerator + denominator - 1)/denominator;
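/* E.g. div_round_up(60, 7) = (60 + 6)/7 = 9; note that mult_up(n, f) above
 * equals div_round_up(n, f)*f.
 */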
463 int gmx_pme_init(struct gmx_pme_t **pmedata,
469 gmx_bool bFreeEnergy_q,
470 gmx_bool bFreeEnergy_lj,
471 gmx_bool bReproducible,
474 struct gmx_pme_t *pme = NULL;
476 int use_threads, sum_use_threads, i;
481 fprintf(debug, "Creating PME data structures.\n");
485 pme->sum_qgrid_tmp = NULL;
486 pme->sum_qgrid_dd_tmp = NULL;
492 pme->nnodes_major = nnodes_major;
493 pme->nnodes_minor = nnodes_minor;
496 if (nnodes_major*nnodes_minor > 1)
498 pme->mpi_comm = cr->mpi_comm_mygroup;
500 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
501 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
502 if (pme->nnodes != nnodes_major*nnodes_minor)
504 gmx_incons("PME rank count mismatch");
509 pme->mpi_comm = MPI_COMM_NULL;
513 if (pme->nnodes == 1)
516 pme->mpi_comm_d[0] = MPI_COMM_NULL;
517 pme->mpi_comm_d[1] = MPI_COMM_NULL;
520 pme->nodeid_major = 0;
521 pme->nodeid_minor = 0;
523 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
528 if (nnodes_minor == 1)
531 pme->mpi_comm_d[0] = pme->mpi_comm;
532 pme->mpi_comm_d[1] = MPI_COMM_NULL;
535 pme->nodeid_major = pme->nodeid;
536 pme->nodeid_minor = 0;
539 else if (nnodes_major == 1)
542 pme->mpi_comm_d[0] = MPI_COMM_NULL;
543 pme->mpi_comm_d[1] = pme->mpi_comm;
546 pme->nodeid_major = 0;
547 pme->nodeid_minor = pme->nodeid;
551 if (pme->nnodes % nnodes_major != 0)
553 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
558 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
559 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
560 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
561 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
563 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
564 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
565 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
566 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
569 pme->bPPnode = (cr->duty & DUTY_PP);
572 pme->nthread = nthread;
574 /* Check if any of the PME MPI ranks uses threads */
575 use_threads = (pme->nthread > 1 ? 1 : 0);
579 MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
580 MPI_SUM, pme->mpi_comm);
585 sum_use_threads = use_threads;
587 pme->bUseThreads = (sum_use_threads > 0);
589 if (ir->ePBC == epbcSCREW)
591 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
594 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
595 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
596 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
600 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
601 pme->pme_order = ir->pme_order;
603 /* Always constant electrostatics coefficients */
604 pme->epsilon_r = ir->epsilon_r;
606 /* Always constant LJ coefficients */
607 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
609 /* If we violate restrictions, generate a fatal error here */
610 gmx_pme_check_restrictions(pme->pme_order,
611 pme->nkx, pme->nky, pme->nkz,
623 MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
624 MPI_Type_commit(&(pme->rvec_mpi));
627 /* Note that the coefficient spreading and force gathering, which usually
628 * take about the same amount of time as FFT+solve_pme,
629 * are always fully load balanced
630 * (unless the coefficient distribution is inhomogeneous).
633 imbal = estimate_pme_load_imbalance(pme);
634 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
638 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
639 " For optimal PME load balancing\n"
640 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
641 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
643 (int)((imbal-1)*100 + 0.5),
644 pme->nkx, pme->nky, pme->nnodes_major,
645 pme->nky, pme->nkz, pme->nnodes_minor);
649 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
650 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
651 * y is always copied through a buffer: we don't need padding in z,
652 * but we do need the overlap in x because of the communication order.
654 init_overlap_comm(&pme->overlap[0], pme->pme_order,
658 pme->nnodes_major, pme->nodeid_major,
660 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
662 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
663 * We do this with an offset buffer of equal size, so we need to allocate
664 * extra for the offset. That's what the (+1)*pme->nkz is for.
666 init_overlap_comm(&pme->overlap[1], pme->pme_order,
670 pme->nnodes_minor, pme->nodeid_minor,
672 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
674 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
675 * Note that gmx_pme_check_restrictions checked for this already.
677 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
679 gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
682 snew(pme->bsp_mod[XX], pme->nkx);
683 snew(pme->bsp_mod[YY], pme->nky);
684 snew(pme->bsp_mod[ZZ], pme->nkz);
686 /* The required size of the interpolation grid, including overlap.
687 * The allocated size (pmegrid_n?) might be slightly larger.
689 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
690 pme->overlap[0].s2g0[pme->nodeid_major];
691 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
692 pme->overlap[1].s2g0[pme->nodeid_minor];
693 pme->pmegrid_nz_base = pme->nkz;
694 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
695 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
697 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
698 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
699 pme->pmegrid_start_iz = 0;
701 make_gridindex5_to_localindex(pme->nkx,
702 pme->pmegrid_start_ix,
703 pme->pmegrid_nx - (pme->pme_order-1),
704 &pme->nnx, &pme->fshx);
705 make_gridindex5_to_localindex(pme->nky,
706 pme->pmegrid_start_iy,
707 pme->pmegrid_ny - (pme->pme_order-1),
708 &pme->nny, &pme->fshy);
709 make_gridindex5_to_localindex(pme->nkz,
710 pme->pmegrid_start_iz,
711 pme->pmegrid_nz_base,
712 &pme->nnz, &pme->fshz);
714 pme->spline_work = make_pme_spline_work(pme->pme_order);
719 /* It doesn't matter if we allocate too many grids here,
720 * we only allocate and use the ones we need.
722 if (EVDW_PME(ir->vdwtype))
724 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
730 snew(pme->fftgrid, pme->ngrids);
731 snew(pme->cfftgrid, pme->ngrids);
732 snew(pme->pfft_setup, pme->ngrids);
734 for (i = 0; i < pme->ngrids; ++i)
736 if ((i < DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
738 (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
740 ir->ljpme_combination_rule == eljpmeLB)))
742 pmegrids_init(&pme->pmegrid[i],
743 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
744 pme->pmegrid_nz_base,
748 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
749 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
750 /* This routine will allocate the grid data to fit the FFTs */
751 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
752 &pme->fftgrid[i], &pme->cfftgrid[i],
754 bReproducible, pme->nthread);
761 /* Use plain SPME B-spline interpolation */
762 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
766 /* Use the P3M grid-optimized influence function */
767 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
770 /* Use atc[0] for spreading */
771 init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
772 if (pme->ndecompdim >= 2)
774 init_atomcomm(pme, &pme->atc[1], 1, FALSE);
777 if (pme->nnodes == 1)
779 pme->atc[0].n = homenr;
780 pme_realloc_atomcomm_things(&pme->atc[0]);
785 pme->lb_buf_nalloc = 0;
787 pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
794 int gmx_pme_reinit(struct gmx_pme_t **pmedata,
796 struct gmx_pme_t * pme_src,
797 const t_inputrec * ir,
805 irc.nkx = grid_size[XX];
806 irc.nky = grid_size[YY];
807 irc.nkz = grid_size[ZZ];
809 if (pme_src->nnodes == 1)
811 homenr = pme_src->atc[0].n;
818 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
819 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
823 /* We can easily reuse the allocated pme grids in pme_src */
824 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
825 /* We would like to reuse the fft grids, but that's harder */
831 void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
838 gmx_incons("gmx_pme_calc_energy called in parallel");
842 gmx_incons("gmx_pme_calc_energy with free energy");
845 atc = &pme->atc_energy;
847 if (atc->spline == NULL)
849 snew(atc->spline, atc->nthread);
853 atc->pme_order = pme->pme_order;
855 pme_realloc_atomcomm_things(atc);
857 atc->coefficient = q;
859 /* We only use the A-charges grid */
860 grid = &pme->pmegrid[PME_GRID_QA];
862 /* Only calculate the spline coefficients, don't actually spread */
863 spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
865 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
868 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
870 calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
873 for (i = 0; i < pme->atc[0].n; ++i)
876 sigma4 = local_sigma[i];
877 sigma4 = sigma4*sigma4;
878 sigma4 = sigma4*sigma4;
879 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
883 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
885 calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
889 for (i = 0; i < pme->atc[0].n; ++i)
891 pme->atc[0].coefficient[i] *= local_sigma[i];
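/* Taken together with calc_initial_lb_coeffs above: the per-atom coefficient
 * starts as c6_i/sigma_i^4 and is multiplied by sigma_i once before each of
 * the seven LB grids, so the spreading loop below uses
 *   c6_i*sigma_i^-3, c6_i*sigma_i^-2, ..., c6_i*sigma_i^3
 * for grid_index = 2..8, one extra power of sigma per grid.
 */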
895 int gmx_pme_do(struct gmx_pme_t *pme,
896 int start, int homenr,
898 real chargeA[], real chargeB[],
899 real c6A[], real c6B[],
900 real sigmaA[], real sigmaB[],
901 matrix box, t_commrec *cr,
902 int maxshift_x, int maxshift_y,
903 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
904 matrix vir_q, real ewaldcoeff_q,
905 matrix vir_lj, real ewaldcoeff_lj,
906 real *energy_q, real *energy_lj,
907 real lambda_q, real lambda_lj,
908 real *dvdlambda_q, real *dvdlambda_lj,
911 int d, i, j, npme, grid_index, max_grid_index;
913 pme_atomcomm_t *atc = NULL;
914 pmegrids_t *pmegrid = NULL;
917 real *coefficient = NULL;
922 gmx_parallel_3dfft_t pfft_setup;
924 t_complex * cfftgrid;
926 gmx_bool bFirst, bDoSplines;
928 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
929 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
930 const gmx_bool bBackFFT = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
931 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
933 assert(pme->nnodes > 0);
934 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
940 if (atc->npd > atc->pd_nalloc)
942 atc->pd_nalloc = over_alloc_dd(atc->npd);
943 srenew(atc->pd, atc->pd_nalloc);
945 for (d = pme->ndecompdim-1; d >= 0; d--)
948 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
954 /* This could be necessary for TPI */
955 pme->atc[0].n = homenr;
956 if (DOMAINDECOMP(cr))
958 pme_realloc_atomcomm_things(atc);
964 gmx::invertBoxMatrix(box, pme->recipbox);
967 /* For simplicity, we construct the splines for all particles if
968 * more than one PME calculation is needed. Some optimization
969 * could be done by keeping track of which atoms have splines
970 * constructed, and construct new splines on each pass for atoms
971 * that don't yet have them.
974 bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
976 /* We need a maximum of four separate PME calculations:
977 * grid_index=0: Coulomb PME with charges from state A
978 * grid_index=1: Coulomb PME with charges from state B
979 * grid_index=2: LJ PME with C6 from state A
980 * grid_index=3: LJ PME with C6 from state B
981 * For Lorentz-Berthelot combination rules, a separate loop is used to
982 * calculate all the terms
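 * For example: plain Coulomb PME without free energy runs only
 * grid_index 0; Coulomb plus geometric-rule LJ-PME with free-energy
 * perturbation in both runs grid_index 0,1,2,3; with Lorentz-Berthelot
 * combination rules the LJ grids are skipped in this loop and handled
 * in the seven-term loop further below.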
985 /* If we are doing LJ-PME with LB, we only do Q here */
986 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
988 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
990 /* Check if we should do calculations at this grid_index
991 * If grid_index is odd we should be doing FEP
992 * If grid_index < 2 we should be doing electrostatic PME
993 * If grid_index >= 2 we should be doing LJ-PME
995 if ((grid_index < DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
996 (grid_index == 1 && !pme->bFEP_q))) ||
997 (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
998 (grid_index == 3 && !pme->bFEP_lj))))
1002 /* Unpack structure */
1003 pmegrid = &pme->pmegrid[grid_index];
1004 fftgrid = pme->fftgrid[grid_index];
1005 cfftgrid = pme->cfftgrid[grid_index];
1006 pfft_setup = pme->pfft_setup[grid_index];
1009 case 0: coefficient = chargeA + start; break;
1010 case 1: coefficient = chargeB + start; break;
1011 case 2: coefficient = c6A + start; break;
1012 case 3: coefficient = c6B + start; break;
1015 grid = pmegrid->grid.grid;
1019 fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
1020 cr->nnodes, cr->nodeid);
1021 fprintf(debug, "Grid = %p\n", (void*)grid);
1024 gmx_fatal(FARGS, "No grid!");
1029 if (pme->nnodes == 1)
1031 atc->coefficient = coefficient;
1035 wallcycle_start(wcycle, ewcPME_REDISTXF);
1036 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
1039 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1044 fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
1045 cr->nodeid, atc->n);
1048 if (flags & GMX_PME_SPREAD)
1050 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1052 /* Spread the coefficients on a grid */
1053 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1057 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1059 inc_nrnb(nrnb, eNR_SPREADBSP,
1060 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1062 if (!pme->bUseThreads)
1064 wrap_periodic_pmegrid(pme, grid);
1066 /* sum contributions to local grid from other nodes */
1068 if (pme->nnodes > 1)
1070 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1075 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1078 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1080 /* TODO If the OpenMP and single-threaded implementations
1081 converge, then spread_on_grid() and
1082 copy_pmegrid_to_fftgrid() will perhaps live in the same
1083 source file and the following debugging function can live there too.
1086 dump_local_fftgrid(pme, fftgrid);
1091 /* Here we start a large thread parallel region */
1092 #pragma omp parallel num_threads(pme->nthread) private(thread)
1096 thread = gmx_omp_get_thread_num();
1097 if (flags & GMX_PME_SOLVE)
1104 wallcycle_start(wcycle, ewcPME_FFT);
1106 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1110 wallcycle_stop(wcycle, ewcPME_FFT);
1114 /* solve in k-space for our local cells */
1117 wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1119 if (grid_index < DO_Q)
1122 solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
1123 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1125 pme->nthread, thread);
1130 solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
1131 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1133 pme->nthread, thread);
1138 wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1140 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1150 wallcycle_start(wcycle, ewcPME_FFT);
1152 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1156 wallcycle_stop(wcycle, ewcPME_FFT);
1160 if (pme->nodeid == 0)
1162 real ntot = pme->nkx*pme->nky*pme->nkz;
1163 npme = static_cast<int>(ntot*log(ntot)/log(2.0));
1164 inc_nrnb(nrnb, eNR_FFT, 2*npme);
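/* E.g. for a 64x64x64 grid, ntot = 262144 and log2(ntot) = 18, so roughly
 * 2*262144*18 ~= 9.4 million flops are booked here (only on PME rank 0),
 * presumably covering the forward and backward 3D FFT together.
 */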
1167 /* Note: this wallcycle region is closed below
1168 outside an OpenMP region, so take care if
1169 refactoring code here. */
1170 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1173 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1175 } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1177 /* End of thread parallel section.
1178 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
1183 /* distribute local grid to all nodes */
1185 if (pme->nnodes > 1)
1187 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1192 unwrap_periodic_pmegrid(pme, grid);
1197 /* interpolate forces for our local atoms */
1201 /* If we are running without parallelization,
1202 * atc->f is the actual force array, not a buffer,
1203 * therefore we should not clear it.
1205 lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
1206 bClearF = (bFirst && PAR(cr));
1207 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1208 for (thread = 0; thread < pme->nthread; thread++)
1212 gather_f_bsplines(pme, grid, bClearF, atc,
1213 &atc->spline[thread],
1214 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
1216 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1221 inc_nrnb(nrnb, eNR_GATHERFBSP,
1222 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1223 /* Note: this wallcycle region is opened above inside an OpenMP
1224 region, so take care if refactoring code here. */
1225 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1230 /* This should only be called on the master thread
1231 * and after the threads have synchronized.
1235 get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1239 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1243 } /* of grid_index-loop */
1245 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate the seven terms of the expansion separately. */
1248 if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
1250 /* Loop over A- and B-state if we are doing FEP */
1251 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
1253 real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
1254 if (pme->nnodes == 1)
1256 if (pme->lb_buf1 == NULL)
1258 pme->lb_buf_nalloc = pme->atc[0].n;
1259 snew(pme->lb_buf1, pme->lb_buf_nalloc);
1261 pme->atc[0].coefficient = pme->lb_buf1;
1266 local_sigma = sigmaA;
1270 local_sigma = sigmaB;
1273 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1283 RedistSigma = sigmaA;
1287 RedistSigma = sigmaB;
1290 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1292 wallcycle_start(wcycle, ewcPME_REDISTXF);
1294 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
1295 if (pme->lb_buf_nalloc < atc->n)
1297 pme->lb_buf_nalloc = atc->nalloc;
1298 srenew(pme->lb_buf1, pme->lb_buf_nalloc);
1299 srenew(pme->lb_buf2, pme->lb_buf_nalloc);
1301 local_c6 = pme->lb_buf1;
1302 for (i = 0; i < atc->n; ++i)
1304 local_c6[i] = atc->coefficient[i];
1308 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
1309 local_sigma = pme->lb_buf2;
1310 for (i = 0; i < atc->n; ++i)
1312 local_sigma[i] = atc->coefficient[i];
1316 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1318 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1320 /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
1321 for (grid_index = 2; grid_index < 9; ++grid_index)
1323 /* Unpack structure */
1324 pmegrid = &pme->pmegrid[grid_index];
1325 fftgrid = pme->fftgrid[grid_index];
1326 pfft_setup = pme->pfft_setup[grid_index];
1327 calc_next_lb_coeffs(pme, local_sigma);
1328 grid = pmegrid->grid.grid;
1331 if (flags & GMX_PME_SPREAD)
1333 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1334 /* Spread the c6 on a grid */
1335 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1339 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1342 inc_nrnb(nrnb, eNR_SPREADBSP,
1343 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1344 if (pme->nthread == 1)
1346 wrap_periodic_pmegrid(pme, grid);
1347 /* sum contributions to local grid from other nodes */
1349 if (pme->nnodes > 1)
1351 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1355 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1357 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1359 /* Here we start a large thread parallel region */
1360 #pragma omp parallel num_threads(pme->nthread) private(thread)
1364 thread = gmx_omp_get_thread_num();
1365 if (flags & GMX_PME_SOLVE)
1370 wallcycle_start(wcycle, ewcPME_FFT);
1373 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1377 wallcycle_stop(wcycle, ewcPME_FFT);
1382 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1386 if (flags & GMX_PME_SOLVE)
1388 /* solve in k-space for our local cells */
1389 #pragma omp parallel num_threads(pme->nthread) private(thread)
1394 thread = gmx_omp_get_thread_num();
1397 wallcycle_start(wcycle, ewcLJPME);
1401 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
1402 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1404 pme->nthread, thread);
1407 wallcycle_stop(wcycle, ewcLJPME);
1409 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1412 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1418 /* This should only be called on the master thread and
1419 * after the threads have synchronized.
1421 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
1426 bFirst = !(flags & GMX_PME_DO_COULOMB);
1427 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1428 for (grid_index = 8; grid_index >= 2; --grid_index)
1430 /* Unpack structure */
1431 pmegrid = &pme->pmegrid[grid_index];
1432 fftgrid = pme->fftgrid[grid_index];
1433 pfft_setup = pme->pfft_setup[grid_index];
1434 grid = pmegrid->grid.grid;
1435 calc_next_lb_coeffs(pme, local_sigma);
1437 #pragma omp parallel num_threads(pme->nthread) private(thread)
1441 thread = gmx_omp_get_thread_num();
1446 wallcycle_start(wcycle, ewcPME_FFT);
1449 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1453 wallcycle_stop(wcycle, ewcPME_FFT);
1457 if (pme->nodeid == 0)
1459 real ntot = pme->nkx*pme->nky*pme->nkz;
1460 npme = static_cast<int>(ntot*log(ntot)/log(2.0));
1461 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1463 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1466 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1468 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1469 } /*#pragma omp parallel*/
1471 /* distribute local grid to all nodes */
1473 if (pme->nnodes > 1)
1475 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1480 unwrap_periodic_pmegrid(pme, grid);
1484 /* interpolate forces for our local atoms */
1486 bClearF = (bFirst && PAR(cr));
1487 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
1488 scale *= lb_scale_factor[grid_index-2];
1490 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1491 for (thread = 0; thread < pme->nthread; thread++)
1495 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
1496 &pme->atc[0].spline[thread],
1499 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1504 inc_nrnb(nrnb, eNR_GATHERFBSP,
1505 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1507 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1510 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
1512 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
1513 } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
1515 if (bCalcF && pme->nnodes > 1)
1517 wallcycle_start(wcycle, ewcPME_REDISTXF);
1518 for (d = 0; d < pme->ndecompdim; d++)
1521 if (d == pme->ndecompdim - 1)
1528 n_d = pme->atc[d+1].n;
1529 f_d = pme->atc[d+1].f;
1531 if (DOMAINDECOMP(cr))
1533 dd_pmeredist_f(pme, atc, n_d, f_d,
1534 d == pme->ndecompdim-1 && pme->bPPnode);
1538 wallcycle_stop(wcycle, ewcPME_REDISTXF);
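/* The reductions below mix the A- and B-state contributions linearly in
 * lambda, E(lambda) = (1-lambda)*E_A + lambda*E_B, so the free-energy
 * derivative contribution is simply dE/dlambda = E_B - E_A; the virial
 * tensors are combined with the same weights.
 */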
1544 if (flags & GMX_PME_DO_COULOMB)
1548 *energy_q = energy_AB[0];
1549 m_add(vir_q, vir_AB[0], vir_q);
1553 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
1554 *dvdlambda_q += energy_AB[1] - energy_AB[0];
1555 for (i = 0; i < DIM; i++)
1557 for (j = 0; j < DIM; j++)
1559 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
1560 lambda_q*vir_AB[1][i][j];
1566 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
1574 if (flags & GMX_PME_DO_LJ)
1578 *energy_lj = energy_AB[2];
1579 m_add(vir_lj, vir_AB[2], vir_lj);
1583 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
1584 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
1585 for (i = 0; i < DIM; i++)
1587 for (j = 0; j < DIM; j++)
1589 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
1595 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);