1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief This file contains function definitions necessary for
40  * computing energies and forces for the PME long-ranged part (Coulomb
41  * and LJ).
42  *
43  * \author Erik Lindahl <erik@kth.se>
44  * \author Berk Hess <hess@kth.se>
45  * \ingroup module_ewald
46  */
47 /* IMPORTANT FOR DEVELOPERS:
48  *
49  * Triclinic pme stuff isn't entirely trivial, and we've experienced
50  * some bugs during development (many of them due to me). To avoid
51  * this in the future, please check the following things if you make
52  * changes in this file:
53  *
54  * 1. You should obtain identical (at least to the PME precision)
55  *    energies, forces, and virial for
56  *    a rectangular box and a triclinic one where the z (or y) axis is
57  *    tilted a whole box side. For instance you could use these boxes:
58  *
59  *    rectangular       triclinic
60  *     2  0  0           2  0  0
61  *     0  2  0           0  2  0
62  *     0  0  6           2  2  6
63  *
64  * 2. You should check the energy conservation in a triclinic box.
65  *
66  * It might seem like overkill, but better safe than sorry.
67  * /Erik 001109
68  */
69
70 #include "gmxpre.h"
71
72 #include "pme.h"
73
74 #include "config.h"
75
76 #include <assert.h>
77 #include <math.h>
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
81
82 #include <algorithm>
83
84 #include "gromacs/fft/parallel_3dfft.h"
85 #include "gromacs/fileio/pdbio.h"
86 #include "gromacs/gmxlib/network.h"
87 #include "gromacs/gmxlib/nrnb.h"
88 #include "gromacs/math/gmxcomplex.h"
89 #include "gromacs/math/invertmatrix.h"
90 #include "gromacs/math/units.h"
91 #include "gromacs/math/vec.h"
92 #include "gromacs/math/vectypes.h"
93 #include "gromacs/mdtypes/commrec.h"
94 #include "gromacs/mdtypes/forcerec.h"
95 #include "gromacs/mdtypes/inputrec.h"
96 #include "gromacs/mdtypes/md_enums.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/basedefinitions.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/gmxomp.h"
106 #include "gromacs/utility/real.h"
107 #include "gromacs/utility/smalloc.h"
108
109 #include "calculate-spline-moduli.h"
110 #include "pme-gather.h"
111 #include "pme-grid.h"
112 #include "pme-internal.h"
113 #include "pme-redistribute.h"
114 #include "pme-solve.h"
115 #include "pme-spline-work.h"
116 #include "pme-spread.h"
117
118 /*! \brief Number of bytes in a cache line.
119  *
120  * Must also be a multiple of the SIMD and SIMD4 register size, to
121  * preserve alignment.
122  */
123 const int gmxCacheLineSize = 64;
124
125 //! Set up coordinate communication
126 static void setup_coordinate_communication(pme_atomcomm_t *atc)
127 {
128     int nslab, n, i;
129     int fw, bw;
130
131     nslab = atc->nslab;
132
133     n = 0;
134     for (i = 1; i <= nslab/2; i++)
135     {
136         fw = (atc->nodeid + i) % nslab;
137         bw = (atc->nodeid - i + nslab) % nslab;
138         if (n < nslab - 1)
139         {
140             atc->node_dest[n] = fw;
141             atc->node_src[n]  = bw;
142             n++;
143         }
144         if (n < nslab - 1)
145         {
146             atc->node_dest[n] = bw;
147             atc->node_src[n]  = fw;
148             n++;
149         }
150     }
151 }
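
/* Illustrative example for setup_coordinate_communication() above (the
 * numbers are invented for clarity, not taken from a run): with nslab = 4
 * and nodeid = 0 the loop fills
 *     node_dest = { 1, 3, 2 },   node_src = { 3, 1, 2 },
 * i.e. the nslab-1 exchange steps alternate between the forward and the
 * backward neighbour at increasing distance, so every pair of slabs
 * exchanges coordinates exactly once.
 */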
152
153 int gmx_pme_destroy(FILE *log, struct gmx_pme_t **pmedata)
154 {
155     int i;
156
157     if (NULL != log)
158     {
159         fprintf(log, "Destroying PME data structures.\n");
160     }
161
162     sfree((*pmedata)->nnx);
163     sfree((*pmedata)->nny);
164     sfree((*pmedata)->nnz);
165
166     for (i = 0; i < (*pmedata)->ngrids; ++i)
167     {
168         pmegrids_destroy(&(*pmedata)->pmegrid[i]);
169         sfree((*pmedata)->fftgrid[i]);
170         sfree((*pmedata)->cfftgrid[i]);
171         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
172     }
173
174     sfree((*pmedata)->lb_buf1);
175     sfree((*pmedata)->lb_buf2);
176
177     pme_free_all_work(&(*pmedata)->solve_work, (*pmedata)->nthread);
178
179     sfree(*pmedata);
180     *pmedata = NULL;
181
182     return 0;
183 }
184
185 /*! \brief Round \p n up to the next multiple of \p f */
186 static int mult_up(int n, int f)
187 {
188     return ((n + f - 1)/f)*f;
189 }
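
/* For example (illustrative values): mult_up(20, 7) = ((20 + 6)/7)*7 = 21,
 * while mult_up(21, 7) = 21, so values that are already a multiple of f
 * are returned unchanged.
 */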
190
191 /*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
192 static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
193 {
194     int    nma, nmi;
195     double n1, n2, n3;
196
197     nma = pme->nnodes_major;
198     nmi = pme->nnodes_minor;
199
200     n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
201     n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
202     n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
203
204     /* pme_solve is roughly double the cost of an fft */
205
206     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
207 }
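
/* Worked example with invented numbers: for a 20x20x20 grid on 16x1 PME
 * ranks, mult_up(20, 16) = 32, so n1 = n2 = n3 = 32*20*20 = 12800 and the
 * estimate is (12800 + 12800 + 3*12800)/(6*8000) = 1.33, which exceeds the
 * 1.2 threshold that triggers the load-balance note in gmx_pme_init().
 * For a grid that divides evenly over the ranks the expression takes its
 * minimum value of 5/6.
 */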
208
209 /*! \brief Initialize atom communication data structure */
210 static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
211                           int dimind, gmx_bool bSpread)
212 {
213     int thread;
214
215     atc->dimind    = dimind;
216     atc->nslab     = 1;
217     atc->nodeid    = 0;
218     atc->pd_nalloc = 0;
219 #if GMX_MPI
220     if (pme->nnodes > 1)
221     {
222         atc->mpi_comm = pme->mpi_comm_d[dimind];
223         MPI_Comm_size(atc->mpi_comm, &atc->nslab);
224         MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
225     }
226     if (debug)
227     {
228         fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
229     }
230 #endif
231
232     atc->bSpread   = bSpread;
233     atc->pme_order = pme->pme_order;
234
235     if (atc->nslab > 1)
236     {
237         snew(atc->node_dest, atc->nslab);
238         snew(atc->node_src, atc->nslab);
239         setup_coordinate_communication(atc);
240
241         snew(atc->count_thread, pme->nthread);
242         for (thread = 0; thread < pme->nthread; thread++)
243         {
244             snew(atc->count_thread[thread], atc->nslab);
245         }
246         atc->count = atc->count_thread[0];
247         snew(atc->rcount, atc->nslab);
248         snew(atc->buf_index, atc->nslab);
249     }
250
251     atc->nthread = pme->nthread;
252     if (atc->nthread > 1)
253     {
254         snew(atc->thread_plist, atc->nthread);
255     }
256     snew(atc->spline, atc->nthread);
257     for (thread = 0; thread < atc->nthread; thread++)
258     {
259         if (atc->nthread > 1)
260         {
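            /* Pad the per-thread count array with a cache line on each
             * side, so counters written by different threads do not end
             * up on the same cache line (avoids false sharing).
             */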
261             snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
262             atc->thread_plist[thread].n += gmxCacheLineSize;
263         }
264         snew(atc->spline[thread].thread_one, pme->nthread);
265         atc->spline[thread].thread_one[thread] = 1;
266     }
267 }
268
269 /*! \brief Initialize data structure for communication */
270 static void
271 init_overlap_comm(pme_overlap_t *  ol,
272                   int              norder,
273 #if GMX_MPI
274                   MPI_Comm         comm,
275 #endif
276                   int              nnodes,
277                   int              nodeid,
278                   int              ndata,
279                   int              commplainsize)
280 {
281     int              b, i;
282     pme_grid_comm_t *pgc;
283     gmx_bool         bCont;
284     int              fft_start, fft_end, send_index1, recv_index1;
285 #if GMX_MPI
286     MPI_Status       stat;
287
288     ol->mpi_comm = comm;
289 #endif
290
291     ol->nnodes = nnodes;
292     ol->nodeid = nodeid;
293
294     /* Linear translation of the PME grid won't affect reciprocal space
295      * calculations, so to optimize we only interpolate "upwards",
296      * which also means we only have to consider overlap in one direction.
297      * I.e., particles on this node might also be spread to grid indices
298      * that belong to higher nodes (modulo nnodes)
299      */
300
301     snew(ol->s2g0, ol->nnodes+1);
302     snew(ol->s2g1, ol->nnodes);
303     if (debug)
304     {
305         fprintf(debug, "PME slab boundaries:");
306     }
307     for (i = 0; i < nnodes; i++)
308     {
309         /* s2g0 is the local interpolation grid start.
310          * s2g1 is the local interpolation grid end.
311          * Since in calc_pidx we divide particles, and not grid lines,
312          * spatially uniformly along dimension x or y, we need to round
313          * s2g0 down and s2g1 up.
314          */
315         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
316         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
317
318         if (debug)
319         {
320             fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
321         }
322     }
323     ol->s2g0[nnodes] = ndata;
324     if (debug)
325     {
326         fprintf(debug, "\n");
327     }
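
    /* Illustrative example (invented numbers): with nnodes = 3, ndata = 20
     * and norder = 4, the formulas above give
     *     s2g0 = { 0, 6, 13, 20 }   and   s2g1 = { 10, 17, 23 },
     * so slab 0 owns grid lines [0,6) but spreads onto [0,10), and slab 2
     * spreads onto [13,23), i.e. three lines past the grid end, which wrap
     * to indices 0..2 modulo ndata. With these numbers the loop below
     * terminates with noverlap_nodes = 1, so each rank only communicates
     * its overlap with a single neighbour.
     */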
328
329     /* Determine how many nodes we need to communicate the grid overlap with */
330     b = 0;
331     do
332     {
333         b++;
334         bCont = FALSE;
335         for (i = 0; i < nnodes; i++)
336         {
337             if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
338                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
339             {
340                 bCont = TRUE;
341             }
342         }
343     }
344     while (bCont && b < nnodes);
345     ol->noverlap_nodes = b - 1;
346
347     snew(ol->send_id, ol->noverlap_nodes);
348     snew(ol->recv_id, ol->noverlap_nodes);
349     for (b = 0; b < ol->noverlap_nodes; b++)
350     {
351         ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
352         ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
353     }
354     snew(ol->comm_data, ol->noverlap_nodes);
355
356     ol->send_size = 0;
357     for (b = 0; b < ol->noverlap_nodes; b++)
358     {
359         pgc = &ol->comm_data[b];
360         /* Send */
361         fft_start        = ol->s2g0[ol->send_id[b]];
362         fft_end          = ol->s2g0[ol->send_id[b]+1];
363         if (ol->send_id[b] < nodeid)
364         {
365             fft_start += ndata;
366             fft_end   += ndata;
367         }
368         send_index1       = ol->s2g1[nodeid];
369         send_index1       = std::min(send_index1, fft_end);
370         pgc->send_index0  = fft_start;
371         pgc->send_nindex  = std::max(0, send_index1 - pgc->send_index0);
372         ol->send_size    += pgc->send_nindex;
373
374         /* We always start receiving at the first index of our slab */
375         fft_start        = ol->s2g0[ol->nodeid];
376         fft_end          = ol->s2g0[ol->nodeid+1];
377         recv_index1      = ol->s2g1[ol->recv_id[b]];
378         if (ol->recv_id[b] > nodeid)
379         {
380             recv_index1 -= ndata;
381         }
382         recv_index1      = std::min(recv_index1, fft_end);
383         pgc->recv_index0 = fft_start;
384         pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
385     }
386
387 #if GMX_MPI
388     /* Communicate the buffer sizes to receive */
389     for (b = 0; b < ol->noverlap_nodes; b++)
390     {
391         MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
392                      &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
393                      ol->mpi_comm, &stat);
394     }
395 #endif
396
397     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
398     snew(ol->sendbuf, norder*commplainsize);
399     snew(ol->recvbuf, norder*commplainsize);
400 }
401
402 void gmx_pme_check_restrictions(int pme_order,
403                                 int nkx, int nky, int nkz,
404                                 int nnodes_major,
405                                 int nnodes_minor,
406                                 gmx_bool bUseThreads,
407                                 gmx_bool bFatal,
408                                 gmx_bool *bValidSettings)
409 {
410     if (pme_order > PME_ORDER_MAX)
411     {
412         if (!bFatal)
413         {
414             *bValidSettings = FALSE;
415             return;
416         }
417         gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
418                   pme_order, PME_ORDER_MAX);
419     }
420
421     if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
422         nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
423         nkz <= pme_order)
424     {
425         if (!bFatal)
426         {
427             *bValidSettings = FALSE;
428             return;
429         }
430         gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and, for dimensions with domain decomposition, larger than 2*pme_order",
431                   pme_order);
432     }
433
434     /* Check for a limitation of the (current) sum_fftgrid_dd code.
435      * We only allow multiple communication pulses in dim 1, not in dim 0.
436      */
437     if (bUseThreads && (nkx < nnodes_major*pme_order &&
438                         nkx != nnodes_major*(pme_order - 1)))
439     {
440         if (!bFatal)
441         {
442             *bValidSettings = FALSE;
443             return;
444         }
445         gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
446                   nkx/(double)nnodes_major, pme_order);
447     }
448
449     if (bValidSettings != NULL)
450     {
451         *bValidSettings = TRUE;
452     }
453
454     return;
455 }
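
/* As an illustration of these checks (invented values, not requirements
 * beyond what the code above enforces): with pme_order = 4,
 * nnodes_major = 4 and nnodes_minor = 1, the grid must satisfy nkx > 8,
 * nky > 4 and nkz > 4; when OpenMP threads are in use, nkx must in
 * addition be at least nnodes_major*pme_order = 16, or be exactly
 * nnodes_major*(pme_order-1) = 12.
 */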
456
457 /*! \brief Return \p enumerator divided by \p denominator, rounded up to the nearest integer */
458 static int div_round_up(int enumerator, int denominator)
459 {
460     return (enumerator + denominator - 1)/denominator;
461 }
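
/* For example, div_round_up(20, 7) = (20 + 6)/7 = 3 and
 * div_round_up(21, 7) = 3, i.e. the integer quotient rounded up.
 */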
462
463 int gmx_pme_init(struct gmx_pme_t **pmedata,
464                  t_commrec *        cr,
465                  int                nnodes_major,
466                  int                nnodes_minor,
467                  t_inputrec *       ir,
468                  int                homenr,
469                  gmx_bool           bFreeEnergy_q,
470                  gmx_bool           bFreeEnergy_lj,
471                  gmx_bool           bReproducible,
472                  int                nthread)
473 {
474     struct gmx_pme_t *pme = NULL;
475
476     int               use_threads, sum_use_threads, i;
477     ivec              ndata;
478
479     if (debug)
480     {
481         fprintf(debug, "Creating PME data structures.\n");
482     }
483     snew(pme, 1);
484
485     pme->sum_qgrid_tmp       = NULL;
486     pme->sum_qgrid_dd_tmp    = NULL;
487     pme->buf_nalloc          = 0;
488
489     pme->nnodes              = 1;
490     pme->bPPnode             = TRUE;
491
492     pme->nnodes_major        = nnodes_major;
493     pme->nnodes_minor        = nnodes_minor;
494
495 #if GMX_MPI
496     if (nnodes_major*nnodes_minor > 1)
497     {
498         pme->mpi_comm = cr->mpi_comm_mygroup;
499
500         MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
501         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
502         if (pme->nnodes != nnodes_major*nnodes_minor)
503         {
504             gmx_incons("PME rank count mismatch");
505         }
506     }
507     else
508     {
509         pme->mpi_comm = MPI_COMM_NULL;
510     }
511 #endif
512
513     if (pme->nnodes == 1)
514     {
515 #if GMX_MPI
516         pme->mpi_comm_d[0] = MPI_COMM_NULL;
517         pme->mpi_comm_d[1] = MPI_COMM_NULL;
518 #endif
519         pme->ndecompdim   = 0;
520         pme->nodeid_major = 0;
521         pme->nodeid_minor = 0;
525     }
526     else
527     {
528         if (nnodes_minor == 1)
529         {
530 #if GMX_MPI
531             pme->mpi_comm_d[0] = pme->mpi_comm;
532             pme->mpi_comm_d[1] = MPI_COMM_NULL;
533 #endif
534             pme->ndecompdim   = 1;
535             pme->nodeid_major = pme->nodeid;
536             pme->nodeid_minor = 0;
537
538         }
539         else if (nnodes_major == 1)
540         {
541 #if GMX_MPI
542             pme->mpi_comm_d[0] = MPI_COMM_NULL;
543             pme->mpi_comm_d[1] = pme->mpi_comm;
544 #endif
545             pme->ndecompdim   = 1;
546             pme->nodeid_major = 0;
547             pme->nodeid_minor = pme->nodeid;
548         }
549         else
550         {
551             if (pme->nnodes % nnodes_major != 0)
552             {
553                 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
554             }
555             pme->ndecompdim = 2;
556
557 #if GMX_MPI
558             MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
559                            pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
560             MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
561                            pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
562
563             MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
564             MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
565             MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
566             MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
567 #endif
568         }
569         pme->bPPnode = (cr->duty & DUTY_PP);
570     }
571
572     pme->nthread = nthread;
573
574     /* Check if any of the PME MPI ranks uses threads */
575     use_threads = (pme->nthread > 1 ? 1 : 0);
576 #if GMX_MPI
577     if (pme->nnodes > 1)
578     {
579         MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
580                       MPI_SUM, pme->mpi_comm);
581     }
582     else
583 #endif
584     {
585         sum_use_threads = use_threads;
586     }
587     pme->bUseThreads = (sum_use_threads > 0);
588
589     if (ir->ePBC == epbcSCREW)
590     {
591         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
592     }
593
594     pme->bFEP_q      = ((ir->efep != efepNO) && bFreeEnergy_q);
595     pme->bFEP_lj     = ((ir->efep != efepNO) && bFreeEnergy_lj);
596     pme->bFEP        = (pme->bFEP_q || pme->bFEP_lj);
597     pme->nkx         = ir->nkx;
598     pme->nky         = ir->nky;
599     pme->nkz         = ir->nkz;
600     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
601     pme->pme_order   = ir->pme_order;
602
603     /* Always constant electrostatics coefficients */
604     pme->epsilon_r   = ir->epsilon_r;
605
606     /* Always constant LJ coefficients */
607     pme->ljpme_combination_rule = ir->ljpme_combination_rule;
608
609     /* If we violate restrictions, generate a fatal error here */
610     gmx_pme_check_restrictions(pme->pme_order,
611                                pme->nkx, pme->nky, pme->nkz,
612                                pme->nnodes_major,
613                                pme->nnodes_minor,
614                                pme->bUseThreads,
615                                TRUE,
616                                NULL);
617
618     if (pme->nnodes > 1)
619     {
620         double imbal;
621
622 #if GMX_MPI
623         MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
624         MPI_Type_commit(&(pme->rvec_mpi));
625 #endif
626
627         /* Note that the coefficient spreading and force gathering, which usually
628          * takes about the same amount of time as FFT+solve_pme,
629          * is always fully load balanced
630          * (unless the coefficient distribution is inhomogeneous).
631          */
632
633         imbal = estimate_pme_load_imbalance(pme);
634         if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
635         {
636             fprintf(stderr,
637                     "\n"
638                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
639                     "      For optimal PME load balancing\n"
640                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
641                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
642                     "\n",
643                     (int)((imbal-1)*100 + 0.5),
644                     pme->nkx, pme->nky, pme->nnodes_major,
645                     pme->nky, pme->nkz, pme->nnodes_minor);
646         }
647     }
648
649     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
650     /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
651      * y is always copied through a buffer: we don't need padding in z,
652      * but we do need the overlap in x because of the communication order.
653      */
654     init_overlap_comm(&pme->overlap[0], pme->pme_order,
655 #if GMX_MPI
656                       pme->mpi_comm_d[0],
657 #endif
658                       pme->nnodes_major, pme->nodeid_major,
659                       pme->nkx,
660                       (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
661
662     /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
663      * We do this with an offset buffer of equal size, so we need to allocate
664      * extra for the offset. That's what the (+1)*pme->nkz is for.
665      */
666     init_overlap_comm(&pme->overlap[1], pme->pme_order,
667 #if GMX_MPI
668                       pme->mpi_comm_d[1],
669 #endif
670                       pme->nnodes_minor, pme->nodeid_minor,
671                       pme->nky,
672                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
673
674     /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
675      * Note that gmx_pme_check_restrictions checked for this already.
676      */
677     if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
678     {
679         gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
680     }
681
682     snew(pme->bsp_mod[XX], pme->nkx);
683     snew(pme->bsp_mod[YY], pme->nky);
684     snew(pme->bsp_mod[ZZ], pme->nkz);
685
686     /* The required size of the interpolation grid, including overlap.
687      * The allocated size (pmegrid_n?) might be slightly larger.
688      */
689     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
690         pme->overlap[0].s2g0[pme->nodeid_major];
691     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
692         pme->overlap[1].s2g0[pme->nodeid_minor];
693     pme->pmegrid_nz_base = pme->nkz;
694     pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
695     set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
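    /* As a sketch with the same invented numbers used in the
     * init_overlap_comm() example (nkx = 20, 3 ranks along the major
     * dimension, pme_order = 4): rank 0 gets pmegrid_nx = 10 - 0 = 10
     * local grid lines along x, and pmegrid_nz = nkz + 3 before
     * set_grid_alignment() possibly rounds it up further.
     */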
696
697     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
698     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
699     pme->pmegrid_start_iz = 0;
700
701     make_gridindex5_to_localindex(pme->nkx,
702                                   pme->pmegrid_start_ix,
703                                   pme->pmegrid_nx - (pme->pme_order-1),
704                                   &pme->nnx, &pme->fshx);
705     make_gridindex5_to_localindex(pme->nky,
706                                   pme->pmegrid_start_iy,
707                                   pme->pmegrid_ny - (pme->pme_order-1),
708                                   &pme->nny, &pme->fshy);
709     make_gridindex5_to_localindex(pme->nkz,
710                                   pme->pmegrid_start_iz,
711                                   pme->pmegrid_nz_base,
712                                   &pme->nnz, &pme->fshz);
713
714     pme->spline_work = make_pme_spline_work(pme->pme_order);
715
716     ndata[0]    = pme->nkx;
717     ndata[1]    = pme->nky;
718     ndata[2]    = pme->nkz;
719     /* It doesn't matter if we allocate too many grids here,
720      * we only allocate and use the ones we need.
721      */
722     if (EVDW_PME(ir->vdwtype))
723     {
724         pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
725     }
726     else
727     {
728         pme->ngrids = DO_Q;
729     }
730     snew(pme->fftgrid, pme->ngrids);
731     snew(pme->cfftgrid, pme->ngrids);
732     snew(pme->pfft_setup, pme->ngrids);
733
734     for (i = 0; i < pme->ngrids; ++i)
735     {
736         if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
737                                                        bFreeEnergy_q)) ||
738             (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
739                                                     bFreeEnergy_lj ||
740                                                     ir->ljpme_combination_rule == eljpmeLB)))
741         {
742             pmegrids_init(&pme->pmegrid[i],
743                           pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
744                           pme->pmegrid_nz_base,
745                           pme->pme_order,
746                           pme->bUseThreads,
747                           pme->nthread,
748                           pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
749                           pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
750             /* This routine will allocate the grid data to fit the FFTs */
751             gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
752                                     &pme->fftgrid[i], &pme->cfftgrid[i],
753                                     pme->mpi_comm_d,
754                                     bReproducible, pme->nthread);
755
756         }
757     }
758
759     if (!pme->bP3M)
760     {
761         /* Use plain SPME B-spline interpolation */
762         make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
763     }
764     else
765     {
766         /* Use the P3M grid-optimized influence function */
767         make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
768     }
769
770     /* Use atc[0] for spreading */
771     init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
772     if (pme->ndecompdim >= 2)
773     {
774         init_atomcomm(pme, &pme->atc[1], 1, FALSE);
775     }
776
777     if (pme->nnodes == 1)
778     {
779         pme->atc[0].n = homenr;
780         pme_realloc_atomcomm_things(&pme->atc[0]);
781     }
782
783     pme->lb_buf1       = NULL;
784     pme->lb_buf2       = NULL;
785     pme->lb_buf_nalloc = 0;
786
787     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
788
789     *pmedata = pme;
790
791     return 0;
792 }
793
794 int gmx_pme_reinit(struct gmx_pme_t **pmedata,
795                    t_commrec *        cr,
796                    struct gmx_pme_t * pme_src,
797                    const t_inputrec * ir,
798                    ivec               grid_size)
799 {
800     t_inputrec irc;
801     int        homenr;
802     int        ret;
803
804     irc     = *ir;
805     irc.nkx = grid_size[XX];
806     irc.nky = grid_size[YY];
807     irc.nkz = grid_size[ZZ];
808
809     if (pme_src->nnodes == 1)
810     {
811         homenr = pme_src->atc[0].n;
812     }
813     else
814     {
815         homenr = -1;
816     }
817
818     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
819                        &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
820
821     if (ret == 0)
822     {
823         /* We can easily reuse the allocated pme grids in pme_src */
824         reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
825         /* We would like to reuse the fft grids, but that's harder */
826     }
827
828     return ret;
829 }
830
831 void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
832 {
833     pme_atomcomm_t *atc;
834     pmegrids_t     *grid;
835
836     if (pme->nnodes > 1)
837     {
838         gmx_incons("gmx_pme_calc_energy called in parallel");
839     }
840     if (pme->bFEP_q)
841     {
842         gmx_incons("gmx_pme_calc_energy with free energy");
843     }
844
845     atc            = &pme->atc_energy;
846     atc->nthread   = 1;
847     if (atc->spline == NULL)
848     {
849         snew(atc->spline, atc->nthread);
850     }
851     atc->nslab     = 1;
852     atc->bSpread   = TRUE;
853     atc->pme_order = pme->pme_order;
854     atc->n         = n;
855     pme_realloc_atomcomm_things(atc);
856     atc->x           = x;
857     atc->coefficient = q;
858
859     /* We only use the A-charges grid */
860     grid = &pme->pmegrid[PME_GRID_QA];
861
862     /* Only calculate the spline coefficients, don't actually spread */
863     spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
864
865     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
866 }
867
868 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
869 static void
870 calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
871 {
872     int  i;
873     for (i = 0; i < pme->atc[0].n; ++i)
874     {
875         real sigma4;
876         sigma4                     = local_sigma[i];
877         sigma4                     = sigma4*sigma4;
878         sigma4                     = sigma4*sigma4;
879         pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
880     }
881 }
882
883 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
884 static void
885 calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
886 {
887     int  i;
888
889     for (i = 0; i < pme->atc[0].n; ++i)
890     {
891         pme->atc[0].coefficient[i] *= local_sigma[i];
892     }
893 }
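
/* Taken together (a sketch of the coefficient bookkeeping, see the
 * Lorentz-Berthelot loop in gmx_pme_do() below): calc_initial_lb_coeffs()
 * sets the coefficient of atom i to c6_i/sigma_i^4 and each call to
 * calc_next_lb_coeffs() multiplies it by sigma_i, so the k-th of the seven
 * LJ grids (k = grid_index - 2 = 0..6) is spread with c6_i*sigma_i^(k-3).
 * This is consistent with a term-by-term expansion of (sigma_i+sigma_j)^6;
 * the per-term weights enter through lb_scale_factor[grid_index-2] when
 * the forces are gathered.
 */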
894
895 int gmx_pme_do(struct gmx_pme_t *pme,
896                int start,       int homenr,
897                rvec x[],        rvec f[],
898                real chargeA[],  real chargeB[],
899                real c6A[],      real c6B[],
900                real sigmaA[],   real sigmaB[],
901                matrix box,      t_commrec *cr,
902                int  maxshift_x, int maxshift_y,
903                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
904                matrix vir_q,    real ewaldcoeff_q,
905                matrix vir_lj,   real ewaldcoeff_lj,
906                real *energy_q,  real *energy_lj,
907                real lambda_q,   real lambda_lj,
908                real *dvdlambda_q, real *dvdlambda_lj,
909                int flags)
910 {
911     int                  d, i, j, npme, grid_index, max_grid_index;
912     int                  n_d;
913     pme_atomcomm_t      *atc        = NULL;
914     pmegrids_t          *pmegrid    = NULL;
915     real                *grid       = NULL;
916     rvec                *f_d;
917     real                *coefficient = NULL;
918     real                 energy_AB[4];
919     matrix               vir_AB[4];
920     real                 scale, lambda;
921     gmx_bool             bClearF;
922     gmx_parallel_3dfft_t pfft_setup;
923     real              *  fftgrid;
924     t_complex          * cfftgrid;
925     int                  thread;
926     gmx_bool             bFirst, bDoSplines;
927     int                  fep_state;
928     int                  fep_states_lj           = pme->bFEP_lj ? 2 : 1;
929     const gmx_bool       bCalcEnerVir            = flags & GMX_PME_CALC_ENER_VIR;
930     const gmx_bool       bBackFFT                = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
931     const gmx_bool       bCalcF                  = flags & GMX_PME_CALC_F;
932
933     assert(pme->nnodes > 0);
934     assert(pme->nnodes == 1 || pme->ndecompdim > 0);
935
936     if (pme->nnodes > 1)
937     {
938         atc      = &pme->atc[0];
939         atc->npd = homenr;
940         if (atc->npd > atc->pd_nalloc)
941         {
942             atc->pd_nalloc = over_alloc_dd(atc->npd);
943             srenew(atc->pd, atc->pd_nalloc);
944         }
945         for (d = pme->ndecompdim-1; d >= 0; d--)
946         {
947             atc           = &pme->atc[d];
948             atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
949         }
950     }
951     else
952     {
953         atc = &pme->atc[0];
954         /* This could be necessary for TPI */
955         pme->atc[0].n = homenr;
956         if (DOMAINDECOMP(cr))
957         {
958             pme_realloc_atomcomm_things(atc);
959         }
960         atc->x = x;
961         atc->f = f;
962     }
963
964     gmx::invertBoxMatrix(box, pme->recipbox);
965     bFirst = TRUE;
966
967     /* For simplicity, we construct the splines for all particles if
968      * more than one PME calculation is needed. Some optimization
969      * could be done by keeping track of which atoms have splines
970      * constructed, and construct new splines on each pass for atoms
971      * that don't yet have them.
972      */
973
974     bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
975
976     /* We need a maximum of four separate PME calculations:
977      * grid_index=0: Coulomb PME with charges from state A
978      * grid_index=1: Coulomb PME with charges from state B
979      * grid_index=2: LJ PME with C6 from state A
980      * grid_index=3: LJ PME with C6 from state B
981      * For Lorentz-Berthelot combination rules, a separate loop is used to
982      * calculate all the terms
983      */
984
985     /* If we are doing LJ-PME with LB, we only do Q here */
986     max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
987
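    /* For concreteness, the conditions below select: grid_index 0 only for
     * a plain Coulomb run; grid_index 0 and 1 when the charges are
     * perturbed; grid_index 2 (and 3 with perturbed LJ parameters) added
     * for geometric-rule LJ-PME; with Lorentz-Berthelot rules the LJ part
     * skips this loop entirely and is handled in the seven-term loop
     * further down.
     */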
988     for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
989     {
990         /* Check if we should do calculations at this grid_index
991          * If grid_index is odd we should be doing FEP
992          * If grid_index < 2 we should be doing electrostatic PME
993          * If grid_index >= 2 we should be doing LJ-PME
994          */
995         if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
996                                     (grid_index == 1 && !pme->bFEP_q))) ||
997             (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
998                                     (grid_index == 3 && !pme->bFEP_lj))))
999         {
1000             continue;
1001         }
1002         /* Unpack structure */
1003         pmegrid    = &pme->pmegrid[grid_index];
1004         fftgrid    = pme->fftgrid[grid_index];
1005         cfftgrid   = pme->cfftgrid[grid_index];
1006         pfft_setup = pme->pfft_setup[grid_index];
1007         switch (grid_index)
1008         {
1009             case 0: coefficient = chargeA + start; break;
1010             case 1: coefficient = chargeB + start; break;
1011             case 2: coefficient = c6A + start; break;
1012             case 3: coefficient = c6B + start; break;
1013         }
1014
1015         grid = pmegrid->grid.grid;
1016
1017         if (debug)
1018         {
1019             fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
1020                     cr->nnodes, cr->nodeid);
1021             fprintf(debug, "Grid = %p\n", (void*)grid);
1022             if (grid == NULL)
1023             {
1024                 gmx_fatal(FARGS, "No grid!");
1025             }
1026         }
1027         where();
1028
1029         if (pme->nnodes == 1)
1030         {
1031             atc->coefficient = coefficient;
1032         }
1033         else
1034         {
1035             wallcycle_start(wcycle, ewcPME_REDISTXF);
1036             do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
1037             where();
1038
1039             wallcycle_stop(wcycle, ewcPME_REDISTXF);
1040         }
1041
1042         if (debug)
1043         {
1044             fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
1045                     cr->nodeid, atc->n);
1046         }
1047
1048         if (flags & GMX_PME_SPREAD)
1049         {
1050             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1051
1052             /* Spread the coefficients on a grid */
1053             spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1054
1055             if (bFirst)
1056             {
1057                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1058             }
1059             inc_nrnb(nrnb, eNR_SPREADBSP,
1060                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1061
1062             if (!pme->bUseThreads)
1063             {
1064                 wrap_periodic_pmegrid(pme, grid);
1065
1066                 /* sum contributions to local grid from other nodes */
1067 #if GMX_MPI
1068                 if (pme->nnodes > 1)
1069                 {
1070                     gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1071                     where();
1072                 }
1073 #endif
1074
1075                 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1076             }
1077
1078             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1079
1080             /* TODO If the OpenMP and single-threaded implementations
1081                converge, then spread_on_grid() and
1082                copy_pmegrid_to_fftgrid() will perhaps live in the same
1083                source file and the following debugging function can live
1084                there too. */
1085             /*
1086                dump_local_fftgrid(pme,fftgrid);
1087                exit(0);
1088              */
1089         }
1090
1091         /* Here we start a large thread parallel region */
1092 #pragma omp parallel num_threads(pme->nthread) private(thread)
1093         {
1094             try
1095             {
1096                 thread = gmx_omp_get_thread_num();
1097                 if (flags & GMX_PME_SOLVE)
1098                 {
1099                     int loop_count;
1100
1101                     /* do 3d-fft */
1102                     if (thread == 0)
1103                     {
1104                         wallcycle_start(wcycle, ewcPME_FFT);
1105                     }
1106                     gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1107                                                thread, wcycle);
1108                     if (thread == 0)
1109                     {
1110                         wallcycle_stop(wcycle, ewcPME_FFT);
1111                     }
1112                     where();
1113
1114                     /* solve in k-space for our local cells */
1115                     if (thread == 0)
1116                     {
1117                         wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1118                     }
1119                     if (grid_index < DO_Q)
1120                     {
1121                         loop_count =
1122                             solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
1123                                           box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1124                                           bCalcEnerVir,
1125                                           pme->nthread, thread);
1126                     }
1127                     else
1128                     {
1129                         loop_count =
1130                             solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
1131                                              box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1132                                              bCalcEnerVir,
1133                                              pme->nthread, thread);
1134                     }
1135
1136                     if (thread == 0)
1137                     {
1138                         wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1139                         where();
1140                         inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1141                     }
1142                 }
1143
1144                 if (bBackFFT)
1145                 {
1146                     /* do 3d-invfft */
1147                     if (thread == 0)
1148                     {
1149                         where();
1150                         wallcycle_start(wcycle, ewcPME_FFT);
1151                     }
1152                     gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1153                                                thread, wcycle);
1154                     if (thread == 0)
1155                     {
1156                         wallcycle_stop(wcycle, ewcPME_FFT);
1157
1158                         where();
1159
1160                         if (pme->nodeid == 0)
1161                         {
1162                             real ntot = pme->nkx*pme->nky*pme->nkz;
1163                             npme  = static_cast<int>(ntot*log(ntot)/log(2.0));
1164                             inc_nrnb(nrnb, eNR_FFT, 2*npme);
1165                         }
1166
1167                         /* Note: this wallcycle region is closed below
1168                            outside an OpenMP region, so take care if
1169                            refactoring code here. */
1170                         wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1171                     }
1172
1173                     copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1174                 }
1175             } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1176         }
1177         /* End of thread parallel section.
1178          * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
1179          */
1180
1181         if (bBackFFT)
1182         {
1183             /* distribute local grid to all nodes */
1184 #if GMX_MPI
1185             if (pme->nnodes > 1)
1186             {
1187                 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1188             }
1189 #endif
1190             where();
1191
1192             unwrap_periodic_pmegrid(pme, grid);
1193         }
1194
1195         if (bCalcF)
1196         {
1197             /* interpolate forces for our local atoms */
1198
1199             where();
1200
1201             /* If we are running without parallelization,
1202              * atc->f is the actual force array, not a buffer,
1203              * therefore we should not clear it.
1204              */
1205             lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
1206             bClearF = (bFirst && PAR(cr));
1207 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1208             for (thread = 0; thread < pme->nthread; thread++)
1209             {
1210                 try
1211                 {
1212                     gather_f_bsplines(pme, grid, bClearF, atc,
1213                                       &atc->spline[thread],
1214                                       pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
1215                 }
1216                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1217             }
1218
1219             where();
1220
1221             inc_nrnb(nrnb, eNR_GATHERFBSP,
1222                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1223             /* Note: this wallcycle region is opened above inside an OpenMP
1224                region, so take care if refactoring code here. */
1225             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1226         }
1227
1228         if (bCalcEnerVir)
1229         {
1230             /* This should only be called on the master thread
1231              * and after the threads have synchronized.
1232              */
1233             if (grid_index < 2)
1234             {
1235                 get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1236             }
1237             else
1238             {
1239                 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1240             }
1241         }
1242         bFirst = FALSE;
1243     } /* of grid_index-loop */
1244
1245     /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
1246      * seven terms. */
1247
1248     if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
1249     {
1250         /* Loop over A- and B-state if we are doing FEP */
1251         for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
1252         {
1253             real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
1254             if (pme->nnodes == 1)
1255             {
1256                 if (pme->lb_buf1 == NULL)
1257                 {
1258                     pme->lb_buf_nalloc = pme->atc[0].n;
1259                     snew(pme->lb_buf1, pme->lb_buf_nalloc);
1260                 }
1261                 pme->atc[0].coefficient = pme->lb_buf1;
1262                 switch (fep_state)
1263                 {
1264                     case 0:
1265                         local_c6      = c6A;
1266                         local_sigma   = sigmaA;
1267                         break;
1268                     case 1:
1269                         local_c6      = c6B;
1270                         local_sigma   = sigmaB;
1271                         break;
1272                     default:
1273                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1274                 }
1275             }
1276             else
1277             {
1278                 atc = &pme->atc[0];
1279                 switch (fep_state)
1280                 {
1281                     case 0:
1282                         RedistC6      = c6A;
1283                         RedistSigma   = sigmaA;
1284                         break;
1285                     case 1:
1286                         RedistC6      = c6B;
1287                         RedistSigma   = sigmaB;
1288                         break;
1289                     default:
1290                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1291                 }
1292                 wallcycle_start(wcycle, ewcPME_REDISTXF);
1293
1294                 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
1295                 if (pme->lb_buf_nalloc < atc->n)
1296                 {
1297                     pme->lb_buf_nalloc = atc->nalloc;
1298                     srenew(pme->lb_buf1, pme->lb_buf_nalloc);
1299                     srenew(pme->lb_buf2, pme->lb_buf_nalloc);
1300                 }
1301                 local_c6 = pme->lb_buf1;
1302                 for (i = 0; i < atc->n; ++i)
1303                 {
1304                     local_c6[i] = atc->coefficient[i];
1305                 }
1306                 where();
1307
1308                 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
1309                 local_sigma = pme->lb_buf2;
1310                 for (i = 0; i < atc->n; ++i)
1311                 {
1312                     local_sigma[i] = atc->coefficient[i];
1313                 }
1314                 where();
1315
1316                 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1317             }
1318             calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1319
1320             /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
1321             for (grid_index = 2; grid_index < 9; ++grid_index)
1322             {
1323                 /* Unpack structure */
1324                 pmegrid    = &pme->pmegrid[grid_index];
1325                 fftgrid    = pme->fftgrid[grid_index];
1326                 pfft_setup = pme->pfft_setup[grid_index];
1327                 calc_next_lb_coeffs(pme, local_sigma);
1328                 grid = pmegrid->grid.grid;
1329                 where();
1330
1331                 if (flags & GMX_PME_SPREAD)
1332                 {
1333                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1334                     /* Spread the c6 on a grid */
1335                     spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1336
1337                     if (bFirst)
1338                     {
1339                         inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1340                     }
1341
1342                     inc_nrnb(nrnb, eNR_SPREADBSP,
1343                              pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1344                     if (pme->nthread == 1)
1345                     {
1346                         wrap_periodic_pmegrid(pme, grid);
1347                         /* sum contributions to local grid from other nodes */
1348 #if GMX_MPI
1349                         if (pme->nnodes > 1)
1350                         {
1351                             gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1352                             where();
1353                         }
1354 #endif
1355                         copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1356                     }
1357                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1358                 }
1359                 /* Here we start a large thread parallel region */
1360 #pragma omp parallel num_threads(pme->nthread) private(thread)
1361                 {
1362                     try
1363                     {
1364                         thread = gmx_omp_get_thread_num();
1365                         if (flags & GMX_PME_SOLVE)
1366                         {
1367                             /* do 3d-fft */
1368                             if (thread == 0)
1369                             {
1370                                 wallcycle_start(wcycle, ewcPME_FFT);
1371                             }
1372
1373                             gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1374                                                        thread, wcycle);
1375                             if (thread == 0)
1376                             {
1377                                 wallcycle_stop(wcycle, ewcPME_FFT);
1378                             }
1379                             where();
1380                         }
1381                     }
1382                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1383                 }
1384                 bFirst = FALSE;
1385             }
1386             if (flags & GMX_PME_SOLVE)
1387             {
1388                 /* solve in k-space for our local cells */
1389 #pragma omp parallel num_threads(pme->nthread) private(thread)
1390                 {
1391                     try
1392                     {
1393                         int loop_count;
1394                         thread = gmx_omp_get_thread_num();
1395                         if (thread == 0)
1396                         {
1397                             wallcycle_start(wcycle, ewcLJPME);
1398                         }
1399
1400                         loop_count =
1401                             solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
1402                                              box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1403                                              bCalcEnerVir,
1404                                              pme->nthread, thread);
1405                         if (thread == 0)
1406                         {
1407                             wallcycle_stop(wcycle, ewcLJPME);
1408                             where();
1409                             inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1410                         }
1411                     }
1412                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1413                 }
1414             }
1415
1416             if (bCalcEnerVir)
1417             {
1418                 /* This should only be called on the master thread and
1419                  * after the threads have synchronized.
1420                  */
1421                 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
1422             }
1423
1424             if (bBackFFT)
1425             {
1426                 bFirst = !(flags & GMX_PME_DO_COULOMB);
1427                 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1428                 for (grid_index = 8; grid_index >= 2; --grid_index)
1429                 {
1430                     /* Unpack structure */
1431                     pmegrid    = &pme->pmegrid[grid_index];
1432                     fftgrid    = pme->fftgrid[grid_index];
1433                     pfft_setup = pme->pfft_setup[grid_index];
1434                     grid       = pmegrid->grid.grid;
1435                     calc_next_lb_coeffs(pme, local_sigma);
1436                     where();
1437 #pragma omp parallel num_threads(pme->nthread) private(thread)
1438                     {
1439                         try
1440                         {
1441                             thread = gmx_omp_get_thread_num();
1442                             /* do 3d-invfft */
1443                             if (thread == 0)
1444                             {
1445                                 where();
1446                                 wallcycle_start(wcycle, ewcPME_FFT);
1447                             }
1448
1449                             gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1450                                                        thread, wcycle);
1451                             if (thread == 0)
1452                             {
1453                                 wallcycle_stop(wcycle, ewcPME_FFT);
1454
1455                                 where();
1456
1457                                 if (pme->nodeid == 0)
1458                                 {
1459                                     real ntot = pme->nkx*pme->nky*pme->nkz;
1460                                     npme  = static_cast<int>(ntot*log(ntot)/log(2.0));
1461                                     inc_nrnb(nrnb, eNR_FFT, 2*npme);
1462                                 }
1463                                 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1464                             }
1465
1466                             copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1467                         }
1468                         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1469                     } /*#pragma omp parallel*/
1470
1471                     /* distribute local grid to all nodes */
1472 #if GMX_MPI
1473                     if (pme->nnodes > 1)
1474                     {
1475                         gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1476                     }
1477 #endif
1478                     where();
1479
1480                     unwrap_periodic_pmegrid(pme, grid);
1481
1482                     if (bCalcF)
1483                     {
1484                         /* interpolate forces for our local atoms */
1485                         where();
1486                         bClearF = (bFirst && PAR(cr));
1487                         scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
1488                         scale  *= lb_scale_factor[grid_index-2];
1489
1490 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1491                         for (thread = 0; thread < pme->nthread; thread++)
1492                         {
1493                             try
1494                             {
1495                                 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
1496                                                   &pme->atc[0].spline[thread],
1497                                                   scale);
1498                             }
1499                             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1500                         }
1501
1502                         where();
1503
1504                         inc_nrnb(nrnb, eNR_GATHERFBSP,
1505                                  pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1506                     }
1507                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1508
1509                     bFirst = FALSE;
1510                 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
1511             }     /* if (bBackFFT) */
1512         }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
1513     }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
1514
1515     if (bCalcF && pme->nnodes > 1)
1516     {
1517         wallcycle_start(wcycle, ewcPME_REDISTXF);
1518         for (d = 0; d < pme->ndecompdim; d++)
1519         {
1520             atc = &pme->atc[d];
1521             if (d == pme->ndecompdim - 1)
1522             {
1523                 n_d = homenr;
1524                 f_d = f + start;
1525             }
1526             else
1527             {
1528                 n_d = pme->atc[d+1].n;
1529                 f_d = pme->atc[d+1].f;
1530             }
1531             if (DOMAINDECOMP(cr))
1532             {
1533                 dd_pmeredist_f(pme, atc, n_d, f_d,
1534                                d == pme->ndecompdim-1 && pme->bPPnode);
1535             }
1536         }
1537
1538         wallcycle_stop(wcycle, ewcPME_REDISTXF);
1539     }
1540     where();
1541
1542     if (bCalcEnerVir)
1543     {
1544         if (flags & GMX_PME_DO_COULOMB)
1545         {
1546             if (!pme->bFEP_q)
1547             {
1548                 *energy_q = energy_AB[0];
1549                 m_add(vir_q, vir_AB[0], vir_q);
1550             }
1551             else
1552             {
1553                 *energy_q       = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
1554                 *dvdlambda_q   += energy_AB[1] - energy_AB[0];
1555                 for (i = 0; i < DIM; i++)
1556                 {
1557                     for (j = 0; j < DIM; j++)
1558                     {
1559                         vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
1560                             lambda_q*vir_AB[1][i][j];
1561                     }
1562                 }
1563             }
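            /* E.g. with lambda_q = 0.25 this yields
             * 0.75*energy_AB[0] + 0.25*energy_AB[1], and the dvdlambda_q
             * contribution is the constant slope energy_AB[1] - energy_AB[0]
             * of this linear interpolation.
             */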
1564             if (debug)
1565             {
1566                 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
1567             }
1568         }
1569         else
1570         {
1571             *energy_q = 0;
1572         }
1573
1574         if (flags & GMX_PME_DO_LJ)
1575         {
1576             if (!pme->bFEP_lj)
1577             {
1578                 *energy_lj = energy_AB[2];
1579                 m_add(vir_lj, vir_AB[2], vir_lj);
1580             }
1581             else
1582             {
1583                 *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
1584                 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
1585                 for (i = 0; i < DIM; i++)
1586                 {
1587                     for (j = 0; j < DIM; j++)
1588                     {
1589                         vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
1590                     }
1591                 }
1592             }
1593             if (debug)
1594             {
1595                 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
1596             }
1597         }
1598         else
1599         {
1600             *energy_lj = 0;
1601         }
1602     }
1603     return 0;
1604 }