1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief This file contains function definitions necessary for
40  * computing energies and forces for the PME long-ranged part (Coulomb
41  * and LJ).
42  *
43  * \author Erik Lindahl <erik@kth.se>
44  * \author Berk Hess <hess@kth.se>
45  * \ingroup module_ewald
46  */
47 /* IMPORTANT FOR DEVELOPERS:
48  *
49  * Triclinic pme stuff isn't entirely trivial, and we've experienced
50  * some bugs during development (many of them due to me). To avoid
51  * this in the future, please check the following things if you make
52  * changes in this file:
53  *
54  * 1. You should obtain identical (at least to the PME precision)
55  *    energies, forces, and virial for
56  *    a rectangular box and a triclinic one where the z (or y) axis is
57  *    tilted a whole box side. For instance you could use these boxes:
58  *
59  *    rectangular       triclinic
60  *     2  0  0           2  0  0
61  *     0  2  0           0  2  0
62  *     0  0  6           2  2  6
63  *
64  * 2. You should check the energy conservation in a triclinic box.
65  *
66  * It might seem like overkill, but better safe than sorry.
67  * /Erik 001109
68  */
69
70 #include "gmxpre.h"
71
72 #include "pme.h"
73
74 #include "config.h"
75
76 #include <assert.h>
77 #include <math.h>
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
81
82 #include <algorithm>
83
84 #include "gromacs/fft/parallel_3dfft.h"
85 #include "gromacs/fileio/pdbio.h"
86 #include "gromacs/gmxlib/network.h"
87 #include "gromacs/gmxlib/nrnb.h"
88 #include "gromacs/math/gmxcomplex.h"
89 #include "gromacs/math/invertmatrix.h"
90 #include "gromacs/math/units.h"
91 #include "gromacs/math/vec.h"
92 #include "gromacs/math/vectypes.h"
93 #include "gromacs/mdtypes/commrec.h"
94 #include "gromacs/mdtypes/forcerec.h"
95 #include "gromacs/mdtypes/inputrec.h"
96 #include "gromacs/mdtypes/md_enums.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/basedefinitions.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/gmxomp.h"
106 #include "gromacs/utility/real.h"
107 #include "gromacs/utility/smalloc.h"
108
109 #include "calculate-spline-moduli.h"
110 #include "pme-gather.h"
111 #include "pme-grid.h"
112 #include "pme-internal.h"
113 #include "pme-redistribute.h"
114 #include "pme-solve.h"
115 #include "pme-spline-work.h"
116 #include "pme-spread.h"
117
118 /*! \brief Number of bytes in a cache line.
119  *
120  * Must also be a multiple of the SIMD and SIMD4 register size, to
121  * preserve alignment.
122  */
123 const int gmxCacheLineSize = 64;
124
125 //! Set up coordinate communication
126 static void setup_coordinate_communication(pme_atomcomm_t *atc)
127 {
128     int nslab, n, i;
129     int fw, bw;
130
131     nslab = atc->nslab;
132
133     n = 0;
134     for (i = 1; i <= nslab/2; i++)
135     {
136         fw = (atc->nodeid + i) % nslab;
137         bw = (atc->nodeid - i + nslab) % nslab;
138         if (n < nslab - 1)
139         {
140             atc->node_dest[n] = fw;
141             atc->node_src[n]  = bw;
142             n++;
143         }
144         if (n < nslab - 1)
145         {
146             atc->node_dest[n] = bw;
147             atc->node_src[n]  = fw;
148             n++;
149         }
150     }
151 }
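
/* Illustrative example for setup_coordinate_communication() above (the
 * numbers are invented for clarity, not taken from a run): with nslab = 4
 * and nodeid = 0 the loop fills
 *     node_dest = { 1, 3, 2 },   node_src = { 3, 1, 2 },
 * i.e. the nslab-1 exchange steps alternate between the forward and the
 * backward neighbour at increasing distance, so every pair of slabs
 * exchanges coordinates exactly once.
 */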
152
153 int gmx_pme_destroy(FILE *log, struct gmx_pme_t **pmedata)
154 {
155     int i;
156
157     if (NULL != log)
158     {
159         fprintf(log, "Destroying PME data structures.\n");
160     }
161
162     sfree((*pmedata)->nnx);
163     sfree((*pmedata)->nny);
164     sfree((*pmedata)->nnz);
165
166     for (i = 0; i < (*pmedata)->ngrids; ++i)
167     {
168         pmegrids_destroy(&(*pmedata)->pmegrid[i]);
169         sfree((*pmedata)->fftgrid[i]);
170         sfree((*pmedata)->cfftgrid[i]);
171         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
172     }
173
174     sfree((*pmedata)->lb_buf1);
175     sfree((*pmedata)->lb_buf2);
176
177     pme_free_all_work(&(*pmedata)->solve_work, (*pmedata)->nthread);
178
179     sfree(*pmedata);
180     *pmedata = NULL;
181
182     return 0;
183 }
184
185 /*! \brief Round \p n up to the next multiple of \p f */
186 static int mult_up(int n, int f)
187 {
188     return ((n + f - 1)/f)*f;
189 }
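
/* For example (illustrative values): mult_up(20, 7) = ((20 + 6)/7)*7 = 21,
 * while mult_up(21, 7) = 21, so values that are already a multiple of f
 * are returned unchanged.
 */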
190
191 /*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
192 static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
193 {
194     int    nma, nmi;
195     double n1, n2, n3;
196
197     nma = pme->nnodes_major;
198     nmi = pme->nnodes_minor;
199
200     n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
201     n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
202     n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
203
204     /* pme_solve is roughly double the cost of an fft */
205
206     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
207 }
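
/* Worked example with invented numbers: for a 20x20x20 grid on 16x1 PME
 * ranks, mult_up(20, 16) = 32, so n1 = n2 = n3 = 32*20*20 = 12800 and the
 * estimate is (12800 + 12800 + 3*12800)/(6*8000) = 1.33, which exceeds the
 * 1.2 threshold that triggers the load-balance note in gmx_pme_init().
 * For a grid that divides evenly over the ranks the expression takes its
 * minimum value of 5/6.
 */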
208
209 /*! \brief Initialize atom communication data structure */
210 static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
211                           int dimind, gmx_bool bSpread)
212 {
213     int thread;
214
215     atc->dimind    = dimind;
216     atc->nslab     = 1;
217     atc->nodeid    = 0;
218     atc->pd_nalloc = 0;
219 #if GMX_MPI
220     if (pme->nnodes > 1)
221     {
222         atc->mpi_comm = pme->mpi_comm_d[dimind];
223         MPI_Comm_size(atc->mpi_comm, &atc->nslab);
224         MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
225     }
226     if (debug)
227     {
228         fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
229     }
230 #endif
231
232     atc->bSpread   = bSpread;
233     atc->pme_order = pme->pme_order;
234
235     if (atc->nslab > 1)
236     {
237         snew(atc->node_dest, atc->nslab);
238         snew(atc->node_src, atc->nslab);
239         setup_coordinate_communication(atc);
240
241         snew(atc->count_thread, pme->nthread);
242         for (thread = 0; thread < pme->nthread; thread++)
243         {
244             snew(atc->count_thread[thread], atc->nslab);
245         }
246         atc->count = atc->count_thread[0];
247         snew(atc->rcount, atc->nslab);
248         snew(atc->buf_index, atc->nslab);
249     }
250
251     atc->nthread = pme->nthread;
252     if (atc->nthread > 1)
253     {
254         snew(atc->thread_plist, atc->nthread);
255     }
256     snew(atc->spline, atc->nthread);
257     for (thread = 0; thread < atc->nthread; thread++)
258     {
259         if (atc->nthread > 1)
260         {
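            /* Pad the per-thread count array with a cache line on each
             * side, so counters written by different threads do not end
             * up on the same cache line (avoids false sharing).
             */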
261             snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
262             atc->thread_plist[thread].n += gmxCacheLineSize;
263         }
264         snew(atc->spline[thread].thread_one, pme->nthread);
265         atc->spline[thread].thread_one[thread] = 1;
266     }
267 }
268
269 /*! \brief Initialize data structure for communication */
270 static void
271 init_overlap_comm(pme_overlap_t *  ol,
272                   int              norder,
273 #if GMX_MPI
274                   MPI_Comm         comm,
275 #endif
276                   int              nnodes,
277                   int              nodeid,
278                   int              ndata,
279                   int              commplainsize)
280 {
281     int              b, i;
282     pme_grid_comm_t *pgc;
283     gmx_bool         bCont;
284     int              fft_start, fft_end, send_index1, recv_index1;
285 #if GMX_MPI
286     MPI_Status       stat;
287
288     ol->mpi_comm = comm;
289 #endif
290
291     ol->nnodes = nnodes;
292     ol->nodeid = nodeid;
293
294     /* Linear translation of the PME grid won't affect reciprocal space
295      * calculations, so to optimize we only interpolate "upwards",
296      * which also means we only have to consider overlap in one direction.
297      * I.e., particles on this node might also be spread to grid indices
298      * that belong to higher nodes (modulo nnodes)
299      */
300
301     snew(ol->s2g0, ol->nnodes+1);
302     snew(ol->s2g1, ol->nnodes);
303     if (debug)
304     {
305         fprintf(debug, "PME slab boundaries:");
306     }
307     for (i = 0; i < nnodes; i++)
308     {
309         /* s2g0 is the local interpolation grid start.
310          * s2g1 is the local interpolation grid end.
311          * Since in calc_pidx we divide particles, and not grid lines,
312          * spatially uniformly along dimension x or y, we need to round
313          * s2g0 down and s2g1 up.
314          */
315         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
316         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
317
318         if (debug)
319         {
320             fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
321         }
322     }
323     ol->s2g0[nnodes] = ndata;
324     if (debug)
325     {
326         fprintf(debug, "\n");
327     }
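
    /* Illustrative example (invented numbers): with nnodes = 3, ndata = 20
     * and norder = 4, the formulas above give
     *     s2g0 = { 0, 6, 13, 20 }   and   s2g1 = { 10, 17, 23 },
     * so slab 0 owns grid lines [0,6) but spreads onto [0,10), and slab 2
     * spreads onto [13,23), i.e. three lines past the grid end, which wrap
     * to indices 0..2 modulo ndata. With these numbers the loop below
     * terminates with noverlap_nodes = 1, so each rank only communicates
     * its overlap with a single neighbour.
     */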
328
329     /* Determine how many nodes we need to communicate the grid overlap with */
330     b = 0;
331     do
332     {
333         b++;
334         bCont = FALSE;
335         for (i = 0; i < nnodes; i++)
336         {
337             if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
338                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
339             {
340                 bCont = TRUE;
341             }
342         }
343     }
344     while (bCont && b < nnodes);
345     ol->noverlap_nodes = b - 1;
346
347     snew(ol->send_id, ol->noverlap_nodes);
348     snew(ol->recv_id, ol->noverlap_nodes);
349     for (b = 0; b < ol->noverlap_nodes; b++)
350     {
351         ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
352         ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
353     }
354     snew(ol->comm_data, ol->noverlap_nodes);
355
356     ol->send_size = 0;
357     for (b = 0; b < ol->noverlap_nodes; b++)
358     {
359         pgc = &ol->comm_data[b];
360         /* Send */
361         fft_start        = ol->s2g0[ol->send_id[b]];
362         fft_end          = ol->s2g0[ol->send_id[b]+1];
363         if (ol->send_id[b] < nodeid)
364         {
365             fft_start += ndata;
366             fft_end   += ndata;
367         }
368         send_index1       = ol->s2g1[nodeid];
369         send_index1       = std::min(send_index1, fft_end);
370         pgc->send_index0  = fft_start;
371         pgc->send_nindex  = std::max(0, send_index1 - pgc->send_index0);
372         ol->send_size    += pgc->send_nindex;
373
374         /* We always start receiving at the first index of our slab */
375         fft_start        = ol->s2g0[ol->nodeid];
376         fft_end          = ol->s2g0[ol->nodeid+1];
377         recv_index1      = ol->s2g1[ol->recv_id[b]];
378         if (ol->recv_id[b] > nodeid)
379         {
380             recv_index1 -= ndata;
381         }
382         recv_index1      = std::min(recv_index1, fft_end);
383         pgc->recv_index0 = fft_start;
384         pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
385     }
386
387 #if GMX_MPI
388     /* Communicate the buffer sizes to receive */
389     for (b = 0; b < ol->noverlap_nodes; b++)
390     {
391         MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
392                      &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
393                      ol->mpi_comm, &stat);
394     }
395 #endif
396
397     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
398     snew(ol->sendbuf, norder*commplainsize);
399     snew(ol->recvbuf, norder*commplainsize);
400 }
401
402 void gmx_pme_check_restrictions(int pme_order,
403                                 int nkx, int nky, int nkz,
404                                 int nnodes_major,
405                                 int nnodes_minor,
406                                 gmx_bool bUseThreads,
407                                 gmx_bool bFatal,
408                                 gmx_bool *bValidSettings)
409 {
410     if (pme_order > PME_ORDER_MAX)
411     {
412         if (!bFatal)
413         {
414             *bValidSettings = FALSE;
415             return;
416         }
417         gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
418                   pme_order, PME_ORDER_MAX);
419     }
420
421     if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
422         nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
423         nkz <= pme_order)
424     {
425         if (!bFatal)
426         {
427             *bValidSettings = FALSE;
428             return;
429         }
430         gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and, for dimensions with domain decomposition, larger than 2*pme_order",
431                   pme_order);
432     }
433
434     /* Check for a limitation of the (current) sum_fftgrid_dd code.
435      * We only allow multiple communication pulses in dim 1, not in dim 0.
436      */
437     if (bUseThreads && (nkx < nnodes_major*pme_order &&
438                         nkx != nnodes_major*(pme_order - 1)))
439     {
440         if (!bFatal)
441         {
442             *bValidSettings = FALSE;
443             return;
444         }
445         gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
446                   nkx/(double)nnodes_major, pme_order);
447     }
448
449     if (bValidSettings != NULL)
450     {
451         *bValidSettings = TRUE;
452     }
453
454     return;
455 }
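
/* As an illustration of these checks (invented values, not requirements
 * beyond what the code above enforces): with pme_order = 4,
 * nnodes_major = 4 and nnodes_minor = 1, the grid must satisfy nkx > 8,
 * nky > 4 and nkz > 4; when OpenMP threads are in use, nkx must in
 * addition be at least nnodes_major*pme_order = 16, or be exactly
 * nnodes_major*(pme_order-1) = 12.
 */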
456
457 /*! \brief Return \p enumerator divided by \p denominator, rounded up to the nearest integer */
458 static int div_round_up(int enumerator, int denominator)
459 {
460     return (enumerator + denominator - 1)/denominator;
461 }
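
/* For example, div_round_up(20, 7) = (20 + 6)/7 = 3 and
 * div_round_up(21, 7) = 3, i.e. the integer quotient rounded up.
 */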
462
463 int gmx_pme_init(struct gmx_pme_t **pmedata,
464                  t_commrec *        cr,
465                  int                nnodes_major,
466                  int                nnodes_minor,
467                  t_inputrec *       ir,
468                  int                homenr,
469                  gmx_bool           bFreeEnergy_q,
470                  gmx_bool           bFreeEnergy_lj,
471                  gmx_bool           bReproducible,
472                  int                nthread)
473 {
474     struct gmx_pme_t *pme = NULL;
475
476     int               use_threads, sum_use_threads, i;
477     ivec              ndata;
478
479     if (debug)
480     {
481         fprintf(debug, "Creating PME data structures.\n");
482     }
483     snew(pme, 1);
484
485     pme->sum_qgrid_tmp       = NULL;
486     pme->sum_qgrid_dd_tmp    = NULL;
487     pme->buf_nalloc          = 0;
488
489     pme->nnodes              = 1;
490     pme->bPPnode             = TRUE;
491
492     pme->nnodes_major        = nnodes_major;
493     pme->nnodes_minor        = nnodes_minor;
494
495 #if GMX_MPI
496     if (nnodes_major*nnodes_minor > 1)
497     {
498         pme->mpi_comm = cr->mpi_comm_mygroup;
499
500         MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
501         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
502         if (pme->nnodes != nnodes_major*nnodes_minor)
503         {
504             gmx_incons("PME rank count mismatch");
505         }
506     }
507     else
508     {
509         pme->mpi_comm = MPI_COMM_NULL;
510     }
511 #endif
512
513     if (pme->nnodes == 1)
514     {
515 #if GMX_MPI
516         pme->mpi_comm_d[0] = MPI_COMM_NULL;
517         pme->mpi_comm_d[1] = MPI_COMM_NULL;
518 #endif
519         pme->ndecompdim   = 0;
520         pme->nodeid_major = 0;
521         pme->nodeid_minor = 0;
525     }
526     else
527     {
528         if (nnodes_minor == 1)
529         {
530 #if GMX_MPI
531             pme->mpi_comm_d[0] = pme->mpi_comm;
532             pme->mpi_comm_d[1] = MPI_COMM_NULL;
533 #endif
534             pme->ndecompdim   = 1;
535             pme->nodeid_major = pme->nodeid;
536             pme->nodeid_minor = 0;
537
538         }
539         else if (nnodes_major == 1)
540         {
541 #if GMX_MPI
542             pme->mpi_comm_d[0] = MPI_COMM_NULL;
543             pme->mpi_comm_d[1] = pme->mpi_comm;
544 #endif
545             pme->ndecompdim   = 1;
546             pme->nodeid_major = 0;
547             pme->nodeid_minor = pme->nodeid;
548         }
549         else
550         {
551             if (pme->nnodes % nnodes_major != 0)
552             {
553                 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
554             }
555             pme->ndecompdim = 2;
556
557 #if GMX_MPI
558             MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
559                            pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
560             MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
561                            pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
562
563             MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
564             MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
565             MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
566             MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
567 #endif
568         }
569         pme->bPPnode = (cr->duty & DUTY_PP);
570     }
571
572     pme->nthread = nthread;
573
574     /* Check if any of the PME MPI ranks uses threads */
575     use_threads = (pme->nthread > 1 ? 1 : 0);
576 #if GMX_MPI
577     if (pme->nnodes > 1)
578     {
579         MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
580                       MPI_SUM, pme->mpi_comm);
581     }
582     else
583 #endif
584     {
585         sum_use_threads = use_threads;
586     }
587     pme->bUseThreads = (sum_use_threads > 0);
588
589     if (ir->ePBC == epbcSCREW)
590     {
591         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
592     }
593
594     pme->bFEP_q      = ((ir->efep != efepNO) && bFreeEnergy_q);
595     pme->bFEP_lj     = ((ir->efep != efepNO) && bFreeEnergy_lj);
596     pme->bFEP        = (pme->bFEP_q || pme->bFEP_lj);
597     pme->nkx         = ir->nkx;
598     pme->nky         = ir->nky;
599     pme->nkz         = ir->nkz;
600     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
601     pme->pme_order   = ir->pme_order;
602
603     /* Always constant electrostatics coefficients */
604     pme->epsilon_r   = ir->epsilon_r;
605
606     /* Always constant LJ coefficients */
607     pme->ljpme_combination_rule = ir->ljpme_combination_rule;
608
609     /* If we violate restrictions, generate a fatal error here */
610     gmx_pme_check_restrictions(pme->pme_order,
611                                pme->nkx, pme->nky, pme->nkz,
612                                pme->nnodes_major,
613                                pme->nnodes_minor,
614                                pme->bUseThreads,
615                                TRUE,
616                                NULL);
617
618     if (pme->nnodes > 1)
619     {
620         double imbal;
621
622 #if GMX_MPI
623         MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
624         MPI_Type_commit(&(pme->rvec_mpi));
625 #endif
626
627         /* Note that the coefficient spreading and force gathering, which usually
628          * takes about the same amount of time as FFT+solve_pme,
629          * is always fully load balanced
630          * (unless the coefficient distribution is inhomogeneous).
631          */
632
633         imbal = estimate_pme_load_imbalance(pme);
634         if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
635         {
636             fprintf(stderr,
637                     "\n"
638                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
639                     "      For optimal PME load balancing\n"
640                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
641                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
642                     "\n",
643                     (int)((imbal-1)*100 + 0.5),
644                     pme->nkx, pme->nky, pme->nnodes_major,
645                     pme->nky, pme->nkz, pme->nnodes_minor);
646         }
647     }
648
649     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
650     /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
651      * y is always copied through a buffer: we don't need padding in z,
652      * but we do need the overlap in x because of the communication order.
653      */
654     init_overlap_comm(&pme->overlap[0], pme->pme_order,
655 #if GMX_MPI
656                       pme->mpi_comm_d[0],
657 #endif
658                       pme->nnodes_major, pme->nodeid_major,
659                       pme->nkx,
660                       (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
661
662     /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
663      * We do this with an offset buffer of equal size, so we need to allocate
664      * extra for the offset. That's what the (+1)*pme->nkz is for.
665      */
666     init_overlap_comm(&pme->overlap[1], pme->pme_order,
667 #if GMX_MPI
668                       pme->mpi_comm_d[1],
669 #endif
670                       pme->nnodes_minor, pme->nodeid_minor,
671                       pme->nky,
672                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
673
674     /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
675      * Note that gmx_pme_check_restrictions checked for this already.
676      */
677     if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
678     {
679         gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
680     }
681
682     snew(pme->bsp_mod[XX], pme->nkx);
683     snew(pme->bsp_mod[YY], pme->nky);
684     snew(pme->bsp_mod[ZZ], pme->nkz);
685
686     /* The required size of the interpolation grid, including overlap.
687      * The allocated size (pmegrid_n?) might be slightly larger.
688      */
689     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
690         pme->overlap[0].s2g0[pme->nodeid_major];
691     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
692         pme->overlap[1].s2g0[pme->nodeid_minor];
693     pme->pmegrid_nz_base = pme->nkz;
694     pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
695     set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
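    /* As a sketch with the same invented numbers used in the
     * init_overlap_comm() example (nkx = 20, 3 ranks along the major
     * dimension, pme_order = 4): rank 0 gets pmegrid_nx = 10 - 0 = 10
     * local grid lines along x, and pmegrid_nz = nkz + 3 before
     * set_grid_alignment() possibly rounds it up further.
     */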
696
697     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
698     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
699     pme->pmegrid_start_iz = 0;
700
701     make_gridindex5_to_localindex(pme->nkx,
702                                   pme->pmegrid_start_ix,
703                                   pme->pmegrid_nx - (pme->pme_order-1),
704                                   &pme->nnx, &pme->fshx);
705     make_gridindex5_to_localindex(pme->nky,
706                                   pme->pmegrid_start_iy,
707                                   pme->pmegrid_ny - (pme->pme_order-1),
708                                   &pme->nny, &pme->fshy);
709     make_gridindex5_to_localindex(pme->nkz,
710                                   pme->pmegrid_start_iz,
711                                   pme->pmegrid_nz_base,
712                                   &pme->nnz, &pme->fshz);
713
714     pme->spline_work = make_pme_spline_work(pme->pme_order);
715
716     ndata[0]    = pme->nkx;
717     ndata[1]    = pme->nky;
718     ndata[2]    = pme->nkz;
719     /* It doesn't matter if we allocate too many grids here,
720      * we only allocate and use the ones we need.
721      */
722     if (EVDW_PME(ir->vdwtype))
723     {
724         pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
725     }
726     else
727     {
728         pme->ngrids = DO_Q;
729     }
730     snew(pme->fftgrid, pme->ngrids);
731     snew(pme->cfftgrid, pme->ngrids);
732     snew(pme->pfft_setup, pme->ngrids);
733
734     for (i = 0; i < pme->ngrids; ++i)
735     {
736         if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
737                                                        bFreeEnergy_q)) ||
738             (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
739                                                     bFreeEnergy_lj ||
740                                                     ir->ljpme_combination_rule == eljpmeLB)))
741         {
742             pmegrids_init(&pme->pmegrid[i],
743                           pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
744                           pme->pmegrid_nz_base,
745                           pme->pme_order,
746                           pme->bUseThreads,
747                           pme->nthread,
748                           pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
749                           pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
750             /* This routine will allocate the grid data to fit the FFTs */
751             gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
752                                     &pme->fftgrid[i], &pme->cfftgrid[i],
753                                     pme->mpi_comm_d,
754                                     bReproducible, pme->nthread);
755
756         }
757     }
758
759     if (!pme->bP3M)
760     {
761         /* Use plain SPME B-spline interpolation */
762         make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
763     }
764     else
765     {
766         /* Use the P3M grid-optimized influence function */
767         make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
768     }
769
770     /* Use atc[0] for spreading */
771     init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
772     if (pme->ndecompdim >= 2)
773     {
774         init_atomcomm(pme, &pme->atc[1], 1, FALSE);
775     }
776
777     if (pme->nnodes == 1)
778     {
779         pme->atc[0].n = homenr;
780         pme_realloc_atomcomm_things(&pme->atc[0]);
781     }
782
783     pme->lb_buf1       = NULL;
784     pme->lb_buf2       = NULL;
785     pme->lb_buf_nalloc = 0;
786
787     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
788
789     *pmedata = pme;
790
791     return 0;
792 }
793
794 int gmx_pme_reinit(struct gmx_pme_t **pmedata,
795                    t_commrec *        cr,
796                    struct gmx_pme_t * pme_src,
797                    const t_inputrec * ir,
798                    ivec               grid_size)
799 {
800     t_inputrec irc;
801     int        homenr;
802     int        ret;
803
804     irc     = *ir;
805     irc.nkx = grid_size[XX];
806     irc.nky = grid_size[YY];
807     irc.nkz = grid_size[ZZ];
808
809     if (pme_src->nnodes == 1)
810     {
811         homenr = pme_src->atc[0].n;
812     }
813     else
814     {
815         homenr = -1;
816     }
817
818     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
819                        &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
820
821     if (ret == 0)
822     {
823         /* We can easily reuse the allocated pme grids in pme_src */
824         reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
825         /* We would like to reuse the fft grids, but that's harder */
826     }
827
828     return ret;
829 }
830
831 void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
832 {
833     pme_atomcomm_t *atc;
834     pmegrids_t     *grid;
835
836     if (pme->nnodes > 1)
837     {
838         gmx_incons("gmx_pme_calc_energy called in parallel");
839     }
840     if (pme->bFEP_q)
841     {
842         gmx_incons("gmx_pme_calc_energy with free energy");
843     }
844
845     atc            = &pme->atc_energy;
846     atc->nthread   = 1;
847     if (atc->spline == NULL)
848     {
849         snew(atc->spline, atc->nthread);
850     }
851     atc->nslab     = 1;
852     atc->bSpread   = TRUE;
853     atc->pme_order = pme->pme_order;
854     atc->n         = n;
855     pme_realloc_atomcomm_things(atc);
856     atc->x           = x;
857     atc->coefficient = q;
858
859     /* We only use the A-charges grid */
860     grid = &pme->pmegrid[PME_GRID_QA];
861
862     /* Only calculate the spline coefficients, don't actually spread */
863     spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
864
865     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
866 }
867
868 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
869 static void
870 calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
871 {
872     int  i;
873     for (i = 0; i < pme->atc[0].n; ++i)
874     {
875         real sigma4;
876         sigma4                     = local_sigma[i];
877         sigma4                     = sigma4*sigma4;
878         sigma4                     = sigma4*sigma4;
879         pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
880     }
881 }
882
883 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
884 static void
885 calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
886 {
887     int  i;
888
889     for (i = 0; i < pme->atc[0].n; ++i)
890     {
891         pme->atc[0].coefficient[i] *= local_sigma[i];
892     }
893 }
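
/* Taken together (a sketch of the coefficient bookkeeping, see the
 * Lorentz-Berthelot loop in gmx_pme_do() below): calc_initial_lb_coeffs()
 * sets the coefficient of atom i to c6_i/sigma_i^4 and each call to
 * calc_next_lb_coeffs() multiplies it by sigma_i, so the k-th of the seven
 * LJ grids (k = grid_index - 2 = 0..6) is spread with c6_i*sigma_i^(k-3).
 * This is consistent with a term-by-term expansion of (sigma_i+sigma_j)^6;
 * the per-term weights enter through lb_scale_factor[grid_index-2] when
 * the forces are gathered.
 */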
894
895 int gmx_pme_do(struct gmx_pme_t *pme,
896                int start,       int homenr,
897                rvec x[],        rvec f[],
898                real chargeA[],  real chargeB[],
899                real c6A[],      real c6B[],
900                real sigmaA[],   real sigmaB[],
901                matrix box,      t_commrec *cr,
902                int  maxshift_x, int maxshift_y,
903                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
904                matrix vir_q,    real ewaldcoeff_q,
905                matrix vir_lj,   real ewaldcoeff_lj,
906                real *energy_q,  real *energy_lj,
907                real lambda_q,   real lambda_lj,
908                real *dvdlambda_q, real *dvdlambda_lj,
909                int flags)
910 {
911     int                  d, i, j, npme, grid_index, max_grid_index;
912     int                  n_d;
913     pme_atomcomm_t      *atc        = NULL;
914     pmegrids_t          *pmegrid    = NULL;
915     real                *grid       = NULL;
916     rvec                *f_d;
917     real                *coefficient = NULL;
918     real                 energy_AB[4];
919     matrix               vir_AB[4];
920     real                 scale, lambda;
921     gmx_bool             bClearF;
922     gmx_parallel_3dfft_t pfft_setup;
923     real              *  fftgrid;
924     t_complex          * cfftgrid;
925     int                  thread;
926     gmx_bool             bFirst, bDoSplines;
927     int                  fep_state;
928     int                  fep_states_lj           = pme->bFEP_lj ? 2 : 1;
929     const gmx_bool       bCalcEnerVir            = flags & GMX_PME_CALC_ENER_VIR;
930     const gmx_bool       bBackFFT                = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
931     const gmx_bool       bCalcF                  = flags & GMX_PME_CALC_F;
932
933     assert(pme->nnodes > 0);
934     assert(pme->nnodes == 1 || pme->ndecompdim > 0);
935
936     if (pme->nnodes > 1)
937     {
938         atc      = &pme->atc[0];
939         atc->npd = homenr;
940         if (atc->npd > atc->pd_nalloc)
941         {
942             atc->pd_nalloc = over_alloc_dd(atc->npd);
943             srenew(atc->pd, atc->pd_nalloc);
944         }
945         for (d = pme->ndecompdim-1; d >= 0; d--)
946         {
947             atc           = &pme->atc[d];
948             atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
949         }
950     }
951     else
952     {
953         atc = &pme->atc[0];
954         /* This could be necessary for TPI */
955         pme->atc[0].n = homenr;
956         if (DOMAINDECOMP(cr))
957         {
958             pme_realloc_atomcomm_things(atc);
959         }
960         atc->x = x;
961         atc->f = f;
962     }
963
964     gmx::invertBoxMatrix(box, pme->recipbox);
965     bFirst = TRUE;
966
967     /* For simplicity, we construct the splines for all particles if
968      * more than one PME calculation is needed. Some optimization
969      * could be done by keeping track of which atoms have splines
970      * constructed, and construct new splines on each pass for atoms
971      * that don't yet have them.
972      */
973
974     bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
975
976     /* We need a maximum of four separate PME calculations:
977      * grid_index=0: Coulomb PME with charges from state A
978      * grid_index=1: Coulomb PME with charges from state B
979      * grid_index=2: LJ PME with C6 from state A
980      * grid_index=3: LJ PME with C6 from state B
981      * For Lorentz-Berthelot combination rules, a separate loop is used to
982      * calculate all the terms
983      */
984
985     /* If we are doing LJ-PME with LB, we only do Q here */
986     max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
987
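    /* For concreteness, the conditions below select: grid_index 0 only for
     * a plain Coulomb run; grid_index 0 and 1 when the charges are
     * perturbed; grid_index 2 (and 3 with perturbed LJ parameters) added
     * for geometric-rule LJ-PME; with Lorentz-Berthelot rules the LJ part
     * skips this loop entirely and is handled in the seven-term loop
     * further down.
     */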
988     for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
989     {
990         /* Check if we should do calculations at this grid_index
991          * If grid_index is odd we should be doing FEP
992          * If grid_index < 2 we should be doing electrostatic PME
993          * If grid_index >= 2 we should be doing LJ-PME
994          */
995         if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
996                                     (grid_index == 1 && !pme->bFEP_q))) ||
997             (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
998                                     (grid_index == 3 && !pme->bFEP_lj))))
999         {
1000             continue;
1001         }
1002         /* Unpack structure */
1003         pmegrid    = &pme->pmegrid[grid_index];
1004         fftgrid    = pme->fftgrid[grid_index];
1005         cfftgrid   = pme->cfftgrid[grid_index];
1006         pfft_setup = pme->pfft_setup[grid_index];
1007         switch (grid_index)
1008         {
1009             case 0: coefficient = chargeA + start; break;
1010             case 1: coefficient = chargeB + start; break;
1011             case 2: coefficient = c6A + start; break;
1012             case 3: coefficient = c6B + start; break;
1013         }
1014
1015         grid = pmegrid->grid.grid;
1016
1017         if (debug)
1018         {
1019             fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
1020                     cr->nnodes, cr->nodeid);
1021             fprintf(debug, "Grid = %p\n", (void*)grid);
1022             if (grid == NULL)
1023             {
1024                 gmx_fatal(FARGS, "No grid!");
1025             }
1026         }
1027         where();
1028
1029         if (pme->nnodes == 1)
1030         {
1031             atc->coefficient = coefficient;
1032         }
1033         else
1034         {
1035             wallcycle_start(wcycle, ewcPME_REDISTXF);
1036             do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
1037             where();
1038
1039             wallcycle_stop(wcycle, ewcPME_REDISTXF);
1040         }
1041
1042         if (debug)
1043         {
1044             fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
1045                     cr->nodeid, atc->n);
1046         }
1047
1048         if (flags & GMX_PME_SPREAD)
1049         {
1050             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1051
1052             /* Spread the coefficients on a grid */
1053             spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1054
1055             if (bFirst)
1056             {
1057                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1058             }
1059             inc_nrnb(nrnb, eNR_SPREADBSP,
1060                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1061
1062             if (!pme->bUseThreads)
1063             {
1064                 wrap_periodic_pmegrid(pme, grid);
1065
1066                 /* sum contributions to local grid from other nodes */
1067 #if GMX_MPI
1068                 if (pme->nnodes > 1)
1069                 {
1070                     gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1071                     where();
1072                 }
1073 #endif
1074
1075                 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1076             }
1077
1078             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1079
1080             /* TODO If the OpenMP and single-threaded implementations
1081                converge, then spread_on_grid() and
1082                copy_pmegrid_to_fftgrid() will perhaps live in the same
1083                source file and the following debugging function can live
1084                there too. */
1085             /*
1086                dump_local_fftgrid(pme,fftgrid);
1087                exit(0);
1088              */
1089         }
1090
1091         /* Here we start a large thread parallel region */
1092 #pragma omp parallel num_threads(pme->nthread) private(thread)
1093         {
1094             try
1095             {
1096                 thread = gmx_omp_get_thread_num();
1097                 if (flags & GMX_PME_SOLVE)
1098                 {
1099                     int loop_count;
1100
1101                     /* do 3d-fft */
1102                     if (thread == 0)
1103                     {
1104                         wallcycle_start(wcycle, ewcPME_FFT);
1105                     }
1106                     gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1107                                                thread, wcycle);
1108                     if (thread == 0)
1109                     {
1110                         wallcycle_stop(wcycle, ewcPME_FFT);
1111                     }
1112                     where();
1113
1114                     /* solve in k-space for our local cells */
1115                     if (thread == 0)
1116                     {
1117                         wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1118                     }
1119                     if (grid_index < DO_Q)
1120                     {
1121                         loop_count =
1122                             solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
1123                                           box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1124                                           bCalcEnerVir,
1125                                           pme->nthread, thread);
1126                     }
1127                     else
1128                     {
1129                         loop_count =
1130                             solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
1131                                              box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1132                                              bCalcEnerVir,
1133                                              pme->nthread, thread);
1134                     }
1135
1136                     if (thread == 0)
1137                     {
1138                         wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1139                         where();
1140                         inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1141                     }
1142                 }
1143
1144                 if (bBackFFT)
1145                 {
1146                     /* do 3d-invfft */
1147                     if (thread == 0)
1148                     {
1149                         where();
1150                         wallcycle_start(wcycle, ewcPME_FFT);
1151                     }
1152                     gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1153                                                thread, wcycle);
1154                     if (thread == 0)
1155                     {
1156                         wallcycle_stop(wcycle, ewcPME_FFT);
1157
1158                         where();
1159
1160                         if (pme->nodeid == 0)
1161                         {
1162                             real ntot = pme->nkx*pme->nky*pme->nkz;
1163                             npme  = static_cast<int>(ntot*log(ntot)/log(2.0));
1164                             inc_nrnb(nrnb, eNR_FFT, 2*npme);
1165                         }
1166
1167                         /* Note: this wallcycle region is closed below
1168                            outside an OpenMP region, so take care if
1169                            refactoring code here. */
1170                         wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1171                     }
1172
1173                     copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1174                 }
1175             } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1176         }
1177         /* End of thread parallel section.
1178          * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
1179          */
1180
1181         if (bBackFFT)
1182         {
1183             /* distribute local grid to all nodes */
1184 #if GMX_MPI
1185             if (pme->nnodes > 1)
1186             {
1187                 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1188             }
1189 #endif
1190             where();
1191
1192             unwrap_periodic_pmegrid(pme, grid);
1193         }
1194
1195         if (bCalcF)
1196         {
1197             /* interpolate forces for our local atoms */
1198
1199             where();
1200
1201             /* If we are running without parallelization,
1202              * atc->f is the actual force array, not a buffer,
1203              * therefore we should not clear it.
1204              */
1205             lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
1206             bClearF = (bFirst && PAR(cr));
1207 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1208             for (thread = 0; thread < pme->nthread; thread++)
1209             {
1210                 try
1211                 {
1212                     gather_f_bsplines(pme, grid, bClearF, atc,
1213                                       &atc->spline[thread],
1214                                       pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
1215                 }
1216                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1217             }
1218
1219             where();
1220
1221             inc_nrnb(nrnb, eNR_GATHERFBSP,
1222                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1223             /* Note: this wallcycle region is opened above inside an OpenMP
1224                region, so take care if refactoring code here. */
1225             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1226         }
1227
1228         if (bCalcEnerVir)
1229         {
1230             /* This should only be called on the master thread
1231              * and after the threads have synchronized.
1232              */
1233             if (grid_index < 2)
1234             {
1235                 get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1236             }
1237             else
1238             {
1239                 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1240             }
1241         }
1242         bFirst = FALSE;
1243     } /* of grid_index-loop */
1244
1245     /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
1246      * seven terms. */
1247
1248     if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
1249     {
1250         /* Loop over A- and B-state if we are doing FEP */
1251         for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
1252         {
1253             real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
1254             if (pme->nnodes == 1)
1255             {
1256                 if (pme->lb_buf1 == NULL)
1257                 {
1258                     pme->lb_buf_nalloc = pme->atc[0].n;
1259                     snew(pme->lb_buf1, pme->lb_buf_nalloc);
1260                 }
1261                 pme->atc[0].coefficient = pme->lb_buf1;
1262                 switch (fep_state)
1263                 {
1264                     case 0:
1265                         local_c6      = c6A;
1266                         local_sigma   = sigmaA;
1267                         break;
1268                     case 1:
1269                         local_c6      = c6B;
1270                         local_sigma   = sigmaB;
1271                         break;
1272                     default:
1273                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1274                 }
1275             }
1276             else
1277             {
1278                 atc = &pme->atc[0];
1279                 switch (fep_state)
1280                 {
1281                     case 0:
1282                         RedistC6      = c6A;
1283                         RedistSigma   = sigmaA;
1284                         break;
1285                     case 1:
1286                         RedistC6      = c6B;
1287                         RedistSigma   = sigmaB;
1288                         break;
1289                     default:
1290                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1291                 }
1292                 wallcycle_start(wcycle, ewcPME_REDISTXF);
1293
1294                 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
1295                 if (pme->lb_buf_nalloc < atc->n)
1296                 {
1297                     pme->lb_buf_nalloc = atc->nalloc;
1298                     srenew(pme->lb_buf1, pme->lb_buf_nalloc);
1299                     srenew(pme->lb_buf2, pme->lb_buf_nalloc);
1300                 }
1301                 local_c6 = pme->lb_buf1;
1302                 for (i = 0; i < atc->n; ++i)
1303                 {
1304                     local_c6[i] = atc->coefficient[i];
1305                 }
1306                 where();
1307
1308                 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
1309                 local_sigma = pme->lb_buf2;
1310                 for (i = 0; i < atc->n; ++i)
1311                 {
1312                     local_sigma[i] = atc->coefficient[i];
1313                 }
1314                 where();
1315
1316                 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1317             }
1318             calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1319
1320             /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
1321             for (grid_index = 2; grid_index < 9; ++grid_index)
1322             {
1323                 /* Unpack structure */
1324                 pmegrid    = &pme->pmegrid[grid_index];
1325                 fftgrid    = pme->fftgrid[grid_index];
1326                 pfft_setup = pme->pfft_setup[grid_index];
1327                 calc_next_lb_coeffs(pme, local_sigma);
1328                 grid = pmegrid->grid.grid;
1329                 where();
1330
1331                 if (flags & GMX_PME_SPREAD)
1332                 {
1333                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1334                     /* Spread the c6 on a grid */
1335                     spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1336
1337                     if (bFirst)
1338                     {
1339                         inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1340                     }
1341
1342                     inc_nrnb(nrnb, eNR_SPREADBSP,
1343                              pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1344                     if (pme->nthread == 1)
1345                     {
1346                         wrap_periodic_pmegrid(pme, grid);
1347                         /* sum contributions to local grid from other nodes */
1348 #if GMX_MPI
1349                         if (pme->nnodes > 1)
1350                         {
1351                             gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1352                             where();
1353                         }
1354 #endif
1355                         copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1356                     }
1357                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1358                 }
1359                 /* Here we start a large thread parallel region */
1360 #pragma omp parallel num_threads(pme->nthread) private(thread)
1361                 {
1362                     try
1363                     {
1364                         thread = gmx_omp_get_thread_num();
1365                         if (flags & GMX_PME_SOLVE)
1366                         {
1367                             /* do 3d-fft */
1368                             if (thread == 0)
1369                             {
1370                                 wallcycle_start(wcycle, ewcPME_FFT);
1371                             }
1372
1373                             gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1374                                                        thread, wcycle);
1375                             if (thread == 0)
1376                             {
1377                                 wallcycle_stop(wcycle, ewcPME_FFT);
1378                             }
1379                             where();
1380                         }
1381                     }
1382                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1383                 }
1384                 bFirst = FALSE;
1385             }
1386             if (flags & GMX_PME_SOLVE)
1387             {
1388                 /* solve in k-space for our local cells */
1389 #pragma omp parallel num_threads(pme->nthread) private(thread)
1390                 {
1391                     try
1392                     {
1393                         int loop_count;
1394                         thread = gmx_omp_get_thread_num();
1395                         if (thread == 0)
1396                         {
1397                             wallcycle_start(wcycle, ewcLJPME);
1398                         }
1399
1400                         loop_count =
1401                             solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
1402                                              box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1403                                              bCalcEnerVir,
1404                                              pme->nthread, thread);
1405                         if (thread == 0)
1406                         {
1407                             wallcycle_stop(wcycle, ewcLJPME);
1408                             where();
1409                             inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1410                         }
1411                     }
1412                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1413                 }
1414             }
1415
1416             if (bCalcEnerVir)
1417             {
1418                 /* This should only be called on the master thread and
1419                  * after the threads have synchronized.
1420                  */
1421                 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
1422             }
1423
1424             if (bBackFFT)
1425             {
1426                 bFirst = !(flags & GMX_PME_DO_COULOMB);
1427                 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1428                 for (grid_index = 8; grid_index >= 2; --grid_index)
1429                 {
1430                     /* Unpack structure */
1431                     pmegrid    = &pme->pmegrid[grid_index];
1432                     fftgrid    = pme->fftgrid[grid_index];
1433                     pfft_setup = pme->pfft_setup[grid_index];
1434                     grid       = pmegrid->grid.grid;
1435                     calc_next_lb_coeffs(pme, local_sigma);
1436                     where();
1437 #pragma omp parallel num_threads(pme->nthread) private(thread)
1438                     {
1439                         try
1440                         {
1441                             thread = gmx_omp_get_thread_num();
1442                             /* do 3d-invfft */
1443                             if (thread == 0)
1444                             {
1445                                 where();
1446                                 wallcycle_start(wcycle, ewcPME_FFT);
1447                             }
1448
1449                             gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1450                                                        thread, wcycle);
1451                             if (thread == 0)
1452                             {
1453                                 wallcycle_stop(wcycle, ewcPME_FFT);
1454
1455                                 where();
1456
1457                                 if (pme->nodeid == 0)
1458                                 {
1459                                     real ntot = pme->nkx*pme->nky*pme->nkz;
1460                                     npme  = static_cast<int>(ntot*log(ntot)/log(2.0));
1461                                     inc_nrnb(nrnb, eNR_FFT, 2*npme);
1462                                 }
1463                                 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1464                             }
1465
1466                             copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1467                         }
1468                         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1469                     } /*#pragma omp parallel*/
1470
1471                     /* distribute local grid to all nodes */
1472 #if GMX_MPI
1473                     if (pme->nnodes > 1)
1474                     {
1475                         gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1476                     }
1477 #endif
1478                     where();
1479
1480                     unwrap_periodic_pmegrid(pme, grid);
1481
1482                     if (bCalcF)
1483                     {
1484                         /* interpolate forces for our local atoms */
1485                         where();
1486                         bClearF = (bFirst && PAR(cr));
1487                         scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
1488                         scale  *= lb_scale_factor[grid_index-2];
1489
1490 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1491                         for (thread = 0; thread < pme->nthread; thread++)
1492                         {
1493                             try
1494                             {
1495                                 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
1496                                                   &pme->atc[0].spline[thread],
1497                                                   scale);
1498                             }
1499                             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1500                         }
1501
1502                         where();
1503
1504                         inc_nrnb(nrnb, eNR_GATHERFBSP,
1505                                  pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1506                     }
1507                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1508
1509                     bFirst = FALSE;
1510                 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
1511             }     /* if (bBackFFT) */
1512         }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
1513     }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
1514
1515     if (bCalcF && pme->nnodes > 1)
1516     {
1517         wallcycle_start(wcycle, ewcPME_REDISTXF);
1518         for (d = 0; d < pme->ndecompdim; d++)
1519         {
1520             atc = &pme->atc[d];
1521             if (d == pme->ndecompdim - 1)
1522             {
1523                 n_d = homenr;
1524                 f_d = f + start;
1525             }
1526             else
1527             {
1528                 n_d = pme->atc[d+1].n;
1529                 f_d = pme->atc[d+1].f;
1530             }
1531             if (DOMAINDECOMP(cr))
1532             {
1533                 dd_pmeredist_f(pme, atc, n_d, f_d,
1534                                d == pme->ndecompdim-1 && pme->bPPnode);
1535             }
1536         }
1537
1538         wallcycle_stop(wcycle, ewcPME_REDISTXF);
1539     }
1540     where();
1541
1542     if (bCalcEnerVir)
1543     {
1544         if (flags & GMX_PME_DO_COULOMB)
1545         {
1546             if (!pme->bFEP_q)
1547             {
1548                 *energy_q = energy_AB[0];
1549                 m_add(vir_q, vir_AB[0], vir_q);
1550             }
1551             else
1552             {
1553                 *energy_q       = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
1554                 *dvdlambda_q   += energy_AB[1] - energy_AB[0];
1555                 for (i = 0; i < DIM; i++)
1556                 {
1557                     for (j = 0; j < DIM; j++)
1558                     {
1559                         vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
1560                             lambda_q*vir_AB[1][i][j];
1561                     }
1562                 }
1563             }
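            /* E.g. with lambda_q = 0.25 this yields
             * 0.75*energy_AB[0] + 0.25*energy_AB[1], and the dvdlambda_q
             * contribution is the constant slope energy_AB[1] - energy_AB[0]
             * of this linear interpolation.
             */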
1564             if (debug)
1565             {
1566                 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
1567             }
1568         }
1569         else
1570         {
1571             *energy_q = 0;
1572         }
1573
1574         if (flags & GMX_PME_DO_LJ)
1575         {
1576             if (!pme->bFEP_lj)
1577             {
1578                 *energy_lj = energy_AB[2];
1579                 m_add(vir_lj, vir_AB[2], vir_lj);
1580             }
1581             else
1582             {
1583                 *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
1584                 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
1585                 for (i = 0; i < DIM; i++)
1586                 {
1587                     for (j = 0; j < DIM; j++)
1588                     {
1589                         vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
1590                     }
1591                 }
1592             }
1593             if (debug)
1594             {
1595                 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
1596             }
1597         }
1598         else
1599         {
1600             *energy_lj = 0;
1601         }
1602     }
1603     return 0;
1604 }