1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem an overkill, but better safe than sorry.
77 #include "gmxcomplex.h"
81 #include "gmx_fatal.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
91 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
92 #include "gmx_sse2_single.h"
95 #include "mpelogging.h"
98 /* #define PRT_FORCE */
99 /* conditions for on the fly time-measurement */
100 /* #define TAKETIME (step > 1 && timesteps < 10) */
101 #define TAKETIME FALSE
104 #define mpi_type MPI_DOUBLE
106 #define mpi_type MPI_FLOAT
109 /* Internal datastructures */
125 int *send_id,*recv_id;
126 pme_grid_comm_t *comm_data;
130 int dimind; /* The index of the dimension, 0=x, 1=y */
137 int *node_dest; /* The nodes to send x and q to with DD */
138 int *node_src; /* The nodes to receive x and q from with DD */
139 int *buf_index; /* Index for commnode into the buffers */
146 int *count; /* The number of atoms to send to each node */
147 int *rcount; /* The number of atoms to receive */
154 gmx_bool bSpread; /* These coordinates are used for spreading */
156 splinevec theta,dtheta;
158 rvec *fractx; /* Fractional coordinate relative to the
159 * lower cell boundary
163 typedef struct gmx_pme {
164 int ndecompdim; /* The number of decomposition dimensions */
165 int nodeid; /* Our nodeid in mpi->mpi_comm */
168 int nnodes; /* The number of nodes doing PME */
173 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
175 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
178 gmx_bool bPPnode; /* Node also does particle-particle forces */
179 gmx_bool bFEP; /* Compute Free energy contribution */
180 int nkx,nky,nkz; /* Grid dimensions */
184 real * pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
186 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
187 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
189 real * pmegrid_sendbuf;
190 real * pmegrid_recvbuf;
192 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
193 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
194 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
196 t_complex *cfftgridA; /* Grids for complex FFT data */
197 t_complex *cfftgridB;
198 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
200 gmx_parallel_3dfft_t pfft_setupA;
201 gmx_parallel_3dfft_t pfft_setupB;
204 real *fshx,*fshy,*fshz;
206 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
210 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
213 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
215 rvec *bufv; /* Communication buffer */
216 real *bufr; /* Communication buffer */
217 int buf_nalloc; /* The communication buffer size */
219 /* work data for solve_pme */
226 real * work_tmp1_alloc;
230 /* Work data for PME_redist */
231 gmx_bool redist_init;
239 int redist_buf_nalloc;
241 /* Work data for sum_qgrid */
242 real * sum_qgrid_tmp;
243 real * sum_qgrid_dd_tmp;
247 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
250 int *idxptr,tix,tiy,tiz;
251 real *xptr,*fptr,tx,ty,tz;
252 real rxx,ryx,ryy,rzx,rzy,rzz;
254 int start_ix,start_iy,start_iz;
260 start_ix = pme->pmegrid_start_ix;
261 start_iy = pme->pmegrid_start_iy;
262 start_iz = pme->pmegrid_start_iz;
264 rxx = pme->recipbox[XX][XX];
265 ryx = pme->recipbox[YY][XX];
266 ryy = pme->recipbox[YY][YY];
267 rzx = pme->recipbox[ZZ][XX];
268 rzy = pme->recipbox[ZZ][YY];
269 rzz = pme->recipbox[ZZ][ZZ];
271 for(i=0; (i<atc->n); i++) {
273 idxptr = atc->idx[i];
274 fptr = atc->fractx[i];
276 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
277 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
278 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
279 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
285 /* Because decomposition only occurs in x and y,
286 * we never have a fraction correction in z.
288 fptr[XX] = tx - tix + pme->fshx[tix];
289 fptr[YY] = ty - tiy + pme->fshy[tiy];
292 idxptr[XX] = pme->nnx[tix];
293 idxptr[YY] = pme->nny[tiy];
294 idxptr[ZZ] = pme->nnz[tiz];
297 range_check(idxptr[XX],0,pme->pmegrid_nx);
298 range_check(idxptr[YY],0,pme->pmegrid_ny);
299 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
304 static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
310 real rxx,ryx,rzx,ryy,rzy;
313 /* Calculate PME task index (pidx) for each grid index.
314 * Here we always assign equally sized slabs to each node
315 * for load balancing reasons (the PME grid spacing is not used).
322 /* Reset the count */
323 for(i=0; i<nslab; i++)
328 if (atc->dimind == 0)
330 rxx = recipbox[XX][XX];
331 ryx = recipbox[YY][XX];
332 rzx = recipbox[ZZ][XX];
333 /* Calculate the node index in x-dimension */
334 for(i=0; (i<natoms); i++)
337 /* Fractional coordinates along box vectors */
338 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
339 si = (int)(s + 2*nslab) % nslab;
346 ryy = recipbox[YY][YY];
347 rzy = recipbox[ZZ][YY];
348 /* Calculate the node index in y-dimension */
349 for(i=0; (i<natoms); i++)
352 /* Fractional coordinates along box vectors */
353 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
354 si = (int)(s + 2*nslab) % nslab;
361 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
365 /* We have to avoid a NULL pointer for atc->x to avoid
366 * possible fatal errors in MPI routines.
368 if (atc->n > atc->nalloc || atc->nalloc == 0)
370 nalloc_old = atc->nalloc;
371 atc->nalloc = over_alloc_dd(max(atc->n,1));
373 if (atc->nslab > 1) {
374 srenew(atc->x,atc->nalloc);
375 srenew(atc->q,atc->nalloc);
376 srenew(atc->f,atc->nalloc);
377 for(i=nalloc_old; i<atc->nalloc; i++)
379 clear_rvec(atc->f[i]);
384 srenew(atc->theta[i] ,atc->pme_order*atc->nalloc);
385 srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
387 srenew(atc->fractx,atc->nalloc);
388 srenew(atc->idx ,atc->nalloc);
393 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
394 int n, gmx_bool bXF, rvec *x_f, real *charge,
396 /* Redistribute particle data for PME calculation */
397 /* domain decomposition by x coordinate */
402 if(FALSE == pme->redist_init) {
403 snew(pme->scounts,atc->nslab);
404 snew(pme->rcounts,atc->nslab);
405 snew(pme->sdispls,atc->nslab);
406 snew(pme->rdispls,atc->nslab);
407 snew(pme->sidx,atc->nslab);
408 pme->redist_init = TRUE;
410 if (n > pme->redist_buf_nalloc) {
411 pme->redist_buf_nalloc = over_alloc_dd(n);
412 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
419 /* forward, redistribution from pp to pme */
421 /* Calculate send counts and exchange them with other nodes */
422 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
423 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
424 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
426 /* Calculate send and receive displacements and index into send
431 for(i=1; i<atc->nslab; i++) {
432 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
433 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
434 pme->sidx[i]=pme->sdispls[i];
436 /* Total # of particles to be received */
437 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
439 pme_realloc_atomcomm_things(atc);
441 /* Copy particle coordinates into send buffer and exchange*/
442 for(i=0; (i<n); i++) {
443 ii=DIM*pme->sidx[pme->idxa[i]];
444 pme->sidx[pme->idxa[i]]++;
445 pme->redist_buf[ii+XX]=x_f[i][XX];
446 pme->redist_buf[ii+YY]=x_f[i][YY];
447 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
449 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
450 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
451 pme->rvec_mpi, atc->mpi_comm);
454 /* Copy charge into send buffer and exchange*/
455 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
456 for(i=0; (i<n); i++) {
457 ii=pme->sidx[pme->idxa[i]];
458 pme->sidx[pme->idxa[i]]++;
459 pme->redist_buf[ii]=charge[i];
461 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
462 atc->q, pme->rcounts, pme->rdispls, mpi_type,
465 else { /* backward, redistribution from pme to pp */
466 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
467 pme->redist_buf, pme->scounts, pme->sdispls,
468 pme->rvec_mpi, atc->mpi_comm);
470 /* Copy data from receive buffer */
471 for(i=0; i<atc->nslab; i++)
472 pme->sidx[i] = pme->sdispls[i];
473 for(i=0; (i<n); i++) {
474 ii = DIM*pme->sidx[pme->idxa[i]];
475 x_f[i][XX] += pme->redist_buf[ii+XX];
476 x_f[i][YY] += pme->redist_buf[ii+YY];
477 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
478 pme->sidx[pme->idxa[i]]++;
484 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
485 gmx_bool bBackward,int shift,
486 void *buf_s,int nbyte_s,
487 void *buf_r,int nbyte_r)
493 if (bBackward == FALSE) {
494 dest = atc->node_dest[shift];
495 src = atc->node_src[shift];
497 dest = atc->node_src[shift];
498 src = atc->node_dest[shift];
501 if (nbyte_s > 0 && nbyte_r > 0) {
502 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
504 buf_r,nbyte_r,MPI_BYTE,
506 atc->mpi_comm,&stat);
507 } else if (nbyte_s > 0) {
508 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
511 } else if (nbyte_r > 0) {
512 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
514 atc->mpi_comm,&stat);
519 static void dd_pmeredist_x_q(gmx_pme_t pme,
520 int n, gmx_bool bX, rvec *x, real *charge,
523 int *commnode,*buf_index;
524 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
526 commnode = atc->node_dest;
527 buf_index = atc->buf_index;
529 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
532 for(i=0; i<nnodes_comm; i++) {
533 buf_index[commnode[i]] = nsend;
534 nsend += atc->count[commnode[i]];
537 if (atc->count[atc->nodeid] + nsend != n)
538 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
539 "This usually means that your system is not well equilibrated.",
540 n - (atc->count[atc->nodeid] + nsend),
541 pme->nodeid,'x'+atc->dimind);
543 if (nsend > pme->buf_nalloc) {
544 pme->buf_nalloc = over_alloc_dd(nsend);
545 srenew(pme->bufv,pme->buf_nalloc);
546 srenew(pme->bufr,pme->buf_nalloc);
549 atc->n = atc->count[atc->nodeid];
550 for(i=0; i<nnodes_comm; i++) {
551 scount = atc->count[commnode[i]];
552 /* Communicate the count */
554 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
555 atc->dimind,atc->nodeid,commnode[i],scount);
556 pme_dd_sendrecv(atc,FALSE,i,
558 &atc->rcount[i],sizeof(int));
559 atc->n += atc->rcount[i];
562 pme_realloc_atomcomm_things(atc);
568 if (node == atc->nodeid) {
569 /* Copy direct to the receive buffer */
571 copy_rvec(x[i],atc->x[local_pos]);
573 atc->q[local_pos] = charge[i];
576 /* Copy to the send buffer */
578 copy_rvec(x[i],pme->bufv[buf_index[node]]);
580 pme->bufr[buf_index[node]] = charge[i];
586 for(i=0; i<nnodes_comm; i++) {
587 scount = atc->count[commnode[i]];
588 rcount = atc->rcount[i];
589 if (scount > 0 || rcount > 0) {
591 /* Communicate the coordinates */
592 pme_dd_sendrecv(atc,FALSE,i,
593 pme->bufv[buf_pos],scount*sizeof(rvec),
594 atc->x[local_pos],rcount*sizeof(rvec));
596 /* Communicate the charges */
597 pme_dd_sendrecv(atc,FALSE,i,
598 pme->bufr+buf_pos,scount*sizeof(real),
599 atc->q+local_pos,rcount*sizeof(real));
601 local_pos += atc->rcount[i];
606 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
610 int *commnode,*buf_index;
611 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
613 commnode = atc->node_dest;
614 buf_index = atc->buf_index;
616 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
618 local_pos = atc->count[atc->nodeid];
620 for(i=0; i<nnodes_comm; i++) {
621 scount = atc->rcount[i];
622 rcount = atc->count[commnode[i]];
623 if (scount > 0 || rcount > 0) {
624 /* Communicate the forces */
625 pme_dd_sendrecv(atc,TRUE,i,
626 atc->f[local_pos],scount*sizeof(rvec),
627 pme->bufv[buf_pos],rcount*sizeof(rvec));
630 buf_index[commnode[i]] = buf_pos;
640 if (node == atc->nodeid)
642 /* Add from the local force array */
643 rvec_inc(f[i],atc->f[local_pos]);
648 /* Add from the receive buffer */
649 rvec_inc(f[i],pme->bufv[buf_index[node]]);
659 if (node == atc->nodeid)
661 /* Copy from the local force array */
662 copy_rvec(atc->f[local_pos],f[i]);
667 /* Copy from the receive buffer */
668 copy_rvec(pme->bufv[buf_index[node]],f[i]);
677 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
679 pme_overlap_t *overlap;
680 int send_index0,send_nindex;
681 int recv_index0,recv_nindex;
683 int i,j,k,ix,iy,iz,icnt;
684 int ipulse,send_id,recv_id,datasize;
686 real *sendptr,*recvptr;
688 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
689 overlap = &pme->overlap[1];
691 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
693 /* Since we have already (un)wrapped the overlap in the z-dimension,
694 * we only have to communicate 0 to nkz (not pmegrid_nz).
696 if (direction==GMX_SUM_QGRID_FORWARD)
698 send_id = overlap->send_id[ipulse];
699 recv_id = overlap->recv_id[ipulse];
700 send_index0 = overlap->comm_data[ipulse].send_index0;
701 send_nindex = overlap->comm_data[ipulse].send_nindex;
702 recv_index0 = overlap->comm_data[ipulse].recv_index0;
703 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
707 send_id = overlap->recv_id[ipulse];
708 recv_id = overlap->send_id[ipulse];
709 send_index0 = overlap->comm_data[ipulse].recv_index0;
710 send_nindex = overlap->comm_data[ipulse].recv_nindex;
711 recv_index0 = overlap->comm_data[ipulse].send_index0;
712 recv_nindex = overlap->comm_data[ipulse].send_nindex;
715 /* Copy data to contiguous send buffer */
718 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
719 pme->nodeid,overlap->nodeid,send_id,
720 pme->pmegrid_start_iy,
721 send_index0-pme->pmegrid_start_iy,
722 send_index0-pme->pmegrid_start_iy+send_nindex);
725 for(i=0;i<pme->pmegrid_nx;i++)
728 for(j=0;j<send_nindex;j++)
730 iy = j + send_index0 - pme->pmegrid_start_iy;
731 for(k=0;k<pme->nkz;k++)
734 pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
739 datasize = pme->pmegrid_nx * pme->nkz;
741 MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
743 pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
745 overlap->mpi_comm,&stat);
747 /* Get data from contiguous recv buffer */
750 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
751 pme->nodeid,overlap->nodeid,recv_id,
752 pme->pmegrid_start_iy,
753 recv_index0-pme->pmegrid_start_iy,
754 recv_index0-pme->pmegrid_start_iy+recv_nindex);
757 for(i=0;i<pme->pmegrid_nx;i++)
760 for(j=0;j<recv_nindex;j++)
762 iy = j + recv_index0 - pme->pmegrid_start_iy;
763 for(k=0;k<pme->nkz;k++)
766 if(direction==GMX_SUM_QGRID_FORWARD)
768 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
772 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = pme->pmegrid_recvbuf[icnt++];
779 /* Major dimension is easier, no copying required,
780 * but we might have to sum to separate array.
781 * Since we don't copy, we have to communicate up to pmegrid_nz,
782 * not nkz as for the minor direction.
784 overlap = &pme->overlap[0];
786 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
788 if(direction==GMX_SUM_QGRID_FORWARD)
790 send_id = overlap->send_id[ipulse];
791 recv_id = overlap->recv_id[ipulse];
792 send_index0 = overlap->comm_data[ipulse].send_index0;
793 send_nindex = overlap->comm_data[ipulse].send_nindex;
794 recv_index0 = overlap->comm_data[ipulse].recv_index0;
795 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
796 recvptr = pme->pmegrid_recvbuf;
800 send_id = overlap->recv_id[ipulse];
801 recv_id = overlap->send_id[ipulse];
802 send_index0 = overlap->comm_data[ipulse].recv_index0;
803 send_nindex = overlap->comm_data[ipulse].recv_nindex;
804 recv_index0 = overlap->comm_data[ipulse].send_index0;
805 recv_nindex = overlap->comm_data[ipulse].send_nindex;
806 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
809 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
810 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
814 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
815 pme->nodeid,overlap->nodeid,send_id,
816 pme->pmegrid_start_ix,
817 send_index0-pme->pmegrid_start_ix,
818 send_index0-pme->pmegrid_start_ix+send_nindex);
819 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
820 pme->nodeid,overlap->nodeid,recv_id,
821 pme->pmegrid_start_ix,
822 recv_index0-pme->pmegrid_start_ix,
823 recv_index0-pme->pmegrid_start_ix+recv_nindex);
826 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
828 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
830 overlap->mpi_comm,&stat);
832 /* ADD data from contiguous recv buffer */
833 if(direction==GMX_SUM_QGRID_FORWARD)
835 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
836 for(i=0;i<recv_nindex*datasize;i++)
838 p[i] += pme->pmegrid_recvbuf[i];
847 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
849 ivec local_fft_ndata,local_fft_offset,local_fft_size;
854 /* Dimensions should be identical for A/B grid, so we just use A here */
855 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
860 local_pme_size[0] = pme->pmegrid_nx;
861 local_pme_size[1] = pme->pmegrid_ny;
862 local_pme_size[2] = pme->pmegrid_nz;
864 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
865 the offset is identical, and the PME grid always has more data (due to overlap)
870 char fn[STRLEN],format[STRLEN];
872 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
874 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
875 fp2 = ffopen(fn,"w");
876 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
878 for(ix=0;ix<local_fft_ndata[XX];ix++)
880 for(iy=0;iy<local_fft_ndata[YY];iy++)
882 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
884 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
885 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
886 fftgrid[fftidx] = pmegrid[pmeidx];
888 val = 100*pmegrid[pmeidx];
889 if (pmegrid[pmeidx] != 0)
890 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
891 5.0*ix,5.0*iy,5.0*iz,1.0,val);
892 if (pmegrid[pmeidx] != 0)
893 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
895 pme->pmegrid_start_ix + ix,
896 pme->pmegrid_start_iy + iy,
897 pme->pmegrid_start_iz + iz,
913 copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
915 ivec local_fft_ndata,local_fft_offset,local_fft_size;
920 /* Dimensions should be identical for A/B grid, so we just use A here */
921 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
926 local_pme_size[0] = pme->pmegrid_nx;
927 local_pme_size[1] = pme->pmegrid_ny;
928 local_pme_size[2] = pme->pmegrid_nz;
930 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
931 the offset is identical, and the PME grid always has more data (due to overlap)
933 for(ix=0;ix<local_fft_ndata[XX];ix++)
935 for(iy=0;iy<local_fft_ndata[YY];iy++)
937 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
939 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
940 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
941 pmegrid[pmeidx] = fftgrid[fftidx];
950 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
952 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
958 pnx = pme->pmegrid_nx;
959 pny = pme->pmegrid_ny;
960 pnz = pme->pmegrid_nz;
962 overlap = pme->pme_order - 1;
964 /* Add periodic overlap in z */
965 for(ix=0; ix<pnx; ix++)
967 for(iy=0; iy<pny; iy++)
969 for(iz=0; iz<overlap; iz++)
971 pmegrid[(ix*pny+iy)*pnz+iz] +=
972 pmegrid[(ix*pny+iy)*pnz+nz+iz];
977 if (pme->nnodes_minor == 1)
979 for(ix=0; ix<pnx; ix++)
981 for(iy=0; iy<overlap; iy++)
983 for(iz=0; iz<nz; iz++)
985 pmegrid[(ix*pny+iy)*pnz+iz] +=
986 pmegrid[(ix*pny+ny+iy)*pnz+iz];
992 if (pme->nnodes_major == 1)
994 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
996 for(ix=0; ix<overlap; ix++)
998 for(iy=0; iy<ny_x; iy++)
1000 for(iz=0; iz<nz; iz++)
1002 pmegrid[(ix*pny+iy)*pnz+iz] +=
1003 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1012 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1014 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1020 pnx = pme->pmegrid_nx;
1021 pny = pme->pmegrid_ny;
1022 pnz = pme->pmegrid_nz;
1024 overlap = pme->pme_order - 1;
1026 if (pme->nnodes_major == 1)
1028 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
1030 for(ix=0; ix<overlap; ix++)
1032 for(iy=0; iy<ny_x; iy++)
1034 for(iz=0; iz<nz; iz++)
1036 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1037 pmegrid[(ix*pny+iy)*pnz+iz];
1043 if (pme->nnodes_minor == 1)
1045 for(ix=0; ix<pnx; ix++)
1047 for(iy=0; iy<overlap; iy++)
1049 for(iz=0; iz<nz; iz++)
1051 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1052 pmegrid[(ix*pny+iy)*pnz+iz];
1058 /* Copy periodic overlap in z */
1059 for(ix=0; ix<pnx; ix++)
1061 for(iy=0; iy<pny; iy++)
1063 for(iz=0; iz<overlap; iz++)
1065 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1066 pmegrid[(ix*pny+iy)*pnz+iz];
1073 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1074 #define DO_BSPLINE(order) \
1075 for(ithx=0; (ithx<order); ithx++) \
1077 index_x = (i0+ithx)*pny*pnz; \
1078 valx = qn*thx[ithx]; \
1080 for(ithy=0; (ithy<order); ithy++) \
1082 valxy = valx*thy[ithy]; \
1083 index_xy = index_x+(j0+ithy)*pnz; \
1085 for(ithz=0; (ithz<order); ithz++) \
1087 index_xyz = index_xy+(k0+ithz); \
1088 grid[index_xyz] += valxy*thz[ithz]; \
1094 static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
1098 /* spread charges from home atoms to local grid */
1100 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1102 int order,norder,index_x,index_xy,index_xyz;
1104 real *thx,*thy,*thz;
1105 int localsize, bndsize;
1107 int pnx,pny,pnz,ndatatot;
1109 pnx = pme->pmegrid_nx;
1110 pny = pme->pmegrid_ny;
1111 pnz = pme->pmegrid_nz;
1112 ndatatot = pnx*pny*pnz;
1114 for(i=0;i<ndatatot;i++)
1119 order = pme->pme_order;
1121 for(nn=0; (nn<atc->n);nn++)
1128 idxptr = atc->idx[n];
1134 thx = atc->theta[XX] + norder;
1135 thy = atc->theta[YY] + norder;
1136 thz = atc->theta[ZZ] + norder;
1139 case 4: DO_BSPLINE(4); break;
1140 case 5: DO_BSPLINE(5); break;
1141 default: DO_BSPLINE(order); break;
1148 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
1149 /* Calculate exponentials through SSE in float precision */
1150 #define CALC_EXPONENTIALS(start,end,r_aligned) \
1153 for(kx=0; kx<end; kx+=4) \
1155 tmp_sse = _mm_load_ps(r_aligned+kx); \
1156 tmp_sse = gmx_mm_exp_ps(tmp_sse); \
1157 _mm_store_ps(r_aligned+kx,tmp_sse); \
1161 #define CALC_EXPONENTIALS(start,end,r) \
1162 for(kx=start; kx<end; kx++) \
1164 r[kx] = exp(r[kx]); \
1169 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1170 real ewaldcoeff,real vol,
1171 gmx_bool bEnerVir,real *mesh_energy,matrix vir)
1173 /* do recip sum over local cells in grid */
1174 /* y major, z middle, x minor or continuous */
1176 int kx,ky,kz,maxkx,maxky,maxkz;
1177 int nx,ny,nz,iy,iz,kxstart,kxend;
1179 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1180 real ets2,struct2,vfactor,ets2vf;
1181 real eterm,d1,d2,energy=0;
1183 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1184 real rxx,ryx,ryy,rzx,rzy,rzz;
1185 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
1186 real mhxk,mhyk,mhzk,m2k;
1189 ivec local_ndata,local_offset,local_size;
1195 /* Dimensions should be identical for A/B grid, so we just use A here */
1196 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1202 rxx = pme->recipbox[XX][XX];
1203 ryx = pme->recipbox[YY][XX];
1204 ryy = pme->recipbox[YY][YY];
1205 rzx = pme->recipbox[ZZ][XX];
1206 rzy = pme->recipbox[ZZ][YY];
1207 rzz = pme->recipbox[ZZ][ZZ];
1213 mhx = pme->work_mhx;
1214 mhy = pme->work_mhy;
1215 mhz = pme->work_mhz;
1217 denom = pme->work_denom;
1218 tmp1 = pme->work_tmp1;
1219 m2inv = pme->work_m2inv;
1221 for(iy=0;iy<local_ndata[YY];iy++)
1223 ky = iy + local_offset[YY];
1234 by = M_PI*vol*pme->bsp_mod[YY][ky];
1236 for(iz=0;iz<local_ndata[ZZ];iz++)
1238 kz = iz + local_offset[ZZ];
1242 bz = pme->bsp_mod[ZZ][kz];
1244 /* 0.5 correction for corner points */
1251 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1253 /* We should skip the k-space point (0,0,0) */
1254 if (local_offset[XX] > 0 ||
1255 local_offset[YY] > 0 || ky > 0 ||
1258 kxstart = local_offset[XX];
1262 kxstart = local_offset[XX] + 1;
1265 kxend = local_offset[XX] + local_ndata[XX];
1269 /* More expensive inner loop, especially because of the storage
1270 * of the mh elements in array's.
1271 * Because x is the minor grid index, all mh elements
1272 * depend on kx for triclinic unit cells.
1275 /* Two explicit loops to avoid a conditional inside the loop */
1276 for(kx=kxstart; kx<maxkx; kx++)
1281 mhyk = mx * ryx + my * ryy;
1282 mhzk = mx * rzx + my * rzy + mz * rzz;
1283 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1288 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1289 tmp1[kx] = -factor*m2k;
1292 for(kx=maxkx; kx<kxend; kx++)
1297 mhyk = mx * ryx + my * ryy;
1298 mhzk = mx * rzx + my * rzy + mz * rzz;
1299 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1304 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1305 tmp1[kx] = -factor*m2k;
1308 for(kx=kxstart; kx<kxend; kx++)
1310 m2inv[kx] = 1.0/m2[kx];
1312 for(kx=kxstart; kx<kxend; kx++)
1314 denom[kx] = 1.0/denom[kx];
1317 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1319 for(kx=kxstart; kx<kxend; kx++,p0++)
1324 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1329 struct2 = 2.0*(d1*d1+d2*d2);
1331 tmp1[kx] = eterm*struct2;
1334 for(kx=kxstart; kx<kxend; kx++)
1336 ets2 = corner_fac*tmp1[kx];
1337 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
1340 ets2vf = ets2*vfactor;
1341 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
1342 virxy += ets2vf*mhx[kx]*mhy[kx];
1343 virxz += ets2vf*mhx[kx]*mhz[kx];
1344 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
1345 viryz += ets2vf*mhy[kx]*mhz[kx];
1346 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
1351 /* We don't need to calculate the energy and the virial.
1352 * In this case the triclinic overhead is small.
1355 /* Two explicit loops to avoid a conditional inside the loop */
1357 for(kx=kxstart; kx<maxkx; kx++)
1362 mhyk = mx * ryx + my * ryy;
1363 mhzk = mx * rzx + my * rzy + mz * rzz;
1364 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1365 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1366 tmp1[kx] = -factor*m2k;
1369 for(kx=maxkx; kx<kxend; kx++)
1374 mhyk = mx * ryx + my * ryy;
1375 mhzk = mx * rzx + my * rzy + mz * rzz;
1376 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1377 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1378 tmp1[kx] = -factor*m2k;
1381 for(kx=kxstart; kx<kxend; kx++)
1383 denom[kx] = 1.0/denom[kx];
1386 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1388 for(kx=kxstart; kx<kxend; kx++,p0++)
1393 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1404 /* Update virial with local values.
1405 * The virial is symmetric by definition.
1406 * this virial seems ok for isotropic scaling, but I'm
1407 * experiencing problems on semiisotropic membranes.
1408 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
1410 vir[XX][XX] = 0.25*virxx;
1411 vir[YY][YY] = 0.25*viryy;
1412 vir[ZZ][ZZ] = 0.25*virzz;
1413 vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
1414 vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
1415 vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
1417 /* This energy should be corrected for a charged system */
1418 *mesh_energy = 0.5*energy;
1421 /* Return the loop count */
1422 return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
1426 #define DO_FSPLINE(order) \
1427 for(ithx=0; (ithx<order); ithx++) \
1429 index_x = (i0+ithx)*pny*pnz; \
1433 for(ithy=0; (ithy<order); ithy++) \
1435 index_xy = index_x+(j0+ithy)*pnz; \
1440 for(ithz=0; (ithz<order); ithz++) \
1442 gval = grid[index_xy+(k0+ithz)]; \
1443 fxy1 += thz[ithz]*gval; \
1444 fz1 += dthz[ithz]*gval; \
1453 void gather_f_bsplines(gmx_pme_t pme,real *grid,
1454 gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
1456 /* sum forces for local particles */
1457 int nn,n,ithx,ithy,ithz,i0,j0,k0;
1458 int index_x,index_xy;
1459 int nx,ny,nz,pnx,pny,pnz;
1461 real tx,ty,dx,dy,qn;
1464 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
1466 real rxx,ryx,ryy,rzx,rzy,rzz;
1469 order = pme->pme_order;
1470 thx = atc->theta[XX];
1471 thy = atc->theta[YY];
1472 thz = atc->theta[ZZ];
1473 dthx = atc->dtheta[XX];
1474 dthy = atc->dtheta[YY];
1475 dthz = atc->dtheta[ZZ];
1479 pnx = pme->pmegrid_nx;
1480 pny = pme->pmegrid_ny;
1481 pnz = pme->pmegrid_nz;
1483 rxx = pme->recipbox[XX][XX];
1484 ryx = pme->recipbox[YY][XX];
1485 ryy = pme->recipbox[YY][YY];
1486 rzx = pme->recipbox[ZZ][XX];
1487 rzy = pme->recipbox[ZZ][YY];
1488 rzz = pme->recipbox[ZZ][ZZ];
1490 for(nn=0; (nn<atc->n); nn++)
1493 qn = scale*atc->q[n];
1506 idxptr = atc->idx[n];
1513 /* Pointer arithmetic alert, next six statements */
1514 thx = atc->theta[XX] + norder;
1515 thy = atc->theta[YY] + norder;
1516 thz = atc->theta[ZZ] + norder;
1517 dthx = atc->dtheta[XX] + norder;
1518 dthy = atc->dtheta[YY] + norder;
1519 dthz = atc->dtheta[ZZ] + norder;
1522 case 4: DO_FSPLINE(4); break;
1523 case 5: DO_FSPLINE(5); break;
1524 default: DO_FSPLINE(order); break;
1527 atc->f[n][XX] += -qn*( fx*nx*rxx );
1528 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
1529 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
1532 /* Since the energy and not forces are interpolated
1533 * the net force might not be exactly zero.
1534 * This can be solved by also interpolating F, but
1535 * that comes at a cost.
1536 * A better hack is to remove the net force every
1537 * step, but that must be done at a higher level
1538 * since this routine doesn't see all atoms if running
1539 * in parallel. Don't know how important it is? EL 990726
1543 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
1544 pme_atomcomm_t *atc)
1546 int n,ithx,ithy,ithz,i0,j0,k0;
1547 int index_x,index_xy;
1549 real energy,pot,tx,ty,qn,gval;
1550 real *thx,*thy,*thz;
1555 order = pme->pme_order;
1558 for(n=0; (n<atc->n); n++) {
1562 idxptr = atc->idx[n];
1569 /* Pointer arithmetic alert, next three statements */
1570 thx = atc->theta[XX] + norder;
1571 thy = atc->theta[YY] + norder;
1572 thz = atc->theta[ZZ] + norder;
1575 for(ithx=0; (ithx<order); ithx++)
1577 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
1580 for(ithy=0; (ithy<order); ithy++)
1582 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
1585 for(ithz=0; (ithz<order); ithz++)
1587 gval = grid[index_xy+(k0+ithz)];
1588 pot += tx*ty*thz[ithz]*gval;
1601 void make_bsplines(splinevec theta,splinevec dtheta,int order,
1602 rvec fractx[],int nr,real charge[],
1603 gmx_bool bFreeEnergy)
1605 /* construct splines for local atoms */
1608 real *data,*ddata,*xptr;
1610 for(i=0; (i<nr); i++) {
1611 /* With free energy we do not use the charge check.
1612 * In most cases this will be more efficient than calling make_bsplines
1613 * twice, since usually more than half the particles have charges.
1615 if (bFreeEnergy || charge[i] != 0.0) {
1617 for(j=0; (j<DIM); j++) {
1620 /* dr is relative offset from lower cell limit */
1621 data=&(theta[j][i*order]);
1626 for(k=3; (k<order); k++) {
1628 data[k-1]=div*dr*data[k-2];
1629 for(l=1; (l<(k-1)); l++) {
1630 data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
1633 data[0]=div*(1-dr)*data[0];
1636 ddata = &(dtheta[j][i*order]);
1637 ddata[0] = -data[0];
1638 for(k=1; (k<order); k++) {
1639 ddata[k]=data[k-1]-data[k];
1643 data[order-1]=div*dr*data[order-2];
1644 for(l=1; (l<(order-1)); l++) {
1645 data[order-l-1]=div*((dr+l)*data[order-l-2]+
1646 (order-l-dr)*data[order-l-1]);
1648 data[0]=div*(1-dr)*data[0];
1655 void make_dft_mod(real *mod,real *data,int ndata)
1660 for(i=0;i<ndata;i++) {
1662 for(j=0;j<ndata;j++) {
1663 arg=(2.0*M_PI*i*j)/ndata;
1664 sc+=data[j]*cos(arg);
1665 ss+=data[j]*sin(arg);
1669 for(i=0;i<ndata;i++)
1671 mod[i]=(mod[i-1]+mod[i+1])*0.5;
1676 void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
1678 int nmax=max(nx,max(ny,nz));
1679 real *data,*ddata,*bsp_data;
1685 snew(bsp_data,nmax);
1691 for(k=3;k<order;k++) {
1694 for(l=1;l<(k-1);l++)
1695 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
1696 data[0]=div*data[0];
1700 for(k=1;k<order;k++)
1701 ddata[k]=data[k-1]-data[k];
1704 for(l=1;l<(order-1);l++)
1705 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
1706 data[0]=div*data[0];
1710 for(i=1;i<=order;i++)
1711 bsp_data[i]=data[i-1];
1713 make_dft_mod(bsp_mod[XX],bsp_data,nx);
1714 make_dft_mod(bsp_mod[YY],bsp_data,ny);
1715 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
1722 static void setup_coordinate_communication(pme_atomcomm_t *atc)
1730 for(i=1; i<=nslab/2; i++) {
1731 fw = (atc->nodeid + i) % nslab;
1732 bw = (atc->nodeid - i + nslab) % nslab;
1733 if (n < nslab - 1) {
1734 atc->node_dest[n] = fw;
1735 atc->node_src[n] = bw;
1738 if (n < nslab - 1) {
1739 atc->node_dest[n] = bw;
1740 atc->node_src[n] = fw;
1746 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
1750 fprintf(log,"Destroying PME data structures.\n");
1753 sfree((*pmedata)->nnx);
1754 sfree((*pmedata)->nny);
1755 sfree((*pmedata)->nnz);
1757 sfree((*pmedata)->pmegridA);
1758 sfree((*pmedata)->fftgridA);
1759 sfree((*pmedata)->cfftgridA);
1760 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
1762 if((*pmedata)->pmegridB)
1764 sfree((*pmedata)->pmegridB);
1765 sfree((*pmedata)->fftgridB);
1766 sfree((*pmedata)->cfftgridB);
1767 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
1769 sfree((*pmedata)->work_mhz);
1770 sfree((*pmedata)->work_m2);
1771 sfree((*pmedata)->work_denom);
1772 sfree((*pmedata)->work_tmp1_alloc);
1773 sfree((*pmedata)->work_m2inv);
1781 static int mult_up(int n,int f)
1783 return ((n + f - 1)/f)*f;
1787 static double pme_load_imbalance(gmx_pme_t pme)
1792 nma = pme->nnodes_major;
1793 nmi = pme->nnodes_minor;
1795 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
1796 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
1797 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
1799 /* pme_solve is roughly double the cost of an fft */
1801 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
1804 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
1805 int dimind,gmx_bool bSpread)
1809 atc->dimind = dimind;
1814 if (pme->nnodes > 1)
1816 atc->mpi_comm = pme->mpi_comm_d[dimind];
1817 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
1818 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
1822 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
1826 atc->bSpread = bSpread;
1827 atc->pme_order = pme->pme_order;
1831 /* These three allocations are not required for particle decomp. */
1832 snew(atc->node_dest,atc->nslab);
1833 snew(atc->node_src,atc->nslab);
1834 setup_coordinate_communication(atc);
1836 snew(atc->count,atc->nslab);
1837 snew(atc->rcount,atc->nslab);
1838 snew(atc->buf_index,atc->nslab);
1843 init_overlap_comm(pme_overlap_t * ol,
1852 int lbnd,rbnd,maxlr,b,i;
1855 pme_grid_comm_t *pgc;
1857 int fft_start,fft_end,send_index1,recv_index1;
1860 ol->mpi_comm = comm;
1863 ol->nnodes = nnodes;
1864 ol->nodeid = nodeid;
1866 /* Linear translation of the PME grid wo'nt affect reciprocal space
1867 * calculations, so to optimize we only interpolate "upwards",
1868 * which also means we only have to consider overlap in one direction.
1869 * I.e., particles on this node might also be spread to grid indices
1870 * that belong to higher nodes (modulo nnodes)
1873 snew(ol->s2g0,ol->nnodes+1);
1874 snew(ol->s2g1,ol->nnodes);
1875 if (debug) { fprintf(debug,"PME slab boundaries:"); }
1876 for(i=0; i<nnodes; i++)
1878 /* s2g0 the local interpolation grid start.
1879 * s2g1 the local interpolation grid end.
1880 * Because grid overlap communication only goes forward,
1881 * the grid the slabs for fft's should be rounded down.
1883 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
1884 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
1888 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
1891 ol->s2g0[nnodes] = ndata;
1892 if (debug) { fprintf(debug,"\n"); }
1894 /* Determine with how many nodes we need to communicate the grid overlap */
1900 for(i=0; i<nnodes; i++)
1902 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
1903 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
1909 while (bCont && b < nnodes);
1910 ol->noverlap_nodes = b - 1;
1912 snew(ol->send_id,ol->noverlap_nodes);
1913 snew(ol->recv_id,ol->noverlap_nodes);
1914 for(b=0; b<ol->noverlap_nodes; b++)
1916 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
1917 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
1919 snew(ol->comm_data, ol->noverlap_nodes);
1921 for(b=0; b<ol->noverlap_nodes; b++)
1923 pgc = &ol->comm_data[b];
1925 fft_start = ol->s2g0[ol->send_id[b]];
1926 fft_end = ol->s2g0[ol->send_id[b]+1];
1927 if (ol->send_id[b] < nodeid)
1932 send_index1 = ol->s2g1[nodeid];
1933 send_index1 = min(send_index1,fft_end);
1934 pgc->send_index0 = fft_start;
1935 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
1937 /* We always start receiving to the first index of our slab */
1938 fft_start = ol->s2g0[ol->nodeid];
1939 fft_end = ol->s2g0[ol->nodeid+1];
1940 recv_index1 = ol->s2g1[ol->recv_id[b]];
1941 if (ol->recv_id[b] > nodeid)
1943 recv_index1 -= ndata;
1945 recv_index1 = min(recv_index1,fft_end);
1946 pgc->recv_index0 = fft_start;
1947 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
1952 make_gridindex5_to_localindex(int n,int local_start,int local_range,
1953 int **global_to_local,
1954 real **fraction_shift)
1962 for(i=0; (i<5*n); i++)
1964 /* Determine the global to local grid index */
1965 gtl[i] = (i - local_start + n) % n;
1966 /* For coordinates that fall within the local grid the fraction
1967 * is correct, we don't need to shift it.
1970 if (local_range < n)
1972 /* Due to rounding issues i could be 1 beyond the lower or
1973 * upper boundary of the local grid. Correct the index for this.
1974 * If we shift the index, we need to shift the fraction by
1975 * the same amount in the other direction to not affect
1977 * Note that due to this shifting the weights at the end of
1978 * the spline might change, but that will only involve values
1979 * between zero and values close to the precision of a real,
1980 * which is anyhow the accuracy of the whole mesh calculation.
1982 /* With local_range=0 we should not change i=local_start */
1983 if (i % n != local_start)
1990 else if (gtl[i] == local_range)
1992 gtl[i] = local_range - 1;
1999 *global_to_local = gtl;
2000 *fraction_shift = fsh;
2004 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2008 if (*nk % nnodes != 0)
2010 nk_new = nnodes*(*nk/nnodes + 1);
2012 if (2*nk_new >= 3*(*nk))
2014 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2015 dim,*nk,dim,nnodes,dim);
2020 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2021 dim,*nk,dim,nnodes,dim,nk_new,dim);
2028 int gmx_pme_init(gmx_pme_t * pmedata,
2034 gmx_bool bFreeEnergy,
2035 gmx_bool bReproducible)
2039 pme_atomcomm_t *atc;
2040 int bufsizex,bufsizey,bufsize;
2044 fprintf(debug,"Creating PME data structures.\n");
2047 pme->redist_init = FALSE;
2048 pme->sum_qgrid_tmp = NULL;
2049 pme->sum_qgrid_dd_tmp = NULL;
2050 pme->buf_nalloc = 0;
2051 pme->redist_buf_nalloc = 0;
2054 pme->bPPnode = TRUE;
2056 pme->nnodes_major = nnodes_major;
2057 pme->nnodes_minor = nnodes_minor;
2060 if (nnodes_major*nnodes_minor > 1 && PAR(cr))
2062 pme->mpi_comm = cr->mpi_comm_mygroup;
2064 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2065 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2066 if (pme->nnodes != nnodes_major*nnodes_minor)
2068 gmx_incons("PME node count mismatch");
2073 if (pme->nnodes == 1)
2075 pme->ndecompdim = 0;
2076 pme->nodeid_major = 0;
2077 pme->nodeid_minor = 0;
2081 if (nnodes_minor == 1)
2084 pme->mpi_comm_d[0] = pme->mpi_comm;
2085 pme->mpi_comm_d[1] = NULL;
2087 pme->ndecompdim = 1;
2088 pme->nodeid_major = pme->nodeid;
2089 pme->nodeid_minor = 0;
2092 else if (nnodes_major == 1)
2095 pme->mpi_comm_d[0] = NULL;
2096 pme->mpi_comm_d[1] = pme->mpi_comm;
2098 pme->ndecompdim = 1;
2099 pme->nodeid_major = 0;
2100 pme->nodeid_minor = pme->nodeid;
2104 if (pme->nnodes % nnodes_major != 0)
2106 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2108 pme->ndecompdim = 2;
2111 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
2112 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
2113 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
2114 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
2116 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
2117 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
2118 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
2119 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
2122 pme->bPPnode = (cr->duty & DUTY_PP);
2125 if (ir->ePBC == epbcSCREW)
2127 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
2130 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
2134 pme->pme_order = ir->pme_order;
2135 pme->epsilon_r = ir->epsilon_r;
2137 /* Currently pme.c supports only the fft5d FFT code.
2138 * Therefore the grid always needs to be divisible by nnodes.
2139 * When the old 1D code is also supported again, change this check.
2141 * This check should be done before calling gmx_pme_init
2142 * and fplog should be passed iso stderr.
2144 if (pme->ndecompdim >= 2)
2146 if (pme->ndecompdim >= 1)
2149 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2150 'x',nnodes_major,&pme->nkx);
2151 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2152 'y',nnodes_minor,&pme->nky);
2156 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
2157 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
2158 pme->nkz <= pme->pme_order)
2160 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
2163 if (pme->nnodes > 1) {
2167 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
2168 MPI_Type_commit(&(pme->rvec_mpi));
2171 /* Note that the charge spreading and force gathering, which usually
2172 * takes about the same amount of time as FFT+solve_pme,
2173 * is always fully load balanced
2174 * (unless the charge distribution is inhomogeneous).
2177 imbal = pme_load_imbalance(pme);
2178 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
2182 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
2183 " For optimal PME load balancing\n"
2184 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
2185 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
2187 (int)((imbal-1)*100 + 0.5),
2188 pme->nkx,pme->nky,pme->nnodes_major,
2189 pme->nky,pme->nkz,pme->nnodes_minor);
2193 init_overlap_comm(&pme->overlap[0],pme->pme_order,
2197 pme->nnodes_major,pme->nodeid_major,pme->nkx);
2199 init_overlap_comm(&pme->overlap[1],pme->pme_order,
2203 pme->nnodes_minor,pme->nodeid_minor,pme->nky);
2205 snew(pme->bsp_mod[XX],pme->nkx);
2206 snew(pme->bsp_mod[YY],pme->nky);
2207 snew(pme->bsp_mod[ZZ],pme->nkz);
2209 /* Allocate data for the interpolation grid, including overlap */
2210 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
2211 pme->overlap[0].s2g0[pme->nodeid_major];
2212 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
2213 pme->overlap[1].s2g0[pme->nodeid_minor];
2214 pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
2216 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
2217 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
2218 pme->pmegrid_start_iz = 0;
2220 make_gridindex5_to_localindex(pme->nkx,
2221 pme->pmegrid_start_ix,
2222 pme->pmegrid_nx - (pme->pme_order-1),
2223 &pme->nnx,&pme->fshx);
2224 make_gridindex5_to_localindex(pme->nky,
2225 pme->pmegrid_start_iy,
2226 pme->pmegrid_ny - (pme->pme_order-1),
2227 &pme->nny,&pme->fshy);
2228 make_gridindex5_to_localindex(pme->nkz,
2229 pme->pmegrid_start_iz,
2230 pme->pmegrid_nz - (pme->pme_order-1),
2231 &pme->nnz,&pme->fshz);
2233 snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2235 /* For non-divisible grid we need pme_order iso pme_order-1 */
2236 /* x overlap is copied in place: take padding into account.
2237 * y is always copied through a buffer: we don't need padding in z,
2238 * but we do need the overlap in x because of the communication order.
2240 bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
2241 bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
2242 bufsize = (bufsizex>bufsizey) ? bufsizex : bufsizey;
2244 snew(pme->pmegrid_sendbuf,bufsize);
2245 snew(pme->pmegrid_recvbuf,bufsize);
2247 ndata[0] = pme->nkx;
2248 ndata[1] = pme->nky;
2249 ndata[2] = pme->nkz;
2251 /* This routine will allocate the grid data to fit the FFTs */
2252 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
2253 &pme->fftgridA,&pme->cfftgridA,
2255 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2260 snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2261 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
2262 &pme->fftgridB,&pme->cfftgridB,
2264 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2268 pme->pmegridB = NULL;
2269 pme->fftgridB = NULL;
2270 pme->cfftgridB = NULL;
2273 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
2275 /* Use atc[0] for spreading */
2276 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
2277 if (pme->ndecompdim >= 2)
2279 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
2282 if (pme->nnodes == 1) {
2283 pme->atc[0].n = homenr;
2284 pme_realloc_atomcomm_things(&pme->atc[0]);
2287 /* Use fft5d, order after FFT is y major, z, x minor */
2288 pme->work_nalloc = pme->nkx;
2289 snew(pme->work_mhx,pme->work_nalloc);
2290 snew(pme->work_mhy,pme->work_nalloc);
2291 snew(pme->work_mhz,pme->work_nalloc);
2292 snew(pme->work_m2,pme->work_nalloc);
2293 snew(pme->work_denom,pme->work_nalloc);
2294 /* Allocate an aligned pointer for SSE operations, including 3 extra
2295 * elements at the end since SSE operates on 4 elements at a time.
2297 snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
2298 pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
2299 snew(pme->work_m2inv,pme->work_nalloc);
2306 static void spread_on_grid(gmx_pme_t pme,
2307 pme_atomcomm_t *atc,real *grid,
2308 gmx_bool bCalcSplines,gmx_bool bSpread)
2313 /* Compute fftgrid index for all atoms,
2314 * with help of some extra variables.
2316 calc_interpolation_idx(pme,atc);
2318 /* make local bsplines */
2319 make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
2320 atc->fractx,atc->n,atc->q,pme->bFEP);
2325 /* put local atoms on grid. */
2326 spread_q_bsplines(pme,atc,grid);
2330 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
2332 pme_atomcomm_t *atc;
2335 if (pme->nnodes > 1)
2337 gmx_incons("gmx_pme_calc_energy called in parallel");
2341 gmx_incons("gmx_pme_calc_energy with free energy");
2344 atc = &pme->atc_energy;
2346 atc->bSpread = TRUE;
2347 atc->pme_order = pme->pme_order;
2349 pme_realloc_atomcomm_things(atc);
2353 /* We only use the A-charges grid */
2354 grid = pme->pmegridA;
2356 spread_on_grid(pme,atc,NULL,TRUE,FALSE);
2358 *V = gather_energy_bsplines(pme,grid,atc);
2362 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
2363 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
2365 /* Reset all the counters related to performance over the run */
2366 wallcycle_stop(wcycle,ewcRUN);
2367 wallcycle_reset_all(wcycle);
2369 ir->init_step += step_rel;
2370 ir->nsteps -= step_rel;
2371 wallcycle_start(wcycle,ewcRUN);
2375 int gmx_pmeonly(gmx_pme_t pme,
2376 t_commrec *cr, t_nrnb *nrnb,
2377 gmx_wallcycle_t wcycle,
2378 real ewaldcoeff, gmx_bool bGatherOnly,
2381 gmx_pme_pp_t pme_pp;
2384 rvec *x_pp=NULL,*f_pp=NULL;
2385 real *chargeA=NULL,*chargeB=NULL;
2387 int maxshift_x=0,maxshift_y=0;
2388 real energy,dvdlambda;
2393 gmx_large_int_t step,step_rel;
2396 pme_pp = gmx_pme_pp_init(cr);
2401 do /****** this is a quasi-loop over time steps! */
2403 /* Domain decomposition */
2404 natoms = gmx_pme_recv_q_x(pme_pp,
2405 &chargeA,&chargeB,box,&x_pp,&f_pp,
2406 &maxshift_x,&maxshift_y,
2412 /* We should stop: break out of the loop */
2416 step_rel = step - ir->init_step;
2419 wallcycle_start(wcycle,ewcRUN);
2421 wallcycle_start(wcycle,ewcPMEMESH);
2425 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
2426 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
2427 &energy,lambda,&dvdlambda,
2428 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
2430 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
2432 gmx_pme_send_force_vir_ener(pme_pp,
2433 f_pp,vir,energy,dvdlambda,
2438 if (step_rel == wcycle_get_reset_counters(wcycle))
2440 /* Reset all the counters related to performance over the run */
2441 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
2442 wcycle_set_reset_counters(wcycle, 0);
2445 } /***** end of quasi-loop, we stop with the break above */
2451 int gmx_pme_do(gmx_pme_t pme,
2452 int start, int homenr,
2454 real *chargeA, real *chargeB,
2455 matrix box, t_commrec *cr,
2456 int maxshift_x, int maxshift_y,
2457 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2458 matrix vir, real ewaldcoeff,
2459 real *energy, real lambda,
2460 real *dvdlambda, int flags)
2462 int q,d,i,j,ntot,npme;
2466 pme_atomcomm_t *atc=NULL;
2470 real *charge=NULL,*q_d,vol;
2474 gmx_parallel_3dfft_t pfft_setup;
2476 t_complex * cfftgrid;
2478 if (pme->nnodes > 1) {
2481 if (atc->npd > atc->pd_nalloc) {
2482 atc->pd_nalloc = over_alloc_dd(atc->npd);
2483 srenew(atc->pd,atc->pd_nalloc);
2485 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2489 /* This could be necessary for TPI */
2490 pme->atc[0].n = homenr;
2493 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
2495 grid = pme->pmegridA;
2496 fftgrid = pme->fftgridA;
2497 cfftgrid = pme->cfftgridA;
2498 pfft_setup = pme->pfft_setupA;
2499 charge = chargeA+start;
2501 grid = pme->pmegridB;
2502 fftgrid = pme->fftgridB;
2503 cfftgrid = pme->cfftgridB;
2504 pfft_setup = pme->pfft_setupB;
2505 charge = chargeB+start;
2507 /* Unpack structure */
2509 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
2510 cr->nnodes,cr->nodeid);
2511 fprintf(debug,"Grid = %p\n",(void*)grid);
2513 gmx_fatal(FARGS,"No grid!");
2517 m_inv_ur0(box,pme->recipbox);
2519 if (pme->nnodes == 1) {
2521 if (DOMAINDECOMP(cr)) {
2523 pme_realloc_atomcomm_things(atc);
2529 wallcycle_start(wcycle,ewcPME_REDISTXF);
2530 for(d=pme->ndecompdim-1; d>=0; d--)
2532 if (d == pme->ndecompdim-1)
2540 n_d = pme->atc[d+1].n;
2546 if (atc->npd > atc->pd_nalloc) {
2547 atc->pd_nalloc = over_alloc_dd(atc->npd);
2548 srenew(atc->pd,atc->pd_nalloc);
2550 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2551 pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
2554 GMX_BARRIER(cr->mpi_comm_mygroup);
2555 /* Redistribute x (only once) and qA or qB */
2556 if (DOMAINDECOMP(cr)) {
2557 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
2559 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
2564 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2568 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
2571 if (flags & GMX_PME_SPREAD_Q)
2573 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2575 /* Spread the charges on a grid */
2576 GMX_MPE_LOG(ev_spread_on_grid_start);
2578 /* Spread the charges on a grid */
2579 spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
2580 GMX_MPE_LOG(ev_spread_on_grid_finish);
2584 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
2586 inc_nrnb(nrnb,eNR_SPREADQBSP,
2587 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
2589 wrap_periodic_pmegrid(pme,grid);
2591 /* sum contributions to local grid from other nodes */
2593 if (pme->nnodes > 1) {
2594 GMX_BARRIER(cr->mpi_comm_mygroup);
2595 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
2601 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
2603 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2606 if (flags & GMX_PME_SOLVE)
2609 GMX_BARRIER(cr->mpi_comm_mygroup);
2610 GMX_MPE_LOG(ev_gmxfft3d_start);
2611 wallcycle_start(wcycle,ewcPME_FFT);
2612 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
2613 wallcycle_stop(wcycle,ewcPME_FFT);
2614 GMX_MPE_LOG(ev_gmxfft3d_finish);
2617 /* solve in k-space for our local cells */
2619 GMX_BARRIER(cr->mpi_comm_mygroup);
2620 GMX_MPE_LOG(ev_solve_pme_start);
2621 wallcycle_start(wcycle,ewcPME_SOLVE);
2623 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
2624 flags & GMX_PME_CALC_ENER_VIR,
2625 &energy_AB[q],vir_AB[q]);
2626 wallcycle_stop(wcycle,ewcPME_SOLVE);
2628 GMX_MPE_LOG(ev_solve_pme_finish);
2629 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
2632 if ((flags & GMX_PME_CALC_F) ||
2633 (flags & GMX_PME_CALC_POT))
2637 GMX_BARRIER(cr->mpi_comm_mygroup);
2638 GMX_MPE_LOG(ev_gmxfft3d_start);
2640 wallcycle_start(wcycle,ewcPME_FFT);
2641 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
2642 wallcycle_stop(wcycle,ewcPME_FFT);
2645 GMX_MPE_LOG(ev_gmxfft3d_finish);
2647 if (pme->nodeid == 0)
2649 ntot = pme->nkx*pme->nky*pme->nkz;
2650 npme = ntot*log((real)ntot)/log(2.0);
2651 inc_nrnb(nrnb,eNR_FFT,2*npme);
2654 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2656 copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
2658 /* distribute local grid to all nodes */
2660 if (pme->nnodes > 1) {
2661 GMX_BARRIER(cr->mpi_comm_mygroup);
2662 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
2667 unwrap_periodic_pmegrid(pme,grid);
2670 if (flags & GMX_PME_CALC_F)
2672 /* interpolate forces for our local atoms */
2673 GMX_BARRIER(cr->mpi_comm_mygroup);
2674 GMX_MPE_LOG(ev_gather_f_bsplines_start);
2678 /* If we are running without parallelization,
2679 * atc->f is the actual force array, not a buffer,
2680 * therefore we should not clear it.
2682 bClearF = (q == 0 && PAR(cr));
2683 gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
2684 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
2687 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
2689 inc_nrnb(nrnb,eNR_GATHERFBSP,
2690 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
2691 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2695 if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
2696 wallcycle_start(wcycle,ewcPME_REDISTXF);
2697 for(d=0; d<pme->ndecompdim; d++)
2700 if (d == pme->ndecompdim - 1)
2707 n_d = pme->atc[d+1].n;
2708 f_d = pme->atc[d+1].f;
2710 GMX_BARRIER(cr->mpi_comm_mygroup);
2711 if (DOMAINDECOMP(cr)) {
2712 dd_pmeredist_f(pme,atc,n_d,f_d,
2713 d==pme->ndecompdim-1 && pme->bPPnode);
2715 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
2719 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2724 *energy = energy_AB[0];
2725 m_add(vir,vir_AB[0],vir);
2727 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
2728 *dvdlambda += energy_AB[1] - energy_AB[0];
2729 for(i=0; i<DIM; i++)
2730 for(j=0; j<DIM; j++)
2731 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
2735 fprintf(debug,"PME mesh energy: %g\n",*energy);