1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
77 #include "gmxcomplex.h"
81 #include "gmx_fatal.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
91 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
92 #include "gmx_sse2_single.h"
95 #include "mpelogging.h"
98 /* #define PRT_FORCE */
99 /* conditions for on-the-fly time measurement */
100 /* #define TAKETIME (step > 1 && timesteps < 10) */
101 #define TAKETIME FALSE
104 #define mpi_type MPI_DOUBLE
106 #define mpi_type MPI_FLOAT
109 /* Internal datastructures */
125 int *send_id,*recv_id;
126 pme_grid_comm_t *comm_data;
130 int dimind; /* The index of the dimension, 0=x, 1=y */
137 int *node_dest; /* The nodes to send x and q to with DD */
138 int *node_src; /* The nodes to receive x and q from with DD */
139 int *buf_index; /* Index for commnode into the buffers */
146 int *count; /* The number of atoms to send to each node */
147 int *rcount; /* The number of atoms to receive */
154 gmx_bool bSpread; /* These coordinates are used for spreading */
156 splinevec theta,dtheta;
158 rvec *fractx; /* Fractional coordinate relative to the
159 * lower cell boundary
163 typedef struct gmx_pme {
164 int ndecompdim; /* The number of decomposition dimensions */
165 int nodeid; /* Our nodeid in mpi->mpi_comm */
168 int nnodes; /* The number of nodes doing PME */
173 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
175 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
178 gmx_bool bPPnode; /* Node also does particle-particle forces */
179 gmx_bool bFEP; /* Compute Free energy contribution */
180 int nkx,nky,nkz; /* Grid dimensions */
184 real * pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
186 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
187 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
189 real * pmegrid_sendbuf;
190 real * pmegrid_recvbuf;
192 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
193 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
194 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
196 t_complex *cfftgridA; /* Grids for complex FFT data */
197 t_complex *cfftgridB;
198 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
200 gmx_parallel_3dfft_t pfft_setupA;
201 gmx_parallel_3dfft_t pfft_setupB;
204 real *fshx,*fshy,*fshz;
206 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
210 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
213 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
215 rvec *bufv; /* Communication buffer */
216 real *bufr; /* Communication buffer */
217 int buf_nalloc; /* The communication buffer size */
219 /* work data for solve_pme */
226 real * work_tmp1_alloc;
230 /* Work data for PME_redist */
231 gmx_bool redist_init;
239 int redist_buf_nalloc;
241 /* Work data for sum_qgrid */
242 real * sum_qgrid_tmp;
243 real * sum_qgrid_dd_tmp;
247 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
250 int *idxptr,tix,tiy,tiz;
251 real *xptr,*fptr,tx,ty,tz;
252 real rxx,ryx,ryy,rzx,rzy,rzz;
254 int start_ix,start_iy,start_iz;
260 start_ix = pme->pmegrid_start_ix;
261 start_iy = pme->pmegrid_start_iy;
262 start_iz = pme->pmegrid_start_iz;
264 rxx = pme->recipbox[XX][XX];
265 ryx = pme->recipbox[YY][XX];
266 ryy = pme->recipbox[YY][YY];
267 rzx = pme->recipbox[ZZ][XX];
268 rzy = pme->recipbox[ZZ][YY];
269 rzz = pme->recipbox[ZZ][ZZ];
271 for(i=0; (i<atc->n); i++) {
273 idxptr = atc->idx[i];
274 fptr = atc->fractx[i];
276 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
277 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
278 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
279 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
285 /* Because decomposition only occurs in x and y,
286 * we never have a fraction correction in z.
288 fptr[XX] = tx - tix + pme->fshx[tix];
289 fptr[YY] = ty - tiy + pme->fshy[tiy];
292 idxptr[XX] = pme->nnx[tix];
293 idxptr[YY] = pme->nny[tiy];
294 idxptr[ZZ] = pme->nnz[tiz];
297 range_check(idxptr[XX],0,pme->pmegrid_nx);
298 range_check(idxptr[YY],0,pme->pmegrid_ny);
299 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
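/* Worked example (added for clarity; the numbers are illustrative, not from
 * the original code).  With nx = 32 and a particle whose fractional
 * coordinate along the first box vector is 0.25:
 *   tx = 32*(0.25 + 2.0) = 72,  tix = 72,  fptr[XX] = 72 - 72 = 0
 * and nnx[72] folds the shifted index back onto the local grid, e.g.
 * (72 - start + 32) % 32 = 8 = 0.25*32 when the local grid starts at 0.
 * The "+ 2.0" only guarantees that tx stays positive for triclinic boxes,
 * where x*rxx + y*ryx + z*rzx can come out slightly negative.
 */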
304 static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
310 real rxx,ryx,rzx,ryy,rzy;
313 /* Calculate PME task index (pidx) for each grid index.
314 * Here we always assign equally sized slabs to each node
315 * for load balancing reasons (the PME grid spacing is not used).
322 /* Reset the count */
323 for(i=0; i<nslab; i++)
328 if (atc->dimind == 0)
330 rxx = recipbox[XX][XX];
331 ryx = recipbox[YY][XX];
332 rzx = recipbox[ZZ][XX];
333 /* Calculate the node index in x-dimension */
334 for(i=0; (i<natoms); i++)
337 /* Fractional coordinates along box vectors */
338 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
339 si = (int)(s + 2*nslab) % nslab;
346 ryy = recipbox[YY][YY];
347 rzy = recipbox[ZZ][YY];
348 /* Calculate the node index in y-dimension */
349 for(i=0; (i<natoms); i++)
352 /* Fractional coordinates along box vectors */
353 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
354 si = (int)(s + 2*nslab) % nslab;
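/* Illustrative note (added, not in the original): the (int)(s + 2*nslab) % nslab
 * idiom maps fractional slab coordinates that are slightly negative or slightly
 * beyond the box back onto a valid slab.  E.g. with nslab = 4 and s = -0.3:
 *   (int)(-0.3 + 8) = 7,  7 % 4 = 3,
 * so the particle is counted for the last slab instead of producing a
 * negative index.
 */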
361 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
365 if (atc->n > atc->nalloc) {
366 nalloc_old = atc->nalloc;
367 atc->nalloc = over_alloc_dd(atc->n);
369 if (atc->nslab > 1) {
370 srenew(atc->x,atc->nalloc);
371 srenew(atc->q,atc->nalloc);
372 srenew(atc->f,atc->nalloc);
373 for(i=nalloc_old; i<atc->nalloc; i++)
375 clear_rvec(atc->f[i]);
380 srenew(atc->theta[i] ,atc->pme_order*atc->nalloc);
381 srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
383 srenew(atc->fractx,atc->nalloc);
384 srenew(atc->idx ,atc->nalloc);
389 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
390 int n, gmx_bool bXF, rvec *x_f, real *charge,
392 /* Redistribute particle data for PME calculation */
393 /* domain decomposition by x coordinate */
398 if(FALSE == pme->redist_init) {
399 snew(pme->scounts,atc->nslab);
400 snew(pme->rcounts,atc->nslab);
401 snew(pme->sdispls,atc->nslab);
402 snew(pme->rdispls,atc->nslab);
403 snew(pme->sidx,atc->nslab);
404 pme->redist_init = TRUE;
406 if (n > pme->redist_buf_nalloc) {
407 pme->redist_buf_nalloc = over_alloc_dd(n);
408 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
415 /* forward, redistribution from pp to pme */
417 /* Calculate send counts and exchange them with other nodes */
418 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
419 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
420 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
422 /* Calculate send and receive displacements and index into the send buffer */
427 for(i=1; i<atc->nslab; i++) {
428 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
429 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
430 pme->sidx[i]=pme->sdispls[i];
432 /* Total # of particles to be received */
433 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
435 pme_realloc_atomcomm_things(atc);
437 /* Copy particle coordinates into send buffer and exchange*/
438 for(i=0; (i<n); i++) {
439 ii=DIM*pme->sidx[pme->idxa[i]];
440 pme->sidx[pme->idxa[i]]++;
441 pme->redist_buf[ii+XX]=x_f[i][XX];
442 pme->redist_buf[ii+YY]=x_f[i][YY];
443 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
445 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
446 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
447 pme->rvec_mpi, atc->mpi_comm);
450 /* Copy charge into send buffer and exchange*/
451 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
452 for(i=0; (i<n); i++) {
453 ii=pme->sidx[pme->idxa[i]];
454 pme->sidx[pme->idxa[i]]++;
455 pme->redist_buf[ii]=charge[i];
457 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
458 atc->q, pme->rcounts, pme->rdispls, mpi_type,
461 else { /* backward, redistribution from pme to pp */
462 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
463 pme->redist_buf, pme->scounts, pme->sdispls,
464 pme->rvec_mpi, atc->mpi_comm);
466 /* Copy data from receive buffer */
467 for(i=0; i<atc->nslab; i++)
468 pme->sidx[i] = pme->sdispls[i];
469 for(i=0; (i<n); i++) {
470 ii = DIM*pme->sidx[pme->idxa[i]];
471 x_f[i][XX] += pme->redist_buf[ii+XX];
472 x_f[i][YY] += pme->redist_buf[ii+YY];
473 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
474 pme->sidx[pme->idxa[i]]++;
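/* Illustrative sketch (not part of the original sources; the function name is
 * hypothetical): the counts-to-displacements bookkeeping used above for
 * MPI_Alltoallv, written as a stand-alone helper.  Given per-slab element
 * counts it fills the displacement array and returns the total element count.
 */
static int pme_counts_to_displs_sketch(int nslab, const int counts[], int displs[])
{
    int i, ntot;

    displs[0] = 0;
    for (i = 1; i < nslab; i++)
    {
        /* Each slab's data starts right after the data of the previous slab */
        displs[i] = displs[i-1] + counts[i-1];
    }
    ntot = displs[nslab-1] + counts[nslab-1];

    return ntot;
}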
480 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
481 gmx_bool bBackward,int shift,
482 void *buf_s,int nbyte_s,
483 void *buf_r,int nbyte_r)
489 if (bBackward == FALSE) {
490 dest = atc->node_dest[shift];
491 src = atc->node_src[shift];
493 dest = atc->node_src[shift];
494 src = atc->node_dest[shift];
497 if (nbyte_s > 0 && nbyte_r > 0) {
498 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
500 buf_r,nbyte_r,MPI_BYTE,
502 atc->mpi_comm,&stat);
503 } else if (nbyte_s > 0) {
504 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
507 } else if (nbyte_r > 0) {
508 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
510 atc->mpi_comm,&stat);
515 static void dd_pmeredist_x_q(gmx_pme_t pme,
516 int n, gmx_bool bX, rvec *x, real *charge,
519 int *commnode,*buf_index;
520 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
522 commnode = atc->node_dest;
523 buf_index = atc->buf_index;
525 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
528 for(i=0; i<nnodes_comm; i++) {
529 buf_index[commnode[i]] = nsend;
530 nsend += atc->count[commnode[i]];
533 if (atc->count[atc->nodeid] + nsend != n)
534 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
535 "This usually means that your system is not well equilibrated.",
536 n - (atc->count[atc->nodeid] + nsend),
537 pme->nodeid,'x'+atc->dimind);
539 if (nsend > pme->buf_nalloc) {
540 pme->buf_nalloc = over_alloc_dd(nsend);
541 srenew(pme->bufv,pme->buf_nalloc);
542 srenew(pme->bufr,pme->buf_nalloc);
545 atc->n = atc->count[atc->nodeid];
546 for(i=0; i<nnodes_comm; i++) {
547 scount = atc->count[commnode[i]];
548 /* Communicate the count */
550 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
551 atc->dimind,atc->nodeid,commnode[i],scount);
552 pme_dd_sendrecv(atc,FALSE,i,
554 &atc->rcount[i],sizeof(int));
555 atc->n += atc->rcount[i];
558 pme_realloc_atomcomm_things(atc);
564 if (node == atc->nodeid) {
565 /* Copy direct to the receive buffer */
567 copy_rvec(x[i],atc->x[local_pos]);
569 atc->q[local_pos] = charge[i];
572 /* Copy to the send buffer */
574 copy_rvec(x[i],pme->bufv[buf_index[node]]);
576 pme->bufr[buf_index[node]] = charge[i];
582 for(i=0; i<nnodes_comm; i++) {
583 scount = atc->count[commnode[i]];
584 rcount = atc->rcount[i];
585 if (scount > 0 || rcount > 0) {
587 /* Communicate the coordinates */
588 pme_dd_sendrecv(atc,FALSE,i,
589 pme->bufv[buf_pos],scount*sizeof(rvec),
590 atc->x[local_pos],rcount*sizeof(rvec));
592 /* Communicate the charges */
593 pme_dd_sendrecv(atc,FALSE,i,
594 pme->bufr+buf_pos,scount*sizeof(real),
595 atc->q+local_pos,rcount*sizeof(real));
597 local_pos += atc->rcount[i];
602 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
606 int *commnode,*buf_index;
607 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
609 commnode = atc->node_dest;
610 buf_index = atc->buf_index;
612 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
614 local_pos = atc->count[atc->nodeid];
616 for(i=0; i<nnodes_comm; i++) {
617 scount = atc->rcount[i];
618 rcount = atc->count[commnode[i]];
619 if (scount > 0 || rcount > 0) {
620 /* Communicate the forces */
621 pme_dd_sendrecv(atc,TRUE,i,
622 atc->f[local_pos],scount*sizeof(rvec),
623 pme->bufv[buf_pos],rcount*sizeof(rvec));
626 buf_index[commnode[i]] = buf_pos;
636 if (node == atc->nodeid)
638 /* Add from the local force array */
639 rvec_inc(f[i],atc->f[local_pos]);
644 /* Add from the receive buffer */
645 rvec_inc(f[i],pme->bufv[buf_index[node]]);
655 if (node == atc->nodeid)
657 /* Copy from the local force array */
658 copy_rvec(atc->f[local_pos],f[i]);
663 /* Copy from the receive buffer */
664 copy_rvec(pme->bufv[buf_index[node]],f[i]);
673 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
675 pme_overlap_t *overlap;
676 int send_index0,send_nindex;
677 int recv_index0,recv_nindex;
679 int i,j,k,ix,iy,iz,icnt;
680 int ipulse,send_id,recv_id,datasize;
682 real *sendptr,*recvptr;
684 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
685 overlap = &pme->overlap[1];
687 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
689 /* Since we have already (un)wrapped the overlap in the z-dimension,
690 * we only have to communicate 0 to nkz (not pmegrid_nz).
692 if (direction==GMX_SUM_QGRID_FORWARD)
694 send_id = overlap->send_id[ipulse];
695 recv_id = overlap->recv_id[ipulse];
696 send_index0 = overlap->comm_data[ipulse].send_index0;
697 send_nindex = overlap->comm_data[ipulse].send_nindex;
698 recv_index0 = overlap->comm_data[ipulse].recv_index0;
699 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
703 send_id = overlap->recv_id[ipulse];
704 recv_id = overlap->send_id[ipulse];
705 send_index0 = overlap->comm_data[ipulse].recv_index0;
706 send_nindex = overlap->comm_data[ipulse].recv_nindex;
707 recv_index0 = overlap->comm_data[ipulse].send_index0;
708 recv_nindex = overlap->comm_data[ipulse].send_nindex;
711 /* Copy data to contiguous send buffer */
714 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
715 pme->nodeid,overlap->nodeid,send_id,
716 pme->pmegrid_start_iy,
717 send_index0-pme->pmegrid_start_iy,
718 send_index0-pme->pmegrid_start_iy+send_nindex);
721 for(i=0;i<pme->pmegrid_nx;i++)
724 for(j=0;j<send_nindex;j++)
726 iy = j + send_index0 - pme->pmegrid_start_iy;
727 for(k=0;k<pme->nkz;k++)
730 pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
735 datasize = pme->pmegrid_nx * pme->nkz;
737 MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
739 pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
741 overlap->mpi_comm,&stat);
743 /* Get data from contiguous recv buffer */
746 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
747 pme->nodeid,overlap->nodeid,recv_id,
748 pme->pmegrid_start_iy,
749 recv_index0-pme->pmegrid_start_iy,
750 recv_index0-pme->pmegrid_start_iy+recv_nindex);
753 for(i=0;i<pme->pmegrid_nx;i++)
756 for(j=0;j<recv_nindex;j++)
758 iy = j + recv_index0 - pme->pmegrid_start_iy;
759 for(k=0;k<pme->nkz;k++)
762 if(direction==GMX_SUM_QGRID_FORWARD)
764 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
768 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = pme->pmegrid_recvbuf[icnt++];
775 /* Major dimension is easier, no copying required,
776 * but we might have to sum to a separate array.
777 * Since we don't copy, we have to communicate up to pmegrid_nz,
778 * not nkz as for the minor direction.
780 overlap = &pme->overlap[0];
782 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
784 if(direction==GMX_SUM_QGRID_FORWARD)
786 send_id = overlap->send_id[ipulse];
787 recv_id = overlap->recv_id[ipulse];
788 send_index0 = overlap->comm_data[ipulse].send_index0;
789 send_nindex = overlap->comm_data[ipulse].send_nindex;
790 recv_index0 = overlap->comm_data[ipulse].recv_index0;
791 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
792 recvptr = pme->pmegrid_recvbuf;
796 send_id = overlap->recv_id[ipulse];
797 recv_id = overlap->send_id[ipulse];
798 send_index0 = overlap->comm_data[ipulse].recv_index0;
799 send_nindex = overlap->comm_data[ipulse].recv_nindex;
800 recv_index0 = overlap->comm_data[ipulse].send_index0;
801 recv_nindex = overlap->comm_data[ipulse].send_nindex;
802 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
805 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
806 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
810 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
811 pme->nodeid,overlap->nodeid,send_id,
812 pme->pmegrid_start_ix,
813 send_index0-pme->pmegrid_start_ix,
814 send_index0-pme->pmegrid_start_ix+send_nindex);
815 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
816 pme->nodeid,overlap->nodeid,recv_id,
817 pme->pmegrid_start_ix,
818 recv_index0-pme->pmegrid_start_ix,
819 recv_index0-pme->pmegrid_start_ix+recv_nindex);
822 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
824 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
826 overlap->mpi_comm,&stat);
828 /* ADD data from contiguous recv buffer */
829 if(direction==GMX_SUM_QGRID_FORWARD)
831 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
832 for(i=0;i<recv_nindex*datasize;i++)
834 p[i] += pme->pmegrid_recvbuf[i];
843 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
845 ivec local_fft_ndata,local_fft_offset,local_fft_size;
850 /* Dimensions should be identical for A/B grid, so we just use A here */
851 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
856 local_pme_size[0] = pme->pmegrid_nx;
857 local_pme_size[1] = pme->pmegrid_ny;
858 local_pme_size[2] = pme->pmegrid_nz;
860 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
861 the offset is identical, and the PME grid always has more data (due to overlap)
866 char fn[STRLEN],format[STRLEN];
868 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
870 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
871 fp2 = ffopen(fn,"w");
872 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
874 for(ix=0;ix<local_fft_ndata[XX];ix++)
876 for(iy=0;iy<local_fft_ndata[YY];iy++)
878 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
880 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
881 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
882 fftgrid[fftidx] = pmegrid[pmeidx];
884 val = 100*pmegrid[pmeidx];
885 if (pmegrid[pmeidx] != 0)
886 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
887 5.0*ix,5.0*iy,5.0*iz,1.0,val);
888 if (pmegrid[pmeidx] != 0)
889 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
891 pme->pmegrid_start_ix + ix,
892 pme->pmegrid_start_iy + iy,
893 pme->pmegrid_start_iz + iz,
909 copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
911 ivec local_fft_ndata,local_fft_offset,local_fft_size;
916 /* Dimensions should be identical for A/B grid, so we just use A here */
917 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
922 local_pme_size[0] = pme->pmegrid_nx;
923 local_pme_size[1] = pme->pmegrid_ny;
924 local_pme_size[2] = pme->pmegrid_nz;
926 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
927 the offset is identical, and the PME grid always has more data (due to overlap)
929 for(ix=0;ix<local_fft_ndata[XX];ix++)
931 for(iy=0;iy<local_fft_ndata[YY];iy++)
933 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
935 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
936 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
937 pmegrid[pmeidx] = fftgrid[fftidx];
946 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
948 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
954 pnx = pme->pmegrid_nx;
955 pny = pme->pmegrid_ny;
956 pnz = pme->pmegrid_nz;
958 overlap = pme->pme_order - 1;
960 /* Add periodic overlap in z */
961 for(ix=0; ix<pnx; ix++)
963 for(iy=0; iy<pny; iy++)
965 for(iz=0; iz<overlap; iz++)
967 pmegrid[(ix*pny+iy)*pnz+iz] +=
968 pmegrid[(ix*pny+iy)*pnz+nz+iz];
973 if (pme->nnodes_minor == 1)
975 for(ix=0; ix<pnx; ix++)
977 for(iy=0; iy<overlap; iy++)
979 for(iz=0; iz<nz; iz++)
981 pmegrid[(ix*pny+iy)*pnz+iz] +=
982 pmegrid[(ix*pny+ny+iy)*pnz+iz];
988 if (pme->nnodes_major == 1)
990 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
992 for(ix=0; ix<overlap; ix++)
994 for(iy=0; iy<ny_x; iy++)
996 for(iz=0; iz<nz; iz++)
998 pmegrid[(ix*pny+iy)*pnz+iz] +=
999 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
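/* Illustrative example (added for clarity; values are not from the original
 * text): with nz = 10 and pme_order = 4 the overlap is 3, so the charge that
 * was spread into the padding cells z = 10,11,12 is folded back onto
 * z = 0,1,2 by the z-loop above; the same folding is done in y and x when
 * that dimension is not distributed over several nodes (otherwise
 * gmx_sum_qgrid_dd communicates the overlap instead).
 */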
1008 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1010 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1016 pnx = pme->pmegrid_nx;
1017 pny = pme->pmegrid_ny;
1018 pnz = pme->pmegrid_nz;
1020 overlap = pme->pme_order - 1;
1022 if (pme->nnodes_major == 1)
1024 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
1026 for(ix=0; ix<overlap; ix++)
1028 for(iy=0; iy<ny_x; iy++)
1030 for(iz=0; iz<nz; iz++)
1032 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1033 pmegrid[(ix*pny+iy)*pnz+iz];
1039 if (pme->nnodes_minor == 1)
1041 for(ix=0; ix<pnx; ix++)
1043 for(iy=0; iy<overlap; iy++)
1045 for(iz=0; iz<nz; iz++)
1047 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1048 pmegrid[(ix*pny+iy)*pnz+iz];
1054 /* Copy periodic overlap in z */
1055 for(ix=0; ix<pnx; ix++)
1057 for(iy=0; iy<pny; iy++)
1059 for(iz=0; iz<overlap; iz++)
1061 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1062 pmegrid[(ix*pny+iy)*pnz+iz];
1069 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1070 #define DO_BSPLINE(order) \
1071 for(ithx=0; (ithx<order); ithx++) \
1073 index_x = (i0+ithx)*pny*pnz; \
1074 valx = qn*thx[ithx]; \
1076 for(ithy=0; (ithy<order); ithy++) \
1078 valxy = valx*thy[ithy]; \
1079 index_xy = index_x+(j0+ithy)*pnz; \
1081 for(ithz=0; (ithz<order); ithz++) \
1083 index_xyz = index_xy+(k0+ithz); \
1084 grid[index_xyz] += valxy*thz[ithz]; \
1090 static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
1094 /* spread charges from home atoms to local grid */
1096 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1098 int order,norder,index_x,index_xy,index_xyz;
1100 real *thx,*thy,*thz;
1101 int localsize, bndsize;
1103 int pnx,pny,pnz,ndatatot;
1105 pnx = pme->pmegrid_nx;
1106 pny = pme->pmegrid_ny;
1107 pnz = pme->pmegrid_nz;
1108 ndatatot = pnx*pny*pnz;
1110 for(i=0;i<ndatatot;i++)
1115 order = pme->pme_order;
1117 for(nn=0; (nn<atc->n);nn++)
1124 idxptr = atc->idx[n];
1130 thx = atc->theta[XX] + norder;
1131 thy = atc->theta[YY] + norder;
1132 thz = atc->theta[ZZ] + norder;
1135 case 4: DO_BSPLINE(4); break;
1136 case 5: DO_BSPLINE(5); break;
1137 default: DO_BSPLINE(order); break;
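    /* In formula form (comment added for clarity), DO_BSPLINE accumulates
     *   grid[(i0+ix)*pny*pnz + (j0+iy)*pnz + (k0+iz)] += q_n*thx[ix]*thy[iy]*thz[iz]
     * for ix,iy,iz in [0,order), i.e. each charge is smeared over order^3
     * grid points with separable B-spline weights.
     */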
1144 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
1145 /* Calculate exponentials through SSE in float precision */
1146 #define CALC_EXPONENTIALS(start,end,r_aligned) \
1149 for(kx=0; kx<end; kx+=4) \
1151 tmp_sse = _mm_load_ps(r_aligned+kx); \
1152 tmp_sse = gmx_mm_exp_ps(tmp_sse); \
1153 _mm_store_ps(r_aligned+kx,tmp_sse); \
1157 #define CALC_EXPONENTIALS(start,end,r) \
1158 for(kx=start; kx<end; kx++) \
1160 r[kx] = exp(r[kx]); \
1165 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1166 real ewaldcoeff,real vol,
1167 gmx_bool bEnerVir,real *mesh_energy,matrix vir)
1169 /* do recip sum over local cells in grid */
1170 /* y major, z middle, x minor or continuous */
1172 int kx,ky,kz,maxkx,maxky,maxkz;
1173 int nx,ny,nz,iy,iz,kxstart,kxend;
1175 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1176 real ets2,struct2,vfactor,ets2vf;
1177 real eterm,d1,d2,energy=0;
1179 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1180 real rxx,ryx,ryy,rzx,rzy,rzz;
1181 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
1182 real mhxk,mhyk,mhzk,m2k;
1185 ivec local_ndata,local_offset,local_size;
1191 /* Dimensions should be identical for A/B grid, so we just use A here */
1192 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1198 rxx = pme->recipbox[XX][XX];
1199 ryx = pme->recipbox[YY][XX];
1200 ryy = pme->recipbox[YY][YY];
1201 rzx = pme->recipbox[ZZ][XX];
1202 rzy = pme->recipbox[ZZ][YY];
1203 rzz = pme->recipbox[ZZ][ZZ];
1209 mhx = pme->work_mhx;
1210 mhy = pme->work_mhy;
1211 mhz = pme->work_mhz;
1213 denom = pme->work_denom;
1214 tmp1 = pme->work_tmp1;
1215 m2inv = pme->work_m2inv;
1217 for(iy=0;iy<local_ndata[YY];iy++)
1219 ky = iy + local_offset[YY];
1230 by = M_PI*vol*pme->bsp_mod[YY][ky];
1232 for(iz=0;iz<local_ndata[ZZ];iz++)
1234 kz = iz + local_offset[ZZ];
1238 bz = pme->bsp_mod[ZZ][kz];
1240 /* 0.5 correction for corner points */
1247 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1249 /* We should skip the k-space point (0,0,0) */
1250 if (local_offset[XX] > 0 ||
1251 local_offset[YY] > 0 || ky > 0 ||
1254 kxstart = local_offset[XX];
1258 kxstart = local_offset[XX] + 1;
1261 kxend = local_offset[XX] + local_ndata[XX];
1265 /* More expensive inner loop, especially because of the storage
1266 * of the mh elements in arrays.
1267 * Because x is the minor grid index, all mh elements
1268 * depend on kx for triclinic unit cells.
1271 /* Two explicit loops to avoid a conditional inside the loop */
1272 for(kx=kxstart; kx<maxkx; kx++)
1277 mhyk = mx * ryx + my * ryy;
1278 mhzk = mx * rzx + my * rzy + mz * rzz;
1279 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1284 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1285 tmp1[kx] = -factor*m2k;
1288 for(kx=maxkx; kx<kxend; kx++)
1293 mhyk = mx * ryx + my * ryy;
1294 mhzk = mx * rzx + my * rzy + mz * rzz;
1295 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1300 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1301 tmp1[kx] = -factor*m2k;
1304 for(kx=kxstart; kx<kxend; kx++)
1306 m2inv[kx] = 1.0/m2[kx];
1308 for(kx=kxstart; kx<kxend; kx++)
1310 denom[kx] = 1.0/denom[kx];
1313 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1315 for(kx=kxstart; kx<kxend; kx++,p0++)
1320 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1325 struct2 = 2.0*(d1*d1+d2*d2);
1327 tmp1[kx] = eterm*struct2;
1330 for(kx=kxstart; kx<kxend; kx++)
1332 ets2 = corner_fac*tmp1[kx];
1333 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
1336 ets2vf = ets2*vfactor;
1337 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
1338 virxy += ets2vf*mhx[kx]*mhy[kx];
1339 virxz += ets2vf*mhx[kx]*mhz[kx];
1340 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
1341 viryz += ets2vf*mhy[kx]*mhz[kx];
1342 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
1347 /* We don't need to calculate the energy and the virial.
1348 * In this case the triclinic overhead is small.
1351 /* Two explicit loops to avoid a conditional inside the loop */
1353 for(kx=kxstart; kx<maxkx; kx++)
1358 mhyk = mx * ryx + my * ryy;
1359 mhzk = mx * rzx + my * rzy + mz * rzz;
1360 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1361 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1362 tmp1[kx] = -factor*m2k;
1365 for(kx=maxkx; kx<kxend; kx++)
1370 mhyk = mx * ryx + my * ryy;
1371 mhzk = mx * rzx + my * rzy + mz * rzz;
1372 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1373 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1374 tmp1[kx] = -factor*m2k;
1377 for(kx=kxstart; kx<kxend; kx++)
1379 denom[kx] = 1.0/denom[kx];
1382 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1384 for(kx=kxstart; kx<kxend; kx++,p0++)
1389 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1400 /* Update virial with local values.
1401 * The virial is symmetric by definition.
1402 * This virial seems OK for isotropic scaling, but I'm
1403 * experiencing problems on semi-isotropic membranes.
1404 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
1406 vir[XX][XX] = 0.25*virxx;
1407 vir[YY][YY] = 0.25*viryy;
1408 vir[ZZ][ZZ] = 0.25*virzz;
1409 vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
1410 vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
1411 vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
1413 /* This energy should be corrected for a charged system */
1414 *mesh_energy = 0.5*energy;
1417 /* Return the loop count */
1418 return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
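/* Comment added for clarity (cf. Essmann et al., J. Chem. Phys. 103, 8577, 1995):
 * per k-vector m the loop above evaluates
 *   eterm   = (ONE_4PI_EPS0/epsilon_r) * exp(-pi^2 m^2/ewaldcoeff^2) /
 *             (pi * V * m^2 * bsp_mod_x(mx)*bsp_mod_y(my)*bsp_mod_z(mz))
 *   energy += corner_fac * eterm * 2*|S(m)|^2
 * where S(m) = d1 + i*d2 is the FFT of the spread charge grid; the factor 2
 * and corner_fac together account for the m/-m symmetry of the half-complex
 * real-to-complex FFT grid, and *mesh_energy is half the accumulated sum.
 */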
1422 #define DO_FSPLINE(order) \
1423 for(ithx=0; (ithx<order); ithx++) \
1425 index_x = (i0+ithx)*pny*pnz; \
1429 for(ithy=0; (ithy<order); ithy++) \
1431 index_xy = index_x+(j0+ithy)*pnz; \
1436 for(ithz=0; (ithz<order); ithz++) \
1438 gval = grid[index_xy+(k0+ithz)]; \
1439 fxy1 += thz[ithz]*gval; \
1440 fz1 += dthz[ithz]*gval; \
1449 void gather_f_bsplines(gmx_pme_t pme,real *grid,
1450 gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
1452 /* sum forces for local particles */
1453 int nn,n,ithx,ithy,ithz,i0,j0,k0;
1454 int index_x,index_xy;
1455 int nx,ny,nz,pnx,pny,pnz;
1457 real tx,ty,dx,dy,qn;
1460 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
1462 real rxx,ryx,ryy,rzx,rzy,rzz;
1465 order = pme->pme_order;
1466 thx = atc->theta[XX];
1467 thy = atc->theta[YY];
1468 thz = atc->theta[ZZ];
1469 dthx = atc->dtheta[XX];
1470 dthy = atc->dtheta[YY];
1471 dthz = atc->dtheta[ZZ];
1475 pnx = pme->pmegrid_nx;
1476 pny = pme->pmegrid_ny;
1477 pnz = pme->pmegrid_nz;
1479 rxx = pme->recipbox[XX][XX];
1480 ryx = pme->recipbox[YY][XX];
1481 ryy = pme->recipbox[YY][YY];
1482 rzx = pme->recipbox[ZZ][XX];
1483 rzy = pme->recipbox[ZZ][YY];
1484 rzz = pme->recipbox[ZZ][ZZ];
1486 for(nn=0; (nn<atc->n); nn++)
1489 qn = scale*atc->q[n];
1502 idxptr = atc->idx[n];
1509 /* Pointer arithmetic alert, next six statements */
1510 thx = atc->theta[XX] + norder;
1511 thy = atc->theta[YY] + norder;
1512 thz = atc->theta[ZZ] + norder;
1513 dthx = atc->dtheta[XX] + norder;
1514 dthy = atc->dtheta[YY] + norder;
1515 dthz = atc->dtheta[ZZ] + norder;
1518 case 4: DO_FSPLINE(4); break;
1519 case 5: DO_FSPLINE(5); break;
1520 default: DO_FSPLINE(order); break;
1523 atc->f[n][XX] += -qn*( fx*nx*rxx );
1524 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
1525 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
1528 /* Since the energy, and not the forces, is interpolated
1529 * the net force might not be exactly zero.
1530 * This can be solved by also interpolating F, but
1531 * that comes at a cost.
1532 * A better hack is to remove the net force every
1533 * step, but that must be done at a higher level
1534 * since this routine doesn't see all atoms if running
1535 * in parallel. We don't know how important it is. EL 990726
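/* Comment added for clarity: the loops in DO_FSPLINE accumulate the
 * derivatives of the interpolated potential with respect to the fractional
 * coordinates,
 *   fx = sum dthx*thy*thz*grid,  fy = sum thx*dthy*thz*grid,  fz = sum thx*thy*dthz*grid,
 * and the three force statements above apply the chain rule from fractional
 * to Cartesian coordinates, i.e. F = -q*(nx*fx, ny*fy, nz*fz) transformed
 * through the reciprocal box; only six recipbox components appear because
 * GROMACS box matrices have zeros above the diagonal.
 */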
1539 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
1540 pme_atomcomm_t *atc)
1542 int n,ithx,ithy,ithz,i0,j0,k0;
1543 int index_x,index_xy;
1545 real energy,pot,tx,ty,qn,gval;
1546 real *thx,*thy,*thz;
1551 order = pme->pme_order;
1552 thx = atc->theta[XX];
1553 thy = atc->theta[YY];
1554 thz = atc->theta[ZZ];
1557 for(n=0; (n<atc->n); n++) {
1561 idxptr = atc->idx[n];
1568 /* Pointer arithmetic alert, next three statements */
1569 thx = atc->theta[XX] + norder;
1570 thy = atc->theta[YY] + norder;
1571 thz = atc->theta[ZZ] + norder;
1574 for(ithx=0; (ithx<order); ithx++)
1576 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
1579 for(ithy=0; (ithy<order); ithy++)
1581 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
1584 for(ithz=0; (ithz<order); ithz++)
1586 gval = grid[index_xy+(k0+ithz)];
1587 pot += tx*ty*thz[ithz]*gval;
1600 void make_bsplines(splinevec theta,splinevec dtheta,int order,
1601 rvec fractx[],int nr,real charge[],
1602 gmx_bool bFreeEnergy)
1604 /* construct splines for local atoms */
1607 real *data,*ddata,*xptr;
1609 for(i=0; (i<nr); i++) {
1610 /* With free energy we do not use the charge check.
1611 * In most cases this will be more efficient than calling make_bsplines
1612 * twice, since usually more than half the particles have charges.
1614 if (bFreeEnergy || charge[i] != 0.0) {
1616 for(j=0; (j<DIM); j++) {
1619 /* dr is relative offset from lower cell limit */
1620 data=&(theta[j][i*order]);
1625 for(k=3; (k<order); k++) {
1627 data[k-1]=div*dr*data[k-2];
1628 for(l=1; (l<(k-1)); l++) {
1629 data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
1632 data[0]=div*(1-dr)*data[0];
1635 ddata = &(dtheta[j][i*order]);
1636 ddata[0] = -data[0];
1637 for(k=1; (k<order); k++) {
1638 ddata[k]=data[k-1]-data[k];
1642 data[order-1]=div*dr*data[order-2];
1643 for(l=1; (l<(order-1)); l++) {
1644 data[order-l-1]=div*((dr+l)*data[order-l-2]+
1645 (order-l-dr)*data[order-l-1]);
1647 data[0]=div*(1-dr)*data[0];
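/* Illustrative sketch (not from the original sources; the function name is
 * hypothetical).  The recursion above is the standard cardinal B-spline
 * relation
 *   M_k(u)  = ( u*M_{k-1}(u) + (k - u)*M_{k-1}(u-1) ) / (k-1)
 *   M_k'(u) =   M_{k-1}(u) - M_{k-1}(u-1)
 * evaluated in place at the 'order' grid points covered by one charge.  The
 * stand-alone helper below produces the same final weights for a single
 * fractional offset dr in [0,1).
 */
static void pme_bspline_weights_sketch(real dr, int order, real data[])
{
    int  k, l;
    real div;

    /* order-2 start: the linear hat function */
    data[1] = dr;
    data[0] = 1 - dr;
    /* raise the order one step at a time */
    for (k = 3; k <= order; k++)
    {
        div       = 1.0/(k - 1);
        data[k-1] = div*dr*data[k-2];
        for (l = 1; l < (k-1); l++)
        {
            data[k-l-1] = div*((dr + l)*data[k-l-2] + (k - l - dr)*data[k-l-1]);
        }
        data[0] = div*(1 - dr)*data[0];
    }
}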
1654 void make_dft_mod(real *mod,real *data,int ndata)
1659 for(i=0;i<ndata;i++) {
1661 for(j=0;j<ndata;j++) {
1662 arg=(2.0*M_PI*i*j)/ndata;
1663 sc+=data[j]*cos(arg);
1664 ss+=data[j]*sin(arg);
1668 for(i=0;i<ndata;i++)
1670 mod[i]=(mod[i-1]+mod[i+1])*0.5;
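/* Comment added for clarity: the inner loop accumulates sc and ss, and mod[i]
 * is their squared sum, i.e.
 *   mod[i] = | sum_j data[j] * exp(2*pi*I*i*j/ndata) |^2,
 * the squared modulus of the DFT of the B-spline values.  These moduli end up
 * in bsp_mod and enter the denominator in solve_pme_yzx; the final loop
 * replaces problematic (near-zero) entries by the average of their
 * neighbours so that the division there stays finite.
 */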
1675 void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
1677 int nmax=max(nx,max(ny,nz));
1678 real *data,*ddata,*bsp_data;
1684 snew(bsp_data,nmax);
1690 for(k=3;k<order;k++) {
1693 for(l=1;l<(k-1);l++)
1694 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
1695 data[0]=div*data[0];
1699 for(k=1;k<order;k++)
1700 ddata[k]=data[k-1]-data[k];
1703 for(l=1;l<(order-1);l++)
1704 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
1705 data[0]=div*data[0];
1709 for(i=1;i<=order;i++)
1710 bsp_data[i]=data[i-1];
1712 make_dft_mod(bsp_mod[XX],bsp_data,nx);
1713 make_dft_mod(bsp_mod[YY],bsp_data,ny);
1714 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
1721 static void setup_coordinate_communication(pme_atomcomm_t *atc)
1729 for(i=1; i<=nslab/2; i++) {
1730 fw = (atc->nodeid + i) % nslab;
1731 bw = (atc->nodeid - i + nslab) % nslab;
1732 if (n < nslab - 1) {
1733 atc->node_dest[n] = fw;
1734 atc->node_src[n] = bw;
1737 if (n < nslab - 1) {
1738 atc->node_dest[n] = bw;
1739 atc->node_src[n] = fw;
1745 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
1749 fprintf(log,"Destroying PME data structures.\n");
1752 sfree((*pmedata)->nnx);
1753 sfree((*pmedata)->nny);
1754 sfree((*pmedata)->nnz);
1756 sfree((*pmedata)->pmegridA);
1757 sfree((*pmedata)->fftgridA);
1758 sfree((*pmedata)->cfftgridA);
1759 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
1761 if((*pmedata)->pmegridB)
1763 sfree((*pmedata)->pmegridB);
1764 sfree((*pmedata)->fftgridB);
1765 sfree((*pmedata)->cfftgridB);
1766 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
1768 sfree((*pmedata)->work_mhz);
1769 sfree((*pmedata)->work_m2);
1770 sfree((*pmedata)->work_denom);
1771 sfree((*pmedata)->work_tmp1_alloc);
1772 sfree((*pmedata)->work_m2inv);
1780 static int mult_up(int n,int f)
1782 return ((n + f - 1)/f)*f;
1786 static double pme_load_imbalance(gmx_pme_t pme)
1791 nma = pme->nnodes_major;
1792 nmi = pme->nnodes_minor;
1794 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
1795 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
1796 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
1798 /* pme_solve is roughly double the cost of an fft */
1800 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
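/* Worked example (comment added; numbers are illustrative): mult_up(n,f)
 * rounds n up to a multiple of f, e.g. mult_up(50,4) = 52 and
 * mult_up(50,3) = 51.  With nkx = nky = nkz = 50, nnodes_major = 4 and
 * nnodes_minor = 3, each padded volume n1, n2, n3 becomes 52*51*50 instead
 * of 50*50*50, and the returned ratio measures how much extra FFT/solve work
 * the padding adds relative to a perfectly divisible grid.
 */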
1803 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
1804 int dimind,gmx_bool bSpread)
1808 atc->dimind = dimind;
1813 if (pme->nnodes > 1)
1815 atc->mpi_comm = pme->mpi_comm_d[dimind];
1816 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
1817 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
1821 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
1825 atc->bSpread = bSpread;
1826 atc->pme_order = pme->pme_order;
1830 /* These three allocations are not required for particle decomp. */
1831 snew(atc->node_dest,atc->nslab);
1832 snew(atc->node_src,atc->nslab);
1833 setup_coordinate_communication(atc);
1835 snew(atc->count,atc->nslab);
1836 snew(atc->rcount,atc->nslab);
1837 snew(atc->buf_index,atc->nslab);
1842 init_overlap_comm(pme_overlap_t * ol,
1851 int lbnd,rbnd,maxlr,b,i;
1854 pme_grid_comm_t *pgc;
1856 int fft_start,fft_end,send_index1,recv_index1;
1859 ol->mpi_comm = comm;
1862 ol->nnodes = nnodes;
1863 ol->nodeid = nodeid;
1865 /* Linear translation of the PME grid won't affect reciprocal space
1866 * calculations, so to optimize we only interpolate "upwards",
1867 * which also means we only have to consider overlap in one direction.
1868 * I.e., particles on this node might also be spread to grid indices
1869 * that belong to higher nodes (modulo nnodes)
1872 snew(ol->s2g0,ol->nnodes+1);
1873 snew(ol->s2g1,ol->nnodes);
1874 if (debug) { fprintf(debug,"PME slab boundaries:"); }
1875 for(i=0; i<nnodes; i++)
1877 * s2g0 is the local interpolation grid start.
1878 * s2g1 is the local interpolation grid end.
1879 * Because grid overlap communication only goes forward,
1880 * the grid slabs for the FFTs should be rounded down.
1882 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
1883 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
1887 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
1890 ol->s2g0[nnodes] = ndata;
1891 if (debug) { fprintf(debug,"\n"); }
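    /* Worked example (comment added; values are illustrative): with
     * ndata = 30 grid lines, nnodes = 4 and norder = 4 the formulas above give
     *   s2g0 = { 0, 7, 15, 22, 30 }   (FFT slab starts, rounded down)
     *   s2g1 = { 11, 18, 26, 33 }     (interpolation slab ends, including the
     *                                  order-1 overlap, possibly wrapping)
     * so e.g. node 0 owns FFT grid lines 0..6 but interpolates onto lines 0..10.
     */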
1893 /* Determine with how many nodes we need to communicate the grid overlap */
1899 for(i=0; i<nnodes; i++)
1901 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
1902 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
1908 while (bCont && b < nnodes);
1909 ol->noverlap_nodes = b - 1;
1911 snew(ol->send_id,ol->noverlap_nodes);
1912 snew(ol->recv_id,ol->noverlap_nodes);
1913 for(b=0; b<ol->noverlap_nodes; b++)
1915 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
1916 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
1918 snew(ol->comm_data, ol->noverlap_nodes);
1920 for(b=0; b<ol->noverlap_nodes; b++)
1922 pgc = &ol->comm_data[b];
1924 fft_start = ol->s2g0[ol->send_id[b]];
1925 fft_end = ol->s2g0[ol->send_id[b]+1];
1926 if (ol->send_id[b] < nodeid)
1931 send_index1 = ol->s2g1[nodeid];
1932 send_index1 = min(send_index1,fft_end);
1933 pgc->send_index0 = fft_start;
1934 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
1936 /* We always start receiving at the first index of our slab */
1937 fft_start = ol->s2g0[ol->nodeid];
1938 fft_end = ol->s2g0[ol->nodeid+1];
1939 recv_index1 = ol->s2g1[ol->recv_id[b]];
1940 if (ol->recv_id[b] > nodeid)
1942 recv_index1 -= ndata;
1944 recv_index1 = min(recv_index1,fft_end);
1945 pgc->recv_index0 = fft_start;
1946 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
1951 make_gridindex5_to_localindex(int n,int local_start,int local_range,
1952 int **global_to_local,
1953 real **fraction_shift)
1961 for(i=0; (i<5*n); i++)
1963 /* Determine the global to local grid index */
1964 gtl[i] = (i - local_start + n) % n;
1965 /* For coordinates that fall within the local grid the fraction
1966 * is correct, we don't need to shift it.
1969 if (local_range < n)
1971 /* Due to rounding issues i could be 1 beyond the lower or
1972 * upper boundary of the local grid. Correct the index for this.
1973 * If we shift the index, we need to shift the fraction by
1974 * the same amount in the other direction to not affect the weights.
1976 * Note that due to this shifting the weights at the end of
1977 * the spline might change, but that will only involve values
1978 * between zero and values close to the precision of a real,
1979 * which is anyhow the accuracy of the whole mesh calculation.
1981 /* With local_range=0 we should not change i=local_start */
1982 if (i % n != local_start)
1989 else if (gtl[i] == local_range)
1991 gtl[i] = local_range - 1;
1998 *global_to_local = gtl;
1999 *fraction_shift = fsh;
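/* Worked example (comment added; values are illustrative): for n = 10 and a
 * single node (local_start = 0, local_range = 10) the table covers indices
 * 0..49 and gtl[i] = i % 10, with all fraction shifts zero.  With
 * local_start = 3 the mapping becomes gtl[i] = (i - 3 + 10) % 10, so global
 * grid line 3 maps to local line 0 and line 12 maps to local line 9; only
 * when local_range < n are indices just outside the local grid pulled back
 * in and compensated through fraction_shift.
 */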
2003 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2007 if (*nk % nnodes != 0)
2009 nk_new = nnodes*(*nk/nnodes + 1);
2011 if (2*nk_new >= 3*(*nk))
2013 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2014 dim,*nk,dim,nnodes,dim);
2019 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2020 dim,*nk,dim,nnodes,dim,nk_new,dim);
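/* Worked example (comment added; values are illustrative): for *nk = 125 and
 * nnodes = 4 the grid is not divisible, so nk_new = 4*(125/4 + 1) = 128;
 * since 2*128 < 3*125 the increase is accepted and the grid is enlarged to
 * 128.  By contrast, *nk = 5 with nnodes = 4 would need 8 points (a 60%
 * increase) and trigger the fatal error instead.
 */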
2027 int gmx_pme_init(gmx_pme_t * pmedata,
2033 gmx_bool bFreeEnergy,
2034 gmx_bool bReproducible)
2038 pme_atomcomm_t *atc;
2039 int bufsizex,bufsizey,bufsize;
2043 fprintf(debug,"Creating PME data structures.\n");
2046 pme->redist_init = FALSE;
2047 pme->sum_qgrid_tmp = NULL;
2048 pme->sum_qgrid_dd_tmp = NULL;
2049 pme->buf_nalloc = 0;
2050 pme->redist_buf_nalloc = 0;
2053 pme->bPPnode = TRUE;
2055 pme->nnodes_major = nnodes_major;
2056 pme->nnodes_minor = nnodes_minor;
2061 pme->mpi_comm = cr->mpi_comm_mygroup;
2063 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2064 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2068 if (pme->nnodes == 1)
2070 pme->ndecompdim = 0;
2071 pme->nodeid_major = 0;
2072 pme->nodeid_minor = 0;
2076 if (nnodes_minor == 1)
2079 pme->mpi_comm_d[0] = pme->mpi_comm;
2080 pme->mpi_comm_d[1] = NULL;
2082 pme->ndecompdim = 1;
2083 pme->nodeid_major = pme->nodeid;
2084 pme->nodeid_minor = 0;
2087 else if (nnodes_major == 1)
2090 pme->mpi_comm_d[0] = NULL;
2091 pme->mpi_comm_d[1] = pme->mpi_comm;
2093 pme->ndecompdim = 1;
2094 pme->nodeid_major = 0;
2095 pme->nodeid_minor = pme->nodeid;
2099 if (pme->nnodes % nnodes_major != 0)
2101 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2103 pme->ndecompdim = 2;
2106 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
2107 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
2108 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
2109 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
2111 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
2112 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
2113 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
2114 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
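        /* Worked example (comment added; values are illustrative): with 6 PME
         * nodes, nnodes_major = 3 and nnodes_minor = 2, the first split
         * (colour nodeid % 2) builds the major-dimension communicators
         * {0,2,4} and {1,3,5}, and the second split (colour nodeid/2) builds
         * the minor-dimension communicators {0,1}, {2,3} and {4,5}; every
         * node thus belongs to one communicator of size 3 along x and one of
         * size 2 along y.
         */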
2117 pme->bPPnode = (cr->duty & DUTY_PP);
2120 if (ir->ePBC == epbcSCREW)
2122 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
2125 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
2129 pme->pme_order = ir->pme_order;
2130 pme->epsilon_r = ir->epsilon_r;
2132 /* Currently pme.c supports only the fft5d FFT code.
2133 * Therefore the grid always needs to be divisible by nnodes.
2134 * When the old 1D code is also supported again, change this check.
2136 * This check should be done before calling gmx_pme_init
2137 * and fplog should be passed instead of stderr.
2139 if (pme->ndecompdim >= 2)
2141 if (pme->ndecompdim >= 1)
2144 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2145 'x',nnodes_major,&pme->nkx);
2146 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2147 'y',nnodes_minor,&pme->nky);
2151 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
2152 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
2153 pme->nkz <= pme->pme_order)
2155 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
2158 if (pme->nnodes > 1) {
2162 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
2163 MPI_Type_commit(&(pme->rvec_mpi));
2166 /* Note that the charge spreading and force gathering, which usually
2167 * takes about the same amount of time as FFT+solve_pme,
2168 * is always fully load balanced
2169 * (unless the charge distribution is inhomogeneous).
2172 imbal = pme_load_imbalance(pme);
2173 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
2177 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
2178 " For optimal PME load balancing\n"
2179 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
2180 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
2182 (int)((imbal-1)*100 + 0.5),
2183 pme->nkx,pme->nky,pme->nnodes_major,
2184 pme->nky,pme->nkz,pme->nnodes_minor);
2188 init_overlap_comm(&pme->overlap[0],pme->pme_order,
2192 pme->nnodes_major,pme->nodeid_major,pme->nkx);
2194 init_overlap_comm(&pme->overlap[1],pme->pme_order,
2198 pme->nnodes_minor,pme->nodeid_minor,pme->nky);
2200 snew(pme->bsp_mod[XX],pme->nkx);
2201 snew(pme->bsp_mod[YY],pme->nky);
2202 snew(pme->bsp_mod[ZZ],pme->nkz);
2204 /* Allocate data for the interpolation grid, including overlap */
2205 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
2206 pme->overlap[0].s2g0[pme->nodeid_major];
2207 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
2208 pme->overlap[1].s2g0[pme->nodeid_minor];
2209 pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
2211 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
2212 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
2213 pme->pmegrid_start_iz = 0;
2215 make_gridindex5_to_localindex(pme->nkx,
2216 pme->pmegrid_start_ix,
2217 pme->pmegrid_nx - (pme->pme_order-1),
2218 &pme->nnx,&pme->fshx);
2219 make_gridindex5_to_localindex(pme->nky,
2220 pme->pmegrid_start_iy,
2221 pme->pmegrid_ny - (pme->pme_order-1),
2222 &pme->nny,&pme->fshy);
2223 make_gridindex5_to_localindex(pme->nkz,
2224 pme->pmegrid_start_iz,
2225 pme->pmegrid_nz - (pme->pme_order-1),
2226 &pme->nnz,&pme->fshz);
2228 snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2230 /* For non-divisible grid we need pme_order instead of pme_order-1 */
2231 /* x overlap is copied in place: take padding into account.
2232 * y is always copied through a buffer: we don't need padding in z,
2233 * but we do need the overlap in x because of the communication order.
2235 bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
2236 bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
2237 bufsize = (bufsizex>bufsizey) ? bufsizex : bufsizey;
2239 snew(pme->pmegrid_sendbuf,bufsize);
2240 snew(pme->pmegrid_recvbuf,bufsize);
2242 ndata[0] = pme->nkx;
2243 ndata[1] = pme->nky;
2244 ndata[2] = pme->nkz;
2246 /* This routine will allocate the grid data to fit the FFTs */
2247 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
2248 &pme->fftgridA,&pme->cfftgridA,
2250 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2255 snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2256 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
2257 &pme->fftgridB,&pme->cfftgridB,
2259 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2263 pme->pmegridB = NULL;
2264 pme->fftgridB = NULL;
2265 pme->cfftgridB = NULL;
2268 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
2270 /* Use atc[0] for spreading */
2271 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
2272 if (pme->ndecompdim >= 2)
2274 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
2277 if (pme->nnodes == 1) {
2278 pme->atc[0].n = homenr;
2279 pme_realloc_atomcomm_things(&pme->atc[0]);
2282 /* Use fft5d, order after FFT is y major, z, x minor */
2283 pme->work_nalloc = pme->nkx;
2284 snew(pme->work_mhx,pme->work_nalloc);
2285 snew(pme->work_mhy,pme->work_nalloc);
2286 snew(pme->work_mhz,pme->work_nalloc);
2287 snew(pme->work_m2,pme->work_nalloc);
2288 snew(pme->work_denom,pme->work_nalloc);
2289 /* Allocate an aligned pointer for SSE operations, including 3 extra
2290 * elements at the end since SSE operates on 4 elements at a time.
2292 snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
2293 pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
2294 snew(pme->work_m2inv,pme->work_nalloc);
2301 static void spread_on_grid(gmx_pme_t pme,
2302 pme_atomcomm_t *atc,real *grid,
2303 gmx_bool bCalcSplines,gmx_bool bSpread)
2308 /* Compute fftgrid index for all atoms,
2309 * with help of some extra variables.
2311 calc_interpolation_idx(pme,atc);
2313 /* make local bsplines */
2314 make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
2315 atc->fractx,atc->n,atc->q,pme->bFEP);
2320 /* put local atoms on grid. */
2321 spread_q_bsplines(pme,atc,grid);
2325 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
2327 pme_atomcomm_t *atc;
2330 if (pme->nnodes > 1)
2332 gmx_incons("gmx_pme_calc_energy called in parallel");
2336 gmx_incons("gmx_pme_calc_energy with free energy");
2339 atc = &pme->atc_energy;
2341 atc->bSpread = TRUE;
2342 atc->pme_order = pme->pme_order;
2344 pme_realloc_atomcomm_things(atc);
2348 /* We only use the A-charges grid */
2349 grid = pme->pmegridA;
2351 spread_on_grid(pme,atc,grid,TRUE,FALSE);
2353 *V = gather_energy_bsplines(pme,grid,atc);
2357 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
2358 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
2360 /* Reset all the counters related to performance over the run */
2361 wallcycle_stop(wcycle,ewcRUN);
2362 wallcycle_reset_all(wcycle);
2364 ir->init_step += step_rel;
2365 ir->nsteps -= step_rel;
2366 wallcycle_start(wcycle,ewcRUN);
2370 int gmx_pmeonly(gmx_pme_t pme,
2371 t_commrec *cr, t_nrnb *nrnb,
2372 gmx_wallcycle_t wcycle,
2373 real ewaldcoeff, gmx_bool bGatherOnly,
2376 gmx_pme_pp_t pme_pp;
2379 rvec *x_pp=NULL,*f_pp=NULL;
2380 real *chargeA=NULL,*chargeB=NULL;
2382 int maxshift_x=0,maxshift_y=0;
2383 real energy,dvdlambda;
2388 gmx_large_int_t step,step_rel;
2391 pme_pp = gmx_pme_pp_init(cr);
2396 do /****** this is a quasi-loop over time steps! */
2398 /* Domain decomposition */
2399 natoms = gmx_pme_recv_q_x(pme_pp,
2400 &chargeA,&chargeB,box,&x_pp,&f_pp,
2401 &maxshift_x,&maxshift_y,
2407 /* We should stop: break out of the loop */
2411 step_rel = step - ir->init_step;
2414 wallcycle_start(wcycle,ewcRUN);
2416 wallcycle_start(wcycle,ewcPMEMESH);
2420 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
2421 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
2422 &energy,lambda,&dvdlambda,
2423 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
2425 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
2427 gmx_pme_send_force_vir_ener(pme_pp,
2428 f_pp,vir,energy,dvdlambda,
2433 if (step_rel == wcycle_get_reset_counters(wcycle))
2435 /* Reset all the counters related to performance over the run */
2436 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
2437 wcycle_set_reset_counters(wcycle, 0);
2440 } /***** end of quasi-loop, we stop with the break above */
2446 int gmx_pme_do(gmx_pme_t pme,
2447 int start, int homenr,
2449 real *chargeA, real *chargeB,
2450 matrix box, t_commrec *cr,
2451 int maxshift_x, int maxshift_y,
2452 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2453 matrix vir, real ewaldcoeff,
2454 real *energy, real lambda,
2455 real *dvdlambda, int flags)
2457 int q,d,i,j,ntot,npme;
2461 pme_atomcomm_t *atc=NULL;
2465 real *charge=NULL,*q_d,vol;
2469 gmx_parallel_3dfft_t pfft_setup;
2471 t_complex * cfftgrid;
2473 if (pme->nnodes > 1) {
2476 if (atc->npd > atc->pd_nalloc) {
2477 atc->pd_nalloc = over_alloc_dd(atc->npd);
2478 srenew(atc->pd,atc->pd_nalloc);
2480 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2483 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
2485 grid = pme->pmegridA;
2486 fftgrid = pme->fftgridA;
2487 cfftgrid = pme->cfftgridA;
2488 pfft_setup = pme->pfft_setupA;
2489 charge = chargeA+start;
2491 grid = pme->pmegridB;
2492 fftgrid = pme->fftgridB;
2493 cfftgrid = pme->cfftgridB;
2494 pfft_setup = pme->pfft_setupB;
2495 charge = chargeB+start;
2497 /* Unpack structure */
2499 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
2500 cr->nnodes,cr->nodeid);
2501 fprintf(debug,"Grid = %p\n",(void*)grid);
2503 gmx_fatal(FARGS,"No grid!");
2507 m_inv_ur0(box,pme->recipbox);
2509 if (pme->nnodes == 1) {
2511 if (DOMAINDECOMP(cr)) {
2513 pme_realloc_atomcomm_things(atc);
2519 wallcycle_start(wcycle,ewcPME_REDISTXF);
2520 for(d=pme->ndecompdim-1; d>=0; d--)
2522 if (d == pme->ndecompdim-1)
2530 n_d = pme->atc[d+1].n;
2536 if (atc->npd > atc->pd_nalloc) {
2537 atc->pd_nalloc = over_alloc_dd(atc->npd);
2538 srenew(atc->pd,atc->pd_nalloc);
2540 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2541 pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
2544 GMX_BARRIER(cr->mpi_comm_mygroup);
2545 /* Redistribute x (only once) and qA or qB */
2546 if (DOMAINDECOMP(cr)) {
2547 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
2549 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
2554 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2558 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
2561 if (flags & GMX_PME_SPREAD_Q)
2563 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2565 /* Spread the charges on a grid */
2566 GMX_MPE_LOG(ev_spread_on_grid_start);
2569 spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
2570 GMX_MPE_LOG(ev_spread_on_grid_finish);
2574 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
2576 inc_nrnb(nrnb,eNR_SPREADQBSP,
2577 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
2579 wrap_periodic_pmegrid(pme,grid);
2581 /* sum contributions to local grid from other nodes */
2583 if (pme->nnodes > 1) {
2584 GMX_BARRIER(cr->mpi_comm_mygroup);
2585 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
2591 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
2593 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2596 if (flags & GMX_PME_SOLVE)
2599 GMX_BARRIER(cr->mpi_comm_mygroup);
2600 GMX_MPE_LOG(ev_gmxfft3d_start);
2601 wallcycle_start(wcycle,ewcPME_FFT);
2602 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
2603 wallcycle_stop(wcycle,ewcPME_FFT);
2604 GMX_MPE_LOG(ev_gmxfft3d_finish);
2607 /* solve in k-space for our local cells */
2609 GMX_BARRIER(cr->mpi_comm_mygroup);
2610 GMX_MPE_LOG(ev_solve_pme_start);
2611 wallcycle_start(wcycle,ewcPME_SOLVE);
2613 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
2614 flags & GMX_PME_CALC_ENER_VIR,
2615 &energy_AB[q],vir_AB[q]);
2616 wallcycle_stop(wcycle,ewcPME_SOLVE);
2618 GMX_MPE_LOG(ev_solve_pme_finish);
2619 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
2622 if (flags & GMX_PME_CALC_F)
2626 GMX_BARRIER(cr->mpi_comm_mygroup);
2627 GMX_MPE_LOG(ev_gmxfft3d_start);
2629 wallcycle_start(wcycle,ewcPME_FFT);
2630 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
2631 wallcycle_stop(wcycle,ewcPME_FFT);
2634 GMX_MPE_LOG(ev_gmxfft3d_finish);
2636 if (pme->nodeid == 0)
2638 ntot = pme->nkx*pme->nky*pme->nkz;
2639 npme = ntot*log((real)ntot)/log(2.0);
2640 inc_nrnb(nrnb,eNR_FFT,2*npme);
2643 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2645 copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
2647 /* distribute local grid to all nodes */
2649 if (pme->nnodes > 1) {
2650 GMX_BARRIER(cr->mpi_comm_mygroup);
2651 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
2656 unwrap_periodic_pmegrid(pme,grid);
2658 /* interpolate forces for our local atoms */
2659 GMX_BARRIER(cr->mpi_comm_mygroup);
2660 GMX_MPE_LOG(ev_gather_f_bsplines_start);
2664 /* If we are running without parallelization,
2665 * atc->f is the actual force array, not a buffer,
2666 * therefore we should not clear it.
2668 bClearF = (q == 0 && PAR(cr));
2669 gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
2670 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
2673 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
2675 inc_nrnb(nrnb,eNR_GATHERFBSP,
2676 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
2677 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2681 if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
2682 wallcycle_start(wcycle,ewcPME_REDISTXF);
2683 for(d=0; d<pme->ndecompdim; d++)
2686 if (d == pme->ndecompdim - 1)
2693 n_d = pme->atc[d+1].n;
2694 f_d = pme->atc[d+1].f;
2696 GMX_BARRIER(cr->mpi_comm_mygroup);
2697 if (DOMAINDECOMP(cr)) {
2698 dd_pmeredist_f(pme,atc,n_d,f_d,
2699 d==pme->ndecompdim-1 && pme->bPPnode);
2701 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
2705 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2710 *energy = energy_AB[0];
2711 m_add(vir,vir_AB[0],vir);
2713 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
2714 *dvdlambda += energy_AB[1] - energy_AB[0];
2715 for(i=0; i<DIM; i++)
2716 for(j=0; j<DIM; j++)
2717 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
2721 fprintf(debug,"PME mesh energy: %g\n",*energy);